diff --git a/gfx/thebes/public/gfxPlatform.h b/gfx/thebes/public/gfxPlatform.h index 3ec6117289c..5f691a4289f 100644 --- a/gfx/thebes/public/gfxPlatform.h +++ b/gfx/thebes/public/gfxPlatform.h @@ -49,32 +49,6 @@ #undef OS2EMX_PLAIN_CHAR #endif -/* - * SSE2 Support Defines - * - * GFX_SSE2_POSSIBLE is defined if SSE2 support may be available. - * On Mac/OSX Intel, SSE2 is always available. On Windows, SSE2 - * support is possible if the chip supports it so GFX_SSE2_POSSIBLE - * is defined for Windows. - * - * GFX_SSE2_AVAILABLE indicates whether runtime SSE2 support is - * available. On Mac/OSX Intel, it is defined to 1 (TRUE). On Windows - * __sse2_available indicates whether support is available or not. -*/ - -#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__) -#define GFX_SSE2_POSSIBLE -#define GFX_SSE2_AVAILABLE __sse2_available -#define GFX_SSE2_ALIGN __declspec(align(16)) -extern "C" int __sse2_available; -#endif - -#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX) -#define GFX_SSE2_POSSIBLE -#define GFX_SSE2_AVAILABLE 1 -#define GFX_SSE2_ALIGN __attribute__ ((aligned (16))) -#endif - typedef void* cmsHPROFILE; typedef void* cmsHTRANSFORM; diff --git a/jpeg/jconfig.h b/jpeg/jconfig.h index 7bb42028398..d181a5ec0cb 100644 --- a/jpeg/jconfig.h +++ b/jpeg/jconfig.h @@ -95,3 +95,13 @@ #endif /* JPEG_CJPEG_DJPEG */ +/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */ + +#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__) +#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8] +#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16] +#else +#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16))) +#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16))) +#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */ + diff --git a/jpeg/jdapimin.c b/jpeg/jdapimin.c index dc2358ed91c..5b85f799d76 100644 --- a/jpeg/jdapimin.c +++ b/jpeg/jdapimin.c @@ -67,10 +67,14 @@ int MMXAvailable; static int mmxsupport(); #endif -#ifdef HAVE_SSE2_INTEL_MNEMONICS +#ifdef HAVE_SSE2_INTRINSICS int SSE2Available = 0; +#ifdef HAVE_SSE2_INTEL_MNEMONICS static int sse2support(); +#else +static int sse2supportGCC(); #endif /* HAVE_SSE2_INTEL_MNEMONICS */ +#endif /* HAVE_SSE2_INTRINSICS */ /* @@ -88,17 +92,27 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize) if(!cpuidDetected) { - MMXAvailable = mmxsupport(); + MMXAvailable = mmxsupport(); #ifdef HAVE_SSE2_INTEL_MNEMONICS /* only do the sse2 support check if mmx is supported (so we know the processor supports cpuid) */ if (MMXAvailable) SSE2Available = sse2support(); -#endif /* HAVE_SSE2_INTEL_MNEMONICS */ +#endif cpuidDetected = 1; } +#else +#ifdef HAVE_SSE2_INTRINSICS + static int cpuidDetected = 0; + + if(!cpuidDetected) { + SSE2Available = sse2supportGCC(); + cpuidDetected = 1; + } + +#endif /* HAVE_SSE2_INTRINSICS */ #endif /* HAVE_MMX_INTEL_MNEMONICS */ /* For debugging purposes, zero the whole master structure. @@ -479,7 +493,7 @@ static int mmxsupport() else return 0; } -#endif /* HAVE_MMX_INTEL_MNEMONICS */ +#endif #ifdef HAVE_SSE2_INTEL_MNEMONICS static int sse2support() @@ -492,5 +506,25 @@ static int sse2support() else return 2; } +#else +#ifdef HAVE_SSE2_INTRINSICS +static int sse2supportGCC() +{ + + /* Mac Intel started with Core Duo chips which have SSE2 Support */ + +#if defined(__GNUC__) && defined(__i386__) +#if defined(XP_MACOSX) + return 1; +#endif /* XP_MACOSX */ +#endif /* GNUC && i386 */ + + /* Add checking for SSE2 support for other platforms here */ + + /* We don't have SSE2 intrinsics support */ + + return 2; +} +#endif /* HAVE_SSE2_INTRINSICS */ #endif /* HAVE_SSE2_INTEL_MNEMONICS */ diff --git a/jpeg/jdcoefct.c b/jpeg/jdcoefct.c index 24cd201f2fa..4938d20fcb6 100644 --- a/jpeg/jdcoefct.c +++ b/jpeg/jdcoefct.c @@ -45,11 +45,11 @@ typedef struct { * In multi-pass modes, this array points to the current MCU's blocks * within the virtual arrays; it is used only by the input side. */ - SSE2_ALIGN JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU]; + JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU]; #ifdef D_MULTISCAN_FILES_SUPPORTED /* In multi-pass modes, we need a virtual block array for each component. */ - SSE2_ALIGN jvirt_barray_ptr whole_image[MAX_COMPONENTS]; + jvirt_barray_ptr whole_image[MAX_COMPONENTS]; #endif #ifdef BLOCK_SMOOTHING_SUPPORTED @@ -471,7 +471,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) jpeg_component_info *compptr; inverse_DCT_method_ptr inverse_DCT; boolean first_row, last_row; - SSE2_ALIGN JBLOCK workspace; + JBLOCK workspace; int *coef_bits; JQUANT_TBL *quanttbl; INT32 Q00,Q01,Q02,Q10,Q11,Q20, num; @@ -723,21 +723,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer) JBLOCKROW buffer; int i; -#ifdef HAVE_SSE2_INTRINSICS - /* Align on 16-byte boundaries for the SSE2 code. */ - /* This won't work for 64-bit systems so conditional code it. */ - - PRUptrdiff buffer_shift; - - buffer_shift = (PRUptrdiff) - (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK) + 15); - buffer = (JBLOCKROW) ((buffer_shift + 15) & ~15); -#else buffer = (JBLOCKROW) (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); -#endif /* HAVE_SSE2_INTRINSICS */ for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) { coef->MCU_buffer[i] = buffer + i; } diff --git a/jpeg/jddctmgr.c b/jpeg/jddctmgr.c index dd6f2cbb1ef..3a0e8fd399a 100644 --- a/jpeg/jddctmgr.c +++ b/jpeg/jddctmgr.c @@ -19,7 +19,7 @@ #include "jinclude.h" #include "jpeglib.h" #include "jdct.h" /* Private declarations for DCT subsystem */ -#ifdef HAVE_SSE2_INTEL_MNEMONICS +#ifdef HAVE_SSE2_INTRINSICS extern int SSE2Available; #endif @@ -80,7 +80,6 @@ typedef union { #endif #endif -#ifdef HAVE_SSE2_INTEL_MNEMONICS GLOBAL(void) jpeg_idct_islow_sse2 ( j_decompress_ptr cinfo, @@ -88,16 +87,7 @@ jpeg_idct_islow_sse2 ( JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col); -#endif /* HAVE_SSE2_INTEL_MNEMONICS */ -#ifdef HAVE_SSE2_INTRINSICS -jpeg_idct_ifast_sse2 ( - j_decompress_ptr cinfo, - jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, - JDIMENSION output_col); -#endif /* HAVE_SSE2_INTRINSICS */ /* * Prepare for an output pass. @@ -157,13 +147,21 @@ start_pass (j_decompress_ptr cinfo) #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: -#ifdef HAVE_SSE2_INTRINSICS - method_ptr = jpeg_idct_ifast_sse2; - method = JDCT_IFAST; +#ifdef HAVE_SSE2_INTEL_MNEMONICS + if (SSE2Available==1) + { + method_ptr = jpeg_idct_islow_sse2; + method = JDCT_ISLOW; + } + else + { + method_ptr = jpeg_idct_ifast; + method = JDCT_IFAST; + } #else - method_ptr = jpeg_idct_ifast; - method = JDCT_IFAST; -#endif /* HAVE_SSE2_INTRINSICS */ + method_ptr = jpeg_idct_ifast; + method = JDCT_IFAST; +#endif /* HAVE_SSE2_INTEL_MNEMONICS */ break; #endif @@ -299,21 +297,9 @@ jinit_inverse_dct (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Allocate and pre-zero a multiplier table for each component */ -#ifdef HAVE_SSE2_INTRINSICS - - /* Align dct_table on 16 bytes for SSE2 IDCT IFAST optimization */ - - PRUptrdiff buffer_pointer; - - buffer_pointer = (PRUptrdiff) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(multiplier_table) + 15); - compptr->dct_table = (void *) ((buffer_pointer + 15) & ~15); -#else compptr->dct_table = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(multiplier_table)); -#endif /* HAVE_SSE2_INTRINSICS */ MEMZERO(compptr->dct_table, SIZEOF(multiplier_table)); /* Mark multiplier table not yet set up for any method */ idct->cur_method[ci] = -1; diff --git a/jpeg/jidctfst.c b/jpeg/jidctfst.c index 1203eef9bf7..a94455a0e50 100644 --- a/jpeg/jidctfst.c +++ b/jpeg/jidctfst.c @@ -171,13 +171,6 @@ jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY output_buf, JDIMENSION output_col); #endif -#ifdef HAVE_SSE2_INTRINSICS -GLOBAL(void) -jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col); -#endif /* HAVE_SSE2_INTRINSICS */ - GLOBAL(void) jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, @@ -1654,458 +1647,4 @@ jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, } #endif -#ifdef HAVE_SSE2_INTRINSICS -#include "xmmintrin.h" -#include "emmintrin.h" - -/* These values are taken from the decimal shifted left 14. When multiplying */ -/* by these constants, we have to shift left the other multiplier so that we */ -/* we can use pmulhw which will descale us by the 16 bits. */ - -SSE2_ALIGN unsigned short SSE2_1_414213562[8] = - {23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170}; -SSE2_ALIGN unsigned short SSE2_1_847759065[8] = - {30274, 30274, 30274, 30274, 30274, 30274, 30274, 30274}; -SSE2_ALIGN unsigned short SSE2_1_082392200[8] = - {17734, 17734, 17734, 17734, 17734, 17734, 17734, 17734}; -SSE2_ALIGN unsigned short SSE2_NEG_0_765366865[8] = - {52996, 52996, 52996, 52996, 52996, 52996, 52996, 52996}; - -/* This following is to compensate for range_limit and is shifted five to */ -/* the left as that's what we're scaled for at the time. */ - -SSE2_ALIGN unsigned short SSE2_ADD_128[8] = - {4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096}; - -/*---------------------------------------------------------------------------*/ - -inline GLOBAL(void) - jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) -{ - __m128i row0, row1, row2, row3, row4, row5, row6, row7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m128i tmp10, tmp11, tmp12, tmp13, z10, z11, z12, z13; - __m128i xps0, xps1, xps2, xps3, rowa; - __m128i * quantptr; - __m128i * coefptr; - JSAMPROW outptr0, outptr1, outptr2, outptr3, outptr4, outptr5, outptr6, outptr7; - - /* ====================================================================== */ - /* Load registers from the coefficient block */ - /* ====================================================================== */ - - quantptr = (__m128i *) compptr->dct_table; - coefptr = (__m128i *) coef_block; - - row0 = *(coefptr+0); - row1 = *(coefptr+1); - row2 = *(coefptr+2); - row3 = *(coefptr+3); - row4 = *(coefptr+4); - row5 = *(coefptr+5); - row6 = *(coefptr+6); - row7 = *(coefptr+7); - row0 = _mm_mullo_epi16(row0, *(quantptr+0)); - row1 = _mm_mullo_epi16(row1, *(quantptr+1)); - row2 = _mm_mullo_epi16(row2, *(quantptr+2)); - row3 = _mm_mullo_epi16(row3, *(quantptr+3)); - row4 = _mm_mullo_epi16(row4, *(quantptr+4)); - row5 = _mm_mullo_epi16(row5, *(quantptr+5)); - row6 = _mm_mullo_epi16(row6, *(quantptr+6)); - row7 = _mm_mullo_epi16(row7, *(quantptr+7)); - - /* ====================================================================== */ - /* Inverse Discrete Cosine Transform Column Processing */ - /* ====================================================================== */ - - /* Even part */ - - /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */ - /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ - - tmp10 = _mm_add_epi16(row0, row4); - tmp11 = _mm_sub_epi16(row0, row4); - - /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ - /* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */ - /* - tmp13; */ - - tmp13 = _mm_add_epi16(row2, row6); - tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2), - *((__m128i *) SSE2_1_414213562)); - tmp12 = _mm_sub_epi16(tmp12, tmp13); - - /* tmp0 = tmp10 + tmp13; */ - /* tmp3 = tmp10 - tmp13; */ - /* tmp1 = tmp11 + tmp12; */ - /* tmp2 = tmp11 - tmp12; */ - - tmp0 = _mm_add_epi16(tmp10, tmp13); - tmp3 = _mm_sub_epi16(tmp10, tmp13); - tmp1 = _mm_add_epi16(tmp11, tmp12); - tmp2 = _mm_sub_epi16(tmp11, tmp12); - - /* Odd part */ - - /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ - /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ - /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ - /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ - - z13 = _mm_add_epi16(row5, row3); - z10 = _mm_sub_epi16(row5, row3); - z11 = _mm_add_epi16(row1, row7); - z12 = _mm_sub_epi16(row1, row7); - - /* phase 5 */ - - /* tmp7 = z11 + z13; */ - /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */ - - tmp7 = _mm_add_epi16(z11, z13); - tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2), - *((__m128i *) SSE2_1_414213562)); - - /* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */ - /* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */ - /* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */ - - /* This is really a mess as the negative 2.61 doesn't work well */ - /* as we're working with 16-bit integers. So we're going to apply */ - /* a little algebra to get: */ - - /* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */ - /* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */ - - /* or */ - - /* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */ - /* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */ - - z10 = _mm_slli_epi16(z10, 2); - z12 = _mm_slli_epi16(z12, 2); - - tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)), - _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065))); - tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)), - _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065))); - - /* phase 2 */ - - /* tmp6 = tmp12 - tmp7; */ - /* tmp5 = tmp11 - tmp6; */ - /* tmp4 = tmp10 + tmp5; */ - - tmp6 = _mm_sub_epi16(tmp12, tmp7); - tmp5 = _mm_sub_epi16(tmp11, tmp6); - tmp4 = _mm_add_epi16(tmp10, tmp5); - - /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row0 = _mm_add_epi16(tmp0, tmp7); - row7 = _mm_sub_epi16(tmp0, tmp7); - - /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row1 = _mm_add_epi16(tmp1, tmp6); - row6 = _mm_sub_epi16(tmp1, tmp6); - - /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row2 = _mm_add_epi16(tmp2, tmp5); - row5 = _mm_sub_epi16(tmp2, tmp5); - - /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row4 = _mm_add_epi16(tmp3, tmp4); - row3 = _mm_sub_epi16(tmp3, tmp4); - - /* ====================================================================== */ - /* Transpose Matrix (Word) */ - /* ====================================================================== */ - - /* Save off the low quadwords from the even rows */ - - xps0 = _mm_move_epi64(row0); - xps1 = _mm_move_epi64(row2); - xps2 = _mm_move_epi64(row4); - xps3 = _mm_move_epi64(row6); - - /* Interleave the high quadwords */ - - row0 = _mm_unpackhi_epi16(row0, row1); /* 04 14 05 15 06 16 07 17 */ - row2 = _mm_unpackhi_epi16(row2, row3); /* 24 34 25 35 26 36 27 37 */ - row4 = _mm_unpackhi_epi16(row4, row5); /* 44 54 45 55 46 56 47 57 */ - row6 = _mm_unpackhi_epi16(row6, row7); /* 64 74 65 75 66 76 67 77 */ - - rowa = row0; - row0 = _mm_unpacklo_epi32(row0, row2); /* 04 14 24 34 05 15 25 35 */ - rowa = _mm_unpackhi_epi32(rowa, row2); /* 06 16 26 36 07 17 27 37 */ - - row2 = row4; - row4 = _mm_unpacklo_epi32(row4, row6); /* 44 54 64 74 45 55 65 75 */ - row2 = _mm_unpackhi_epi32(row2, row6); /* 46 56 66 76 47 57 67 77 */ - - row6 = row0; - row0 = _mm_unpacklo_epi64(row0, row4); /* 04 14 24 34 44 54 64 74 */ - row6 = _mm_unpackhi_epi64(row6, row4); /* 05 15 25 35 45 55 65 75 */ - - row4 = rowa; - rowa = _mm_unpacklo_epi64(rowa, row2); /* 06 16 26 36 46 56 66 76 */ - row4 = _mm_unpackhi_epi64(row4, row2); /* 07 17 27 37 47 57 67 77 */ - - /* row0, row6, rowa, row4 now contain *4, *5, *6, *7 */ - - /* Interleave the low quadwords */ - - xps0 = _mm_unpacklo_epi16(xps0, row1); /* 00 10 01 11 02 12 03 13 */ - xps1 = _mm_unpacklo_epi16(xps1, row3); /* 20 30 21 31 22 32 23 33 */ - xps2 = _mm_unpacklo_epi16(xps2, row5); /* 40 50 41 51 42 52 43 43 */ - xps3 = _mm_unpacklo_epi16(xps3, row7); /* 60 70 61 71 62 72 63 73 */ - - row2 = xps0; - xps0 = _mm_unpacklo_epi32(xps0, xps1); /* 00 10 20 30 01 11 21 31 */ - row2 = _mm_unpackhi_epi32(row2, xps1); /* 02 12 22 32 03 13 23 33 */ - - xps1 = xps2; - xps2 = _mm_unpacklo_epi32(xps2, xps3); /* 40 50 60 70 41 51 61 71 */ - xps1 = _mm_unpackhi_epi32(xps1, xps3); /* 42 52 62 72 43 53 63 73 */ - - xps3 = xps0; - xps0 = _mm_unpacklo_epi64(xps0, xps2); /* 00 10 20 30 40 50 60 70 */ - xps3 = _mm_unpackhi_epi64(xps3, xps2); /* 01 11 21 31 41 51 61 71 */ - - xps2 = row2; - row2 = _mm_unpacklo_epi64(row2, xps1); /* 02 12 22 32 42 52 62 72 */ - xps2 = _mm_unpackhi_epi64(xps2, xps1); /* 03 13 23 33 43 53 63 73 */ - - /* xps0, xps3, row2, xps2 now contain *0, *1, *2, *3 */ - - /* Restructure back to row0 to row7. */ - - row7 = row4; - row5 = row6; - row6 = rowa; - row4 = row0; - row0 = xps0; - row1 = xps3; - row3 = xps2; - - /* ====================================================================== */ - /* Inverse Discrete Cosine Transform Row Processing */ - /* ====================================================================== */ - - /* Even part */ - - /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */ - /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ - - tmp10 = _mm_add_epi16(row0, row4); - tmp11 = _mm_sub_epi16(row0, row4); - - /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ - /* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */ - /* - tmp13; */ - - tmp13 = _mm_add_epi16(row2, row6); - tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2), - *((__m128i *) SSE2_1_414213562)); - tmp12 = _mm_sub_epi16(tmp12, tmp13); - - /* tmp0 = tmp10 + tmp13; */ - /* tmp3 = tmp10 - tmp13; */ - /* tmp1 = tmp11 + tmp12; */ - /* tmp2 = tmp11 - tmp12; */ - - tmp0 = _mm_add_epi16(tmp10, tmp13); - tmp3 = _mm_sub_epi16(tmp10, tmp13); - tmp1 = _mm_add_epi16(tmp11, tmp12); - tmp2 = _mm_sub_epi16(tmp11, tmp12); - - /* Odd part */ - - /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ - /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ - /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ - /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ - - z13 = _mm_add_epi16(row5, row3); - z10 = _mm_sub_epi16(row5, row3); - z11 = _mm_add_epi16(row1, row7); - z12 = _mm_sub_epi16(row1, row7); - - /* phase 5 */ - - /* tmp7 = z11 + z13; */ - /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */ - - tmp7 = _mm_add_epi16(z11, z13); - tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2), - *((__m128i *) SSE2_1_414213562)); - - /* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */ - /* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */ - /* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */ - - /* This is really a mess as the negative 2.61 doesn't work well */ - /* as we're working with 16-bit integers. So we're going to apply */ - /* a little algebra to get: */ - - /* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */ - /* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */ - - /* or */ - - /* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */ - /* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */ - - z10 = _mm_slli_epi16(z10, 2); - z12 = _mm_slli_epi16(z12, 2); - - tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)), - _mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065))); - tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)), - _mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065))); - - /* phase 2 */ - - /* tmp6 = tmp12 - tmp7; */ - /* tmp5 = tmp11 - tmp6; */ - /* tmp4 = tmp10 + tmp5; */ - - tmp6 = _mm_sub_epi16(tmp12, tmp7); - tmp5 = _mm_sub_epi16(tmp11, tmp6); - tmp4 = _mm_add_epi16(tmp10, tmp5); - - /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row0 = _mm_add_epi16(tmp0, tmp7); - row7 = _mm_sub_epi16(tmp0, tmp7); - - /* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row1 = _mm_add_epi16(tmp1, tmp6); - row6 = _mm_sub_epi16(tmp1, tmp6); - - /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row2 = _mm_add_epi16(tmp2, tmp5); - row5 = _mm_sub_epi16(tmp2, tmp5); - - /* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ - /* & RANGE_MASK]; */ - - row4 = _mm_add_epi16(tmp3, tmp4); - row3 = _mm_sub_epi16(tmp3, tmp4); - - /* ====================================================================== */ - /* Transpose Matrix (Byte) */ - /* ====================================================================== */ - - /* Shift right after adding half to descale */ - - row0 = _mm_add_epi16(row0, *((__m128i *) SSE2_ADD_128)); - row1 = _mm_add_epi16(row1, *((__m128i *) SSE2_ADD_128)); - row2 = _mm_add_epi16(row2, *((__m128i *) SSE2_ADD_128)); - row3 = _mm_add_epi16(row3, *((__m128i *) SSE2_ADD_128)); - row4 = _mm_add_epi16(row4, *((__m128i *) SSE2_ADD_128)); - row5 = _mm_add_epi16(row5, *((__m128i *) SSE2_ADD_128)); - row6 = _mm_add_epi16(row6, *((__m128i *) SSE2_ADD_128)); - row7 = _mm_add_epi16(row7, *((__m128i *) SSE2_ADD_128)); - - row0 = _mm_srai_epi16(row0, 5); - row1 = _mm_srai_epi16(row1, 5); - row2 = _mm_srai_epi16(row2, 5); - row3 = _mm_srai_epi16(row3, 5); - row4 = _mm_srai_epi16(row4, 5); - row5 = _mm_srai_epi16(row5, 5); - row6 = _mm_srai_epi16(row6, 5); - row7 = _mm_srai_epi16(row7, 5); - - /* Range Limit and convert Words to Bytes */ - - row0 = _mm_packus_epi16(row0, row4); - row1 = _mm_packus_epi16(row1, row5); - row2 = _mm_packus_epi16(row2, row6); - row3 = _mm_packus_epi16(row3, row7); - - /* Transpose from rows back to columns */ - - row4 = _mm_unpackhi_epi8(row0, row1); /* 40 50 41 51 42 52 43 43 ... */ - row5 = _mm_unpackhi_epi8(row2, row3); /* 60 70 61 71 62 72 63 73 ... */ - row0 = _mm_unpacklo_epi8(row0, row1); /* 00 10 01 11 02 12 03 13 ... */ - row2 = _mm_unpacklo_epi8(row2, row3); /* 20 30 21 31 22 32 23 33 ... */ - - row6 = _mm_unpackhi_epi16(row0, row2); /* 04 14 24 34 05 15 25 35 ... */ - row7 = _mm_unpackhi_epi16(row4, row5); /* 44 54 64 74 45 55 65 75 ... */ - row0 = _mm_unpacklo_epi16(row0, row2); /* 00 10 20 30 01 11 21 31 ... */ - row4 = _mm_unpacklo_epi16(row4, row5); /* 40 50 60 70 41 51 61 71 ... */ - - row1 = _mm_unpackhi_epi32(row0, row4); /* 02 12 22 32 42 52 62 72 ... */ - row2 = _mm_unpackhi_epi32(row6, row7); /* 06 16 26 36 46 56 66 76 ... */ - row0 = _mm_unpacklo_epi32(row0, row4); /* 00 10 20 30 40 50 60 70 ... */ - row6 = _mm_unpacklo_epi32(row6, row7); /* 04 14 24 34 55 54 64 74 ... */ - - /* Rearrange for readability */ - - row3 = row2; - row2 = row6; - - /* ====================================================================== */ - /* Final output */ - /* ====================================================================== */ - - outptr0 = output_buf[0] + output_col; - outptr1 = output_buf[1] + output_col; - outptr2 = output_buf[2] + output_col; - outptr3 = output_buf[3] + output_col; - outptr4 = output_buf[4] + output_col; - outptr5 = output_buf[5] + output_col; - outptr6 = output_buf[6] + output_col; - outptr7 = output_buf[7] + output_col; - - /* We can't use the faster FP instructions to write out the high quad */ - /* so do a rotate right and write them out as the low quad */ - - row4 = _mm_srli_si128(row0, 8); - row5 = _mm_srli_si128(row1, 8); - row6 = _mm_srli_si128(row2, 8); - row7 = _mm_srli_si128(row3, 8); - - _mm_storel_epi64((__m128i *) outptr0, row0); - _mm_storel_epi64((__m128i *) outptr2, row1); - _mm_storel_epi64((__m128i *) outptr4, row2); - _mm_storel_epi64((__m128i *) outptr6, row3); - _mm_storel_epi64((__m128i *) outptr1, row4); - _mm_storel_epi64((__m128i *) outptr3, row5); - _mm_storel_epi64((__m128i *) outptr5, row6); - _mm_storel_epi64((__m128i *) outptr7, row7); -} -#endif /* HAVE_SSE2_INTRINSICS */ - #endif /* DCT_IFAST_SUPPORTED */ diff --git a/jpeg/jmemmgr.c b/jpeg/jmemmgr.c index 0c3823a81ca..d801b322da0 100644 --- a/jpeg/jmemmgr.c +++ b/jpeg/jmemmgr.c @@ -449,9 +449,6 @@ alloc_barray (j_common_ptr cinfo, int pool_id, JBLOCKROW workspace; JDIMENSION rowsperchunk, currow, i; long ltemp; -#ifdef HAVE_SSE2_INTRINSICS - PRUptrdiff workspace_raw; -#endif /* Calculate max # of rows allowed in one allocation chunk */ ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) / @@ -472,16 +469,9 @@ alloc_barray (j_common_ptr cinfo, int pool_id, currow = 0; while (currow < numrows) { rowsperchunk = MIN(rowsperchunk, numrows - currow); -#ifdef HAVE_SSE2_INTRINSICS - workspace_raw = (JBLOCKROW) alloc_large(cinfo, pool_id, - (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow - * SIZEOF(JBLOCK) + 15)); - workspace = (JBLOCKROW) ((workspace_raw + 15) & ~15); -#else workspace = (JBLOCKROW) alloc_large(cinfo, pool_id, (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow * SIZEOF(JBLOCK))); -#endif /* HAVE_SSE2_INTRINSICS */ for (i = rowsperchunk; i > 0; i--) { result[currow++] = workspace; workspace += blocksperrow; diff --git a/jpeg/jmorecfg.h b/jpeg/jmorecfg.h index a1b3a1cda2f..82ae712b429 100644 --- a/jpeg/jmorecfg.h +++ b/jpeg/jmorecfg.h @@ -107,30 +107,22 @@ typedef short JCOEF; /* Defines for MMX/SSE2 support. */ -/* HAVE_SSE2_INTEL_MNEMONICS means Win32 and MSVC. HAVE_SSE2_INTRINSICS */ -/* means that the platform supports SSE2 or higher so that we don't */ -/* have to check runtime CPU capabilities. */ - #if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__) -#define HAVE_MMX_INTEL_MNEMONICS +#define HAVE_MMX_INTEL_MNEMONICS + +/* SSE2 code appears broken for some cpus (bug 247437) */ #define HAVE_SSE2_INTEL_MNEMONICS +#define HAVE_SSE2_INTRINSICS #endif -#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX) +#if defined(__GNUC__) && defined(__i386__) +#if defined(XP_MACOSX) #define HAVE_SSE2_INTRINSICS -#endif /* GNUC && i386 && XP_MACOSX */ +#endif /* ! XP_MACOSX */ +#endif /* ! GNUC && i386 */ /* Add support for other platforms here */ -/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */ -/* We're only aligning Mac OSX structures right now */ - -#ifdef HAVE_SSE2_INTRINSICS -#define SSE2_ALIGN __attribute__ ((aligned (16))) -#else -#define SSE2_ALIGN -#endif - /* Compressed datastreams are represented as arrays of JOCTET. * These must be EXACTLY 8 bits wide, at least once they are written to @@ -328,7 +320,7 @@ typedef unsigned char boolean; /* Capability options common to encoder and decoder: */ #define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ -#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ +#undef DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ #undef DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */ /* Encoder capability options: */ diff --git a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp index ec82959301c..8867dbfc428 100644 --- a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp +++ b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp @@ -56,10 +56,6 @@ #include "gfxPlatform.h" -#if defined(GFX_SSE2_POSSIBLE) -#include "emmintrin.h" -#endif /* GFX_SSE2_POSSIBLE */ - extern "C" { #include "iccjpeg.h" @@ -538,11 +534,7 @@ nsresult nsJPEGDecoder::ProcessData(const char *data, PRUint32 count, PRUint32 * /* FIXME -- Should reset dct_method and dither mode * for final pass of progressive JPEG */ -#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX) - mInfo.dct_method = JDCT_IFAST; -#else mInfo.dct_method = JDCT_ISLOW; -#endif /* __GNUC__ & __i386__ & XP_MACOSX */ mInfo.dither_mode = JDITHER_FS; mInfo.do_fancy_upsampling = TRUE; mInfo.enable_2pass_quant = FALSE; @@ -1268,16 +1260,6 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * sizeof(int)] ={ #define RIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif -/* Constants for SSE2 support */ - -#ifdef GFX_SSE2_POSSIBLE -GFX_SSE2_ALIGN const short color_constants_1[8] = {128, 128, 128, 128, 128, 128, 128, 128}; -GFX_SSE2_ALIGN const short color_constants_2[8] = {359, 359, 359, 359, 359, 359, 359, 359}; -GFX_SSE2_ALIGN const short color_constants_3[8] = {88, 88, 88, 88, 88, 88, 88, 88}; -GFX_SSE2_ALIGN const short color_constants_4[8] = {183, 183, 183, 183, 183, 183, 183, 183}; -GFX_SSE2_ALIGN const short color_constants_5[8] = {454, 454, 454, 454, 454, 454, 454, 454}; -GFX_SSE2_ALIGN const short color_constants_6[8] = {11382, 11382, 11382, 11382, 11382, 11382, 11382, 11382}; -#endif /* GFX_SSE2_POSSIBLE */ METHODDEF(void) ycc_rgb_convert_argb (j_decompress_ptr cinfo, @@ -1287,142 +1269,31 @@ ycc_rgb_convert_argb (j_decompress_ptr cinfo, JDIMENSION num_cols = cinfo->output_width; JSAMPLE * range_limit = cinfo->sample_range_limit; -/* Locals for SSE2 support */ - -#ifdef GFX_SSE2_POSSIBLE - JSAMPROW inptr0, inptr1, inptr2, outptr; - JDIMENSION col; - __m128i row_Y, row_cb, row_cr, zeroes, ones; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - __m128i row_B, row_G, row_R; -#endif /* GFX_SSE2_POSSIBLE */ - SHIFT_TEMPS -#ifdef GFX_SSE2_POSSIBLE - if (GFX_SSE2_AVAILABLE == 1) { - - /* Load registers with constants if we're going to do at least one quad */ - - if (num_cols >= 8) { - zeroes = _mm_setzero_si128(); - ones = _mm_cmpeq_epi8(zeroes, zeroes); - } - - while (--num_rows >= 0) { - inptr0 = input_buf[0][input_row]; - inptr1 = input_buf[1][input_row]; - inptr2 = input_buf[2][input_row]; - input_row++; - outptr = *output_buf++; - - /* YCbCr -> RGB using scaled multiplication */ - - for (col = 0; col + 8 <= num_cols; col += 8) { - - /* Read a row of each color component */ - - row_Y = _mm_loadl_epi64((__m128i *) (inptr0 + col)); - row_cb = _mm_loadl_epi64((__m128i *) inptr1); - row_cr = _mm_loadl_epi64((__m128i *) inptr2); - - /* Convert bytes to words */ - - row_Y = _mm_unpacklo_epi8(row_Y, zeroes); - row_cb = _mm_unpacklo_epi8(row_cb, zeroes); - row_cr = _mm_unpacklo_epi8(row_cr, zeroes); - - /* Compute B and R */ - - tmp0 = _mm_sub_epi16(row_cb, *((__m128i *) color_constants_1)); /* cb - 128 */ - tmp1 = _mm_sub_epi16(row_cr, *((__m128i *) color_constants_1)); /* cr - 128 */ - tmp2 = _mm_slli_epi16(tmp0, 8); /* (cb - 128) << 8 */ - tmp3 = _mm_slli_epi16(tmp1, 8); /* (cr - 128) << 8 */ - tmp2 = _mm_mulhi_epi16(tmp2, *((__m128i *) color_constants_5)); /* (454 * ((cb - 128) << 8)) >> 16 */ - tmp3 = _mm_mulhi_epi16(tmp3, *((__m128i *) color_constants_2)); /* (359 * ((cr - 128) << 8)) >> 16 */ - row_B = _mm_add_epi16(tmp2, row_Y); /* B = y + (454 * ((cb - 128) << 8)) >> 16 */ - row_R = _mm_add_epi16(tmp3, row_Y); /* R = y + (359 * ((cr - 128) << 8)) >> 16 */ - - /* Compute G */ - - tmp4 = _mm_mullo_epi16(row_cb, *((__m128i *) color_constants_3)); /* 88 * cb */ - tmp5 = _mm_mullo_epi16(tmp1, *((__m128i *) color_constants_4)); /* 183 * (cr - 128) */ - tmp4 = _mm_sub_epi16(tmp4, *((__m128i *) color_constants_6)); /* 88 * cb - (34806 - 128*256) */ - tmp4 = _mm_add_epi16(tmp4, tmp5); /* (88 * cb) + 183 * (cr - 128) - (34806 - 128*256) */ - tmp4 = _mm_srai_epi16(tmp4, 8); /* Shift the above right by 8 to divide by 256 */ - row_G = _mm_sub_epi16(row_Y, tmp4); /* G */ - - /* Pack word to byte with unsigned saturation which range limits too */ - - row_R = _mm_packus_epi16(row_R, zeroes); - row_G = _mm_packus_epi16(row_G, zeroes); - row_B = _mm_packus_epi16(row_B, zeroes); - - /* Pack horizontally */ - - tmp0 = _mm_unpacklo_epi8(row_B, row_G); /* ABAB ABAB ABAB ABAB */ - tmp1 = _mm_unpacklo_epi8(row_R, ones); /* GRGR GRGR GRGR GRGR */ - tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); /* ARGB ARGB ARGB ARGB (low) */ - tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); /* ARGB ARGB ARGB ARGB (high) */ - - /* Write out 32 bytes of ARGB */ - - _mm_storel_epi64((__m128i *) outptr, tmp2); - _mm_storel_epi64((__m128i *) (outptr + 8), _mm_srli_si128(tmp2, 8)); - _mm_storel_epi64((__m128i *) (outptr + 16), tmp3); - _mm_storel_epi64((__m128i *) (outptr + 24), _mm_srli_si128(tmp3, 8)); - - /* Slide the pointers */ - - /* inptr0 += 8; */ - inptr1 += 8; - inptr2 += 8; - outptr += 32; - } - - /* Process remainder using scalar code */ - for (; col < num_cols; col++) { - int y = GETJSAMPLE(inptr0[col]); - int cb = GETJSAMPLE(inptr1[0]); - int cr = GETJSAMPLE(inptr2[0]); - inptr1++; - inptr2++; - JSAMPLE * range_limit_y = range_limit + y; - /* Range-limiting is essential due to noise introduced by DCT losses. */ - *((PRUint32 *) outptr) = 0xFF000000 | - ( range_limit_y[Cr_r_tab[cr]] << 16 ) | - ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) | - ( range_limit_y[Cb_b_tab[cb]] ); - outptr += 4; - } - } - - } else -#endif /* GFX_SSE2_POSSIBLE */ - /* This is used if we don't have SSE2 */ - { - while (--num_rows >= 0) { - JSAMPROW inptr0 = input_buf[0][input_row]; - JSAMPROW inptr1 = input_buf[1][input_row]; - JSAMPROW inptr2 = input_buf[2][input_row]; - input_row++; - PRUint32 *outptr = (PRUint32 *) *output_buf++; - for (JDIMENSION col = 0; col < num_cols; col++) { - int y = GETJSAMPLE(inptr0[col]); - int cb = GETJSAMPLE(inptr1[col]); - int cr = GETJSAMPLE(inptr2[col]); - JSAMPLE * range_limit_y = range_limit + y; - /* Range-limiting is essential due to noise introduced by DCT losses. */ - outptr[col] = 0xFF000000 | - ( range_limit_y[Cr_r_tab[cr]] << 16 ) | - ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) | - ( range_limit_y[Cb_b_tab[cb]] ); - } + + while (--num_rows >= 0) { + JSAMPROW inptr0 = input_buf[0][input_row]; + JSAMPROW inptr1 = input_buf[1][input_row]; + JSAMPROW inptr2 = input_buf[2][input_row]; + input_row++; + PRUint32 *outptr = (PRUint32 *) *output_buf++; + for (JDIMENSION col = 0; col < num_cols; col++) { + int y = GETJSAMPLE(inptr0[col]); + int cb = GETJSAMPLE(inptr1[col]); + int cr = GETJSAMPLE(inptr2[col]); + JSAMPLE * range_limit_y = range_limit + y; + /* Range-limiting is essential due to noise introduced by DCT losses. */ + outptr[col] = 0xFF000000 | + ( range_limit_y[Cr_r_tab[cr]] << 16 ) | + ( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) | + ( range_limit_y[Cb_b_tab[cb]] ); } } } + /**************** Inverted CMYK -> RGB conversion **************/ /* * Input is (Inverted) CMYK stored as 4 bytes per pixel.