mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Backed out changeset 6eec92f9276a from bug 416157 because the image tests need to be updated before it can be landed.
This commit is contained in:
parent
f9e264bb68
commit
b89ad17341
@ -49,32 +49,6 @@
|
||||
#undef OS2EMX_PLAIN_CHAR
|
||||
#endif
|
||||
|
||||
/*
|
||||
* SSE2 Support Defines
|
||||
*
|
||||
* GFX_SSE2_POSSIBLE is defined if SSE2 support may be available.
|
||||
* On Mac/OSX Intel, SSE2 is always available. On Windows, SSE2
|
||||
* support is possible if the chip supports it so GFX_SSE2_POSSIBLE
|
||||
* is defined for Windows.
|
||||
*
|
||||
* GFX_SSE2_AVAILABLE indicates whether runtime SSE2 support is
|
||||
* available. On Mac/OSX Intel, it is defined to 1 (TRUE). On Windows
|
||||
* __sse2_available indicates whether support is available or not.
|
||||
*/
|
||||
|
||||
#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
|
||||
#define GFX_SSE2_POSSIBLE
|
||||
#define GFX_SSE2_AVAILABLE __sse2_available
|
||||
#define GFX_SSE2_ALIGN __declspec(align(16))
|
||||
extern "C" int __sse2_available;
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
|
||||
#define GFX_SSE2_POSSIBLE
|
||||
#define GFX_SSE2_AVAILABLE 1
|
||||
#define GFX_SSE2_ALIGN __attribute__ ((aligned (16)))
|
||||
#endif
|
||||
|
||||
typedef void* cmsHPROFILE;
|
||||
typedef void* cmsHTRANSFORM;
|
||||
|
||||
|
@ -95,3 +95,13 @@
|
||||
|
||||
#endif /* JPEG_CJPEG_DJPEG */
|
||||
|
||||
/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
|
||||
|
||||
#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
|
||||
#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
|
||||
#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
|
||||
#else
|
||||
#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
|
||||
#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
|
||||
#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
|
||||
|
||||
|
@ -67,10 +67,14 @@ int MMXAvailable;
|
||||
static int mmxsupport();
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
int SSE2Available = 0;
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
static int sse2support();
|
||||
#else
|
||||
static int sse2supportGCC();
|
||||
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
|
||||
|
||||
/*
|
||||
@ -95,10 +99,20 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
|
||||
we know the processor supports cpuid) */
|
||||
if (MMXAvailable)
|
||||
SSE2Available = sse2support();
|
||||
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
|
||||
#endif
|
||||
|
||||
cpuidDetected = 1;
|
||||
}
|
||||
#else
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
static int cpuidDetected = 0;
|
||||
|
||||
if(!cpuidDetected) {
|
||||
SSE2Available = sse2supportGCC();
|
||||
cpuidDetected = 1;
|
||||
}
|
||||
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
#endif /* HAVE_MMX_INTEL_MNEMONICS */
|
||||
|
||||
/* For debugging purposes, zero the whole master structure.
|
||||
@ -479,7 +493,7 @@ static int mmxsupport()
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
#endif /* HAVE_MMX_INTEL_MNEMONICS */
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
static int sse2support()
|
||||
@ -492,5 +506,25 @@ static int sse2support()
|
||||
else
|
||||
return 2;
|
||||
}
|
||||
#else
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
static int sse2supportGCC()
|
||||
{
|
||||
|
||||
/* Mac Intel started with Core Duo chips which have SSE2 Support */
|
||||
|
||||
#if defined(__GNUC__) && defined(__i386__)
|
||||
#if defined(XP_MACOSX)
|
||||
return 1;
|
||||
#endif /* XP_MACOSX */
|
||||
#endif /* GNUC && i386 */
|
||||
|
||||
/* Add checking for SSE2 support for other platforms here */
|
||||
|
||||
/* We don't have SSE2 intrinsics support */
|
||||
|
||||
return 2;
|
||||
}
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
|
||||
|
||||
|
@ -45,11 +45,11 @@ typedef struct {
|
||||
* In multi-pass modes, this array points to the current MCU's blocks
|
||||
* within the virtual arrays; it is used only by the input side.
|
||||
*/
|
||||
SSE2_ALIGN JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
|
||||
JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
|
||||
|
||||
#ifdef D_MULTISCAN_FILES_SUPPORTED
|
||||
/* In multi-pass modes, we need a virtual block array for each component. */
|
||||
SSE2_ALIGN jvirt_barray_ptr whole_image[MAX_COMPONENTS];
|
||||
jvirt_barray_ptr whole_image[MAX_COMPONENTS];
|
||||
#endif
|
||||
|
||||
#ifdef BLOCK_SMOOTHING_SUPPORTED
|
||||
@ -471,7 +471,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
|
||||
jpeg_component_info *compptr;
|
||||
inverse_DCT_method_ptr inverse_DCT;
|
||||
boolean first_row, last_row;
|
||||
SSE2_ALIGN JBLOCK workspace;
|
||||
JBLOCK workspace;
|
||||
int *coef_bits;
|
||||
JQUANT_TBL *quanttbl;
|
||||
INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
|
||||
@ -723,21 +723,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
|
||||
JBLOCKROW buffer;
|
||||
int i;
|
||||
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
/* Align on 16-byte boundaries for the SSE2 code. */
|
||||
/* This won't work for 64-bit systems so conditional code it. */
|
||||
|
||||
PRUptrdiff buffer_shift;
|
||||
|
||||
buffer_shift = (PRUptrdiff)
|
||||
(*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK) + 15);
|
||||
buffer = (JBLOCKROW) ((buffer_shift + 15) & ~15);
|
||||
#else
|
||||
buffer = (JBLOCKROW)
|
||||
(*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
|
||||
coef->MCU_buffer[i] = buffer + i;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@
|
||||
#include "jinclude.h"
|
||||
#include "jpeglib.h"
|
||||
#include "jdct.h" /* Private declarations for DCT subsystem */
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
extern int SSE2Available;
|
||||
#endif
|
||||
|
||||
@ -80,7 +80,6 @@ typedef union {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
GLOBAL(void)
|
||||
jpeg_idct_islow_sse2 (
|
||||
j_decompress_ptr cinfo,
|
||||
@ -88,16 +87,7 @@ jpeg_idct_islow_sse2 (
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col);
|
||||
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
|
||||
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
jpeg_idct_ifast_sse2 (
|
||||
j_decompress_ptr cinfo,
|
||||
jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col);
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
|
||||
/*
|
||||
* Prepare for an output pass.
|
||||
@ -157,13 +147,21 @@ start_pass (j_decompress_ptr cinfo)
|
||||
#endif
|
||||
#ifdef DCT_IFAST_SUPPORTED
|
||||
case JDCT_IFAST:
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
method_ptr = jpeg_idct_ifast_sse2;
|
||||
#ifdef HAVE_SSE2_INTEL_MNEMONICS
|
||||
if (SSE2Available==1)
|
||||
{
|
||||
method_ptr = jpeg_idct_islow_sse2;
|
||||
method = JDCT_ISLOW;
|
||||
}
|
||||
else
|
||||
{
|
||||
method_ptr = jpeg_idct_ifast;
|
||||
method = JDCT_IFAST;
|
||||
}
|
||||
#else
|
||||
method_ptr = jpeg_idct_ifast;
|
||||
method = JDCT_IFAST;
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
#endif /* HAVE_SSE2_INTEL_MNEMONICS */
|
||||
break;
|
||||
|
||||
#endif
|
||||
@ -299,21 +297,9 @@ jinit_inverse_dct (j_decompress_ptr cinfo)
|
||||
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
|
||||
ci++, compptr++) {
|
||||
/* Allocate and pre-zero a multiplier table for each component */
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
|
||||
/* Align dct_table on 16 bytes for SSE2 IDCT IFAST optimization */
|
||||
|
||||
PRUptrdiff buffer_pointer;
|
||||
|
||||
buffer_pointer = (PRUptrdiff)
|
||||
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
SIZEOF(multiplier_table) + 15);
|
||||
compptr->dct_table = (void *) ((buffer_pointer + 15) & ~15);
|
||||
#else
|
||||
compptr->dct_table =
|
||||
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
|
||||
SIZEOF(multiplier_table));
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
|
||||
/* Mark multiplier table not yet set up for any method */
|
||||
idct->cur_method[ci] = -1;
|
||||
|
461
jpeg/jidctfst.c
461
jpeg/jidctfst.c
@ -171,13 +171,6 @@ jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
GLOBAL(void)
|
||||
jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col);
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
|
||||
GLOBAL(void)
|
||||
jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block,
|
||||
@ -1654,458 +1647,4 @@ jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
#include "xmmintrin.h"
|
||||
#include "emmintrin.h"
|
||||
|
||||
/* These values are taken from the decimal shifted left 14. When multiplying */
|
||||
/* by these constants, we have to shift left the other multiplier so that we */
|
||||
/* we can use pmulhw which will descale us by the 16 bits. */
|
||||
|
||||
SSE2_ALIGN unsigned short SSE2_1_414213562[8] =
|
||||
{23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170};
|
||||
SSE2_ALIGN unsigned short SSE2_1_847759065[8] =
|
||||
{30274, 30274, 30274, 30274, 30274, 30274, 30274, 30274};
|
||||
SSE2_ALIGN unsigned short SSE2_1_082392200[8] =
|
||||
{17734, 17734, 17734, 17734, 17734, 17734, 17734, 17734};
|
||||
SSE2_ALIGN unsigned short SSE2_NEG_0_765366865[8] =
|
||||
{52996, 52996, 52996, 52996, 52996, 52996, 52996, 52996};
|
||||
|
||||
/* This following is to compensate for range_limit and is shifted five to */
|
||||
/* the left as that's what we're scaled for at the time. */
|
||||
|
||||
SSE2_ALIGN unsigned short SSE2_ADD_128[8] =
|
||||
{4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096};
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
|
||||
inline GLOBAL(void)
|
||||
jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m128i tmp10, tmp11, tmp12, tmp13, z10, z11, z12, z13;
|
||||
__m128i xps0, xps1, xps2, xps3, rowa;
|
||||
__m128i * quantptr;
|
||||
__m128i * coefptr;
|
||||
JSAMPROW outptr0, outptr1, outptr2, outptr3, outptr4, outptr5, outptr6, outptr7;
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Load registers from the coefficient block */
|
||||
/* ====================================================================== */
|
||||
|
||||
quantptr = (__m128i *) compptr->dct_table;
|
||||
coefptr = (__m128i *) coef_block;
|
||||
|
||||
row0 = *(coefptr+0);
|
||||
row1 = *(coefptr+1);
|
||||
row2 = *(coefptr+2);
|
||||
row3 = *(coefptr+3);
|
||||
row4 = *(coefptr+4);
|
||||
row5 = *(coefptr+5);
|
||||
row6 = *(coefptr+6);
|
||||
row7 = *(coefptr+7);
|
||||
row0 = _mm_mullo_epi16(row0, *(quantptr+0));
|
||||
row1 = _mm_mullo_epi16(row1, *(quantptr+1));
|
||||
row2 = _mm_mullo_epi16(row2, *(quantptr+2));
|
||||
row3 = _mm_mullo_epi16(row3, *(quantptr+3));
|
||||
row4 = _mm_mullo_epi16(row4, *(quantptr+4));
|
||||
row5 = _mm_mullo_epi16(row5, *(quantptr+5));
|
||||
row6 = _mm_mullo_epi16(row6, *(quantptr+6));
|
||||
row7 = _mm_mullo_epi16(row7, *(quantptr+7));
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Inverse Discrete Cosine Transform Column Processing */
|
||||
/* ====================================================================== */
|
||||
|
||||
/* Even part */
|
||||
|
||||
/* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
|
||||
/* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
|
||||
|
||||
tmp10 = _mm_add_epi16(row0, row4);
|
||||
tmp11 = _mm_sub_epi16(row0, row4);
|
||||
|
||||
/* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
|
||||
/* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
|
||||
/* - tmp13; */
|
||||
|
||||
tmp13 = _mm_add_epi16(row2, row6);
|
||||
tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
|
||||
*((__m128i *) SSE2_1_414213562));
|
||||
tmp12 = _mm_sub_epi16(tmp12, tmp13);
|
||||
|
||||
/* tmp0 = tmp10 + tmp13; */
|
||||
/* tmp3 = tmp10 - tmp13; */
|
||||
/* tmp1 = tmp11 + tmp12; */
|
||||
/* tmp2 = tmp11 - tmp12; */
|
||||
|
||||
tmp0 = _mm_add_epi16(tmp10, tmp13);
|
||||
tmp3 = _mm_sub_epi16(tmp10, tmp13);
|
||||
tmp1 = _mm_add_epi16(tmp11, tmp12);
|
||||
tmp2 = _mm_sub_epi16(tmp11, tmp12);
|
||||
|
||||
/* Odd part */
|
||||
|
||||
/* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
|
||||
/* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
|
||||
/* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
|
||||
/* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
|
||||
|
||||
z13 = _mm_add_epi16(row5, row3);
|
||||
z10 = _mm_sub_epi16(row5, row3);
|
||||
z11 = _mm_add_epi16(row1, row7);
|
||||
z12 = _mm_sub_epi16(row1, row7);
|
||||
|
||||
/* phase 5 */
|
||||
|
||||
/* tmp7 = z11 + z13; */
|
||||
/* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
|
||||
|
||||
tmp7 = _mm_add_epi16(z11, z13);
|
||||
tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
|
||||
*((__m128i *) SSE2_1_414213562));
|
||||
|
||||
/* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
|
||||
/* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
|
||||
/* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
|
||||
|
||||
/* This is really a mess as the negative 2.61 doesn't work well */
|
||||
/* as we're working with 16-bit integers. So we're going to apply */
|
||||
/* a little algebra to get: */
|
||||
|
||||
/* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
|
||||
/* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
|
||||
|
||||
/* or */
|
||||
|
||||
/* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
|
||||
/* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
|
||||
|
||||
z10 = _mm_slli_epi16(z10, 2);
|
||||
z12 = _mm_slli_epi16(z12, 2);
|
||||
|
||||
tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
|
||||
_mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
|
||||
tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
|
||||
_mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
|
||||
|
||||
/* phase 2 */
|
||||
|
||||
/* tmp6 = tmp12 - tmp7; */
|
||||
/* tmp5 = tmp11 - tmp6; */
|
||||
/* tmp4 = tmp10 + tmp5; */
|
||||
|
||||
tmp6 = _mm_sub_epi16(tmp12, tmp7);
|
||||
tmp5 = _mm_sub_epi16(tmp11, tmp6);
|
||||
tmp4 = _mm_add_epi16(tmp10, tmp5);
|
||||
|
||||
/* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row0 = _mm_add_epi16(tmp0, tmp7);
|
||||
row7 = _mm_sub_epi16(tmp0, tmp7);
|
||||
|
||||
/* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row1 = _mm_add_epi16(tmp1, tmp6);
|
||||
row6 = _mm_sub_epi16(tmp1, tmp6);
|
||||
|
||||
/* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row2 = _mm_add_epi16(tmp2, tmp5);
|
||||
row5 = _mm_sub_epi16(tmp2, tmp5);
|
||||
|
||||
/* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row4 = _mm_add_epi16(tmp3, tmp4);
|
||||
row3 = _mm_sub_epi16(tmp3, tmp4);
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Transpose Matrix (Word) */
|
||||
/* ====================================================================== */
|
||||
|
||||
/* Save off the low quadwords from the even rows */
|
||||
|
||||
xps0 = _mm_move_epi64(row0);
|
||||
xps1 = _mm_move_epi64(row2);
|
||||
xps2 = _mm_move_epi64(row4);
|
||||
xps3 = _mm_move_epi64(row6);
|
||||
|
||||
/* Interleave the high quadwords */
|
||||
|
||||
row0 = _mm_unpackhi_epi16(row0, row1); /* 04 14 05 15 06 16 07 17 */
|
||||
row2 = _mm_unpackhi_epi16(row2, row3); /* 24 34 25 35 26 36 27 37 */
|
||||
row4 = _mm_unpackhi_epi16(row4, row5); /* 44 54 45 55 46 56 47 57 */
|
||||
row6 = _mm_unpackhi_epi16(row6, row7); /* 64 74 65 75 66 76 67 77 */
|
||||
|
||||
rowa = row0;
|
||||
row0 = _mm_unpacklo_epi32(row0, row2); /* 04 14 24 34 05 15 25 35 */
|
||||
rowa = _mm_unpackhi_epi32(rowa, row2); /* 06 16 26 36 07 17 27 37 */
|
||||
|
||||
row2 = row4;
|
||||
row4 = _mm_unpacklo_epi32(row4, row6); /* 44 54 64 74 45 55 65 75 */
|
||||
row2 = _mm_unpackhi_epi32(row2, row6); /* 46 56 66 76 47 57 67 77 */
|
||||
|
||||
row6 = row0;
|
||||
row0 = _mm_unpacklo_epi64(row0, row4); /* 04 14 24 34 44 54 64 74 */
|
||||
row6 = _mm_unpackhi_epi64(row6, row4); /* 05 15 25 35 45 55 65 75 */
|
||||
|
||||
row4 = rowa;
|
||||
rowa = _mm_unpacklo_epi64(rowa, row2); /* 06 16 26 36 46 56 66 76 */
|
||||
row4 = _mm_unpackhi_epi64(row4, row2); /* 07 17 27 37 47 57 67 77 */
|
||||
|
||||
/* row0, row6, rowa, row4 now contain *4, *5, *6, *7 */
|
||||
|
||||
/* Interleave the low quadwords */
|
||||
|
||||
xps0 = _mm_unpacklo_epi16(xps0, row1); /* 00 10 01 11 02 12 03 13 */
|
||||
xps1 = _mm_unpacklo_epi16(xps1, row3); /* 20 30 21 31 22 32 23 33 */
|
||||
xps2 = _mm_unpacklo_epi16(xps2, row5); /* 40 50 41 51 42 52 43 43 */
|
||||
xps3 = _mm_unpacklo_epi16(xps3, row7); /* 60 70 61 71 62 72 63 73 */
|
||||
|
||||
row2 = xps0;
|
||||
xps0 = _mm_unpacklo_epi32(xps0, xps1); /* 00 10 20 30 01 11 21 31 */
|
||||
row2 = _mm_unpackhi_epi32(row2, xps1); /* 02 12 22 32 03 13 23 33 */
|
||||
|
||||
xps1 = xps2;
|
||||
xps2 = _mm_unpacklo_epi32(xps2, xps3); /* 40 50 60 70 41 51 61 71 */
|
||||
xps1 = _mm_unpackhi_epi32(xps1, xps3); /* 42 52 62 72 43 53 63 73 */
|
||||
|
||||
xps3 = xps0;
|
||||
xps0 = _mm_unpacklo_epi64(xps0, xps2); /* 00 10 20 30 40 50 60 70 */
|
||||
xps3 = _mm_unpackhi_epi64(xps3, xps2); /* 01 11 21 31 41 51 61 71 */
|
||||
|
||||
xps2 = row2;
|
||||
row2 = _mm_unpacklo_epi64(row2, xps1); /* 02 12 22 32 42 52 62 72 */
|
||||
xps2 = _mm_unpackhi_epi64(xps2, xps1); /* 03 13 23 33 43 53 63 73 */
|
||||
|
||||
/* xps0, xps3, row2, xps2 now contain *0, *1, *2, *3 */
|
||||
|
||||
/* Restructure back to row0 to row7. */
|
||||
|
||||
row7 = row4;
|
||||
row5 = row6;
|
||||
row6 = rowa;
|
||||
row4 = row0;
|
||||
row0 = xps0;
|
||||
row1 = xps3;
|
||||
row3 = xps2;
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Inverse Discrete Cosine Transform Row Processing */
|
||||
/* ====================================================================== */
|
||||
|
||||
/* Even part */
|
||||
|
||||
/* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
|
||||
/* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
|
||||
|
||||
tmp10 = _mm_add_epi16(row0, row4);
|
||||
tmp11 = _mm_sub_epi16(row0, row4);
|
||||
|
||||
/* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
|
||||
/* tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) */
|
||||
/* - tmp13; */
|
||||
|
||||
tmp13 = _mm_add_epi16(row2, row6);
|
||||
tmp12 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(row2, row6), 2),
|
||||
*((__m128i *) SSE2_1_414213562));
|
||||
tmp12 = _mm_sub_epi16(tmp12, tmp13);
|
||||
|
||||
/* tmp0 = tmp10 + tmp13; */
|
||||
/* tmp3 = tmp10 - tmp13; */
|
||||
/* tmp1 = tmp11 + tmp12; */
|
||||
/* tmp2 = tmp11 - tmp12; */
|
||||
|
||||
tmp0 = _mm_add_epi16(tmp10, tmp13);
|
||||
tmp3 = _mm_sub_epi16(tmp10, tmp13);
|
||||
tmp1 = _mm_add_epi16(tmp11, tmp12);
|
||||
tmp2 = _mm_sub_epi16(tmp11, tmp12);
|
||||
|
||||
/* Odd part */
|
||||
|
||||
/* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
|
||||
/* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
|
||||
/* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
|
||||
/* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
|
||||
|
||||
z13 = _mm_add_epi16(row5, row3);
|
||||
z10 = _mm_sub_epi16(row5, row3);
|
||||
z11 = _mm_add_epi16(row1, row7);
|
||||
z12 = _mm_sub_epi16(row1, row7);
|
||||
|
||||
/* phase 5 */
|
||||
|
||||
/* tmp7 = z11 + z13; */
|
||||
/* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); */
|
||||
|
||||
tmp7 = _mm_add_epi16(z11, z13);
|
||||
tmp11 = _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(z11, z13), 2),
|
||||
*((__m128i *) SSE2_1_414213562));
|
||||
|
||||
/* z5 = MULTIPLY(z10 + z12, FIX_1_847759065); */
|
||||
/* tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; */
|
||||
/* tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; */
|
||||
|
||||
/* This is really a mess as the negative 2.61 doesn't work well */
|
||||
/* as we're working with 16-bit integers. So we're going to apply */
|
||||
/* a little algebra to get: */
|
||||
|
||||
/* tmp10 = z12 * (FIX_1_082392200 - FIX_1_847759065) - z10*FIX_1_847759065 */
|
||||
/* tmp12 = z10 * (FIX_1_847759065 - FIX_2_613125930) + z12*FIX_1_847759065 */
|
||||
|
||||
/* or */
|
||||
|
||||
/* tmp10 = z12 * FIX_NEG_0_765366865 - z10 * FIX_1_847759065 */
|
||||
/* tmp12 = z10 * FIX_NEG_0_765366865 + z12 * FIX_1_847759065 */
|
||||
|
||||
z10 = _mm_slli_epi16(z10, 2);
|
||||
z12 = _mm_slli_epi16(z12, 2);
|
||||
|
||||
tmp10 = _mm_sub_epi16(_mm_mulhi_epi16(z12, *((__m128i *)SSE2_NEG_0_765366865)),
|
||||
_mm_mulhi_epi16(z10, *((__m128i *)SSE2_1_847759065)));
|
||||
tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10, *((__m128i *)SSE2_NEG_0_765366865)),
|
||||
_mm_mulhi_epi16(z12, *((__m128i *)SSE2_1_847759065)));
|
||||
|
||||
/* phase 2 */
|
||||
|
||||
/* tmp6 = tmp12 - tmp7; */
|
||||
/* tmp5 = tmp11 - tmp6; */
|
||||
/* tmp4 = tmp10 + tmp5; */
|
||||
|
||||
tmp6 = _mm_sub_epi16(tmp12, tmp7);
|
||||
tmp5 = _mm_sub_epi16(tmp11, tmp6);
|
||||
tmp4 = _mm_add_epi16(tmp10, tmp5);
|
||||
|
||||
/* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row0 = _mm_add_epi16(tmp0, tmp7);
|
||||
row7 = _mm_sub_epi16(tmp0, tmp7);
|
||||
|
||||
/* outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row1 = _mm_add_epi16(tmp1, tmp6);
|
||||
row6 = _mm_sub_epi16(tmp1, tmp6);
|
||||
|
||||
/* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row2 = _mm_add_epi16(tmp2, tmp5);
|
||||
row5 = _mm_sub_epi16(tmp2, tmp5);
|
||||
|
||||
/* outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
/* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
|
||||
/* & RANGE_MASK]; */
|
||||
|
||||
row4 = _mm_add_epi16(tmp3, tmp4);
|
||||
row3 = _mm_sub_epi16(tmp3, tmp4);
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Transpose Matrix (Byte) */
|
||||
/* ====================================================================== */
|
||||
|
||||
/* Shift right after adding half to descale */
|
||||
|
||||
row0 = _mm_add_epi16(row0, *((__m128i *) SSE2_ADD_128));
|
||||
row1 = _mm_add_epi16(row1, *((__m128i *) SSE2_ADD_128));
|
||||
row2 = _mm_add_epi16(row2, *((__m128i *) SSE2_ADD_128));
|
||||
row3 = _mm_add_epi16(row3, *((__m128i *) SSE2_ADD_128));
|
||||
row4 = _mm_add_epi16(row4, *((__m128i *) SSE2_ADD_128));
|
||||
row5 = _mm_add_epi16(row5, *((__m128i *) SSE2_ADD_128));
|
||||
row6 = _mm_add_epi16(row6, *((__m128i *) SSE2_ADD_128));
|
||||
row7 = _mm_add_epi16(row7, *((__m128i *) SSE2_ADD_128));
|
||||
|
||||
row0 = _mm_srai_epi16(row0, 5);
|
||||
row1 = _mm_srai_epi16(row1, 5);
|
||||
row2 = _mm_srai_epi16(row2, 5);
|
||||
row3 = _mm_srai_epi16(row3, 5);
|
||||
row4 = _mm_srai_epi16(row4, 5);
|
||||
row5 = _mm_srai_epi16(row5, 5);
|
||||
row6 = _mm_srai_epi16(row6, 5);
|
||||
row7 = _mm_srai_epi16(row7, 5);
|
||||
|
||||
/* Range Limit and convert Words to Bytes */
|
||||
|
||||
row0 = _mm_packus_epi16(row0, row4);
|
||||
row1 = _mm_packus_epi16(row1, row5);
|
||||
row2 = _mm_packus_epi16(row2, row6);
|
||||
row3 = _mm_packus_epi16(row3, row7);
|
||||
|
||||
/* Transpose from rows back to columns */
|
||||
|
||||
row4 = _mm_unpackhi_epi8(row0, row1); /* 40 50 41 51 42 52 43 43 ... */
|
||||
row5 = _mm_unpackhi_epi8(row2, row3); /* 60 70 61 71 62 72 63 73 ... */
|
||||
row0 = _mm_unpacklo_epi8(row0, row1); /* 00 10 01 11 02 12 03 13 ... */
|
||||
row2 = _mm_unpacklo_epi8(row2, row3); /* 20 30 21 31 22 32 23 33 ... */
|
||||
|
||||
row6 = _mm_unpackhi_epi16(row0, row2); /* 04 14 24 34 05 15 25 35 ... */
|
||||
row7 = _mm_unpackhi_epi16(row4, row5); /* 44 54 64 74 45 55 65 75 ... */
|
||||
row0 = _mm_unpacklo_epi16(row0, row2); /* 00 10 20 30 01 11 21 31 ... */
|
||||
row4 = _mm_unpacklo_epi16(row4, row5); /* 40 50 60 70 41 51 61 71 ... */
|
||||
|
||||
row1 = _mm_unpackhi_epi32(row0, row4); /* 02 12 22 32 42 52 62 72 ... */
|
||||
row2 = _mm_unpackhi_epi32(row6, row7); /* 06 16 26 36 46 56 66 76 ... */
|
||||
row0 = _mm_unpacklo_epi32(row0, row4); /* 00 10 20 30 40 50 60 70 ... */
|
||||
row6 = _mm_unpacklo_epi32(row6, row7); /* 04 14 24 34 55 54 64 74 ... */
|
||||
|
||||
/* Rearrange for readability */
|
||||
|
||||
row3 = row2;
|
||||
row2 = row6;
|
||||
|
||||
/* ====================================================================== */
|
||||
/* Final output */
|
||||
/* ====================================================================== */
|
||||
|
||||
outptr0 = output_buf[0] + output_col;
|
||||
outptr1 = output_buf[1] + output_col;
|
||||
outptr2 = output_buf[2] + output_col;
|
||||
outptr3 = output_buf[3] + output_col;
|
||||
outptr4 = output_buf[4] + output_col;
|
||||
outptr5 = output_buf[5] + output_col;
|
||||
outptr6 = output_buf[6] + output_col;
|
||||
outptr7 = output_buf[7] + output_col;
|
||||
|
||||
/* We can't use the faster FP instructions to write out the high quad */
|
||||
/* so do a rotate right and write them out as the low quad */
|
||||
|
||||
row4 = _mm_srli_si128(row0, 8);
|
||||
row5 = _mm_srli_si128(row1, 8);
|
||||
row6 = _mm_srli_si128(row2, 8);
|
||||
row7 = _mm_srli_si128(row3, 8);
|
||||
|
||||
_mm_storel_epi64((__m128i *) outptr0, row0);
|
||||
_mm_storel_epi64((__m128i *) outptr2, row1);
|
||||
_mm_storel_epi64((__m128i *) outptr4, row2);
|
||||
_mm_storel_epi64((__m128i *) outptr6, row3);
|
||||
_mm_storel_epi64((__m128i *) outptr1, row4);
|
||||
_mm_storel_epi64((__m128i *) outptr3, row5);
|
||||
_mm_storel_epi64((__m128i *) outptr5, row6);
|
||||
_mm_storel_epi64((__m128i *) outptr7, row7);
|
||||
}
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
|
||||
#endif /* DCT_IFAST_SUPPORTED */
|
||||
|
@ -449,9 +449,6 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
|
||||
JBLOCKROW workspace;
|
||||
JDIMENSION rowsperchunk, currow, i;
|
||||
long ltemp;
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
PRUptrdiff workspace_raw;
|
||||
#endif
|
||||
|
||||
/* Calculate max # of rows allowed in one allocation chunk */
|
||||
ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
|
||||
@ -472,16 +469,9 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
|
||||
currow = 0;
|
||||
while (currow < numrows) {
|
||||
rowsperchunk = MIN(rowsperchunk, numrows - currow);
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
workspace_raw = (JBLOCKROW) alloc_large(cinfo, pool_id,
|
||||
(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
|
||||
* SIZEOF(JBLOCK) + 15));
|
||||
workspace = (JBLOCKROW) ((workspace_raw + 15) & ~15);
|
||||
#else
|
||||
workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
|
||||
(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
|
||||
* SIZEOF(JBLOCK)));
|
||||
#endif /* HAVE_SSE2_INTRINSICS */
|
||||
for (i = rowsperchunk; i > 0; i--) {
|
||||
result[currow++] = workspace;
|
||||
workspace += blocksperrow;
|
||||
|
@ -107,30 +107,22 @@ typedef short JCOEF;
|
||||
|
||||
/* Defines for MMX/SSE2 support. */
|
||||
|
||||
/* HAVE_SSE2_INTEL_MNEMONICS means Win32 and MSVC. HAVE_SSE2_INTRINSICS */
|
||||
/* means that the platform supports SSE2 or higher so that we don't */
|
||||
/* have to check runtime CPU capabilities. */
|
||||
|
||||
#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
|
||||
#define HAVE_MMX_INTEL_MNEMONICS
|
||||
|
||||
/* SSE2 code appears broken for some cpus (bug 247437) */
|
||||
#define HAVE_SSE2_INTEL_MNEMONICS
|
||||
#define HAVE_SSE2_INTRINSICS
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
|
||||
#if defined(__GNUC__) && defined(__i386__)
|
||||
#if defined(XP_MACOSX)
|
||||
#define HAVE_SSE2_INTRINSICS
|
||||
#endif /* GNUC && i386 && XP_MACOSX */
|
||||
#endif /* ! XP_MACOSX */
|
||||
#endif /* ! GNUC && i386 */
|
||||
|
||||
/* Add support for other platforms here */
|
||||
|
||||
/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
|
||||
/* We're only aligning Mac OSX structures right now */
|
||||
|
||||
#ifdef HAVE_SSE2_INTRINSICS
|
||||
#define SSE2_ALIGN __attribute__ ((aligned (16)))
|
||||
#else
|
||||
#define SSE2_ALIGN
|
||||
#endif
|
||||
|
||||
|
||||
/* Compressed datastreams are represented as arrays of JOCTET.
|
||||
* These must be EXACTLY 8 bits wide, at least once they are written to
|
||||
@ -328,7 +320,7 @@ typedef unsigned char boolean;
|
||||
/* Capability options common to encoder and decoder: */
|
||||
|
||||
#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */
|
||||
#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
|
||||
#undef DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
|
||||
#undef DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */
|
||||
|
||||
/* Encoder capability options: */
|
||||
|
@ -56,10 +56,6 @@
|
||||
|
||||
#include "gfxPlatform.h"
|
||||
|
||||
#if defined(GFX_SSE2_POSSIBLE)
|
||||
#include "emmintrin.h"
|
||||
#endif /* GFX_SSE2_POSSIBLE */
|
||||
|
||||
extern "C" {
|
||||
#include "iccjpeg.h"
|
||||
|
||||
@ -538,11 +534,7 @@ nsresult nsJPEGDecoder::ProcessData(const char *data, PRUint32 count, PRUint32 *
|
||||
/* FIXME -- Should reset dct_method and dither mode
|
||||
* for final pass of progressive JPEG
|
||||
*/
|
||||
#if defined(__GNUC__) && defined(__i386__) && defined(XP_MACOSX)
|
||||
mInfo.dct_method = JDCT_IFAST;
|
||||
#else
|
||||
mInfo.dct_method = JDCT_ISLOW;
|
||||
#endif /* __GNUC__ & __i386__ & XP_MACOSX */
|
||||
mInfo.dither_mode = JDITHER_FS;
|
||||
mInfo.do_fancy_upsampling = TRUE;
|
||||
mInfo.enable_2pass_quant = FALSE;
|
||||
@ -1268,16 +1260,6 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * sizeof(int)] ={
|
||||
#define RIGHT_SHIFT(x,shft) ((x) >> (shft))
|
||||
#endif
|
||||
|
||||
/* Constants for SSE2 support */
|
||||
|
||||
#ifdef GFX_SSE2_POSSIBLE
|
||||
GFX_SSE2_ALIGN const short color_constants_1[8] = {128, 128, 128, 128, 128, 128, 128, 128};
|
||||
GFX_SSE2_ALIGN const short color_constants_2[8] = {359, 359, 359, 359, 359, 359, 359, 359};
|
||||
GFX_SSE2_ALIGN const short color_constants_3[8] = {88, 88, 88, 88, 88, 88, 88, 88};
|
||||
GFX_SSE2_ALIGN const short color_constants_4[8] = {183, 183, 183, 183, 183, 183, 183, 183};
|
||||
GFX_SSE2_ALIGN const short color_constants_5[8] = {454, 454, 454, 454, 454, 454, 454, 454};
|
||||
GFX_SSE2_ALIGN const short color_constants_6[8] = {11382, 11382, 11382, 11382, 11382, 11382, 11382, 11382};
|
||||
#endif /* GFX_SSE2_POSSIBLE */
|
||||
|
||||
METHODDEF(void)
|
||||
ycc_rgb_convert_argb (j_decompress_ptr cinfo,
|
||||
@ -1287,121 +1269,10 @@ ycc_rgb_convert_argb (j_decompress_ptr cinfo,
|
||||
JDIMENSION num_cols = cinfo->output_width;
|
||||
JSAMPLE * range_limit = cinfo->sample_range_limit;
|
||||
|
||||
/* Locals for SSE2 support */
|
||||
|
||||
#ifdef GFX_SSE2_POSSIBLE
|
||||
JSAMPROW inptr0, inptr1, inptr2, outptr;
|
||||
JDIMENSION col;
|
||||
__m128i row_Y, row_cb, row_cr, zeroes, ones;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
|
||||
__m128i row_B, row_G, row_R;
|
||||
#endif /* GFX_SSE2_POSSIBLE */
|
||||
|
||||
SHIFT_TEMPS
|
||||
|
||||
#ifdef GFX_SSE2_POSSIBLE
|
||||
if (GFX_SSE2_AVAILABLE == 1) {
|
||||
|
||||
/* Load registers with constants if we're going to do at least one quad */
|
||||
|
||||
if (num_cols >= 8) {
|
||||
zeroes = _mm_setzero_si128();
|
||||
ones = _mm_cmpeq_epi8(zeroes, zeroes);
|
||||
}
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr0 = input_buf[0][input_row];
|
||||
inptr1 = input_buf[1][input_row];
|
||||
inptr2 = input_buf[2][input_row];
|
||||
input_row++;
|
||||
outptr = *output_buf++;
|
||||
|
||||
/* YCbCr -> RGB using scaled multiplication */
|
||||
|
||||
for (col = 0; col + 8 <= num_cols; col += 8) {
|
||||
|
||||
/* Read a row of each color component */
|
||||
|
||||
row_Y = _mm_loadl_epi64((__m128i *) (inptr0 + col));
|
||||
row_cb = _mm_loadl_epi64((__m128i *) inptr1);
|
||||
row_cr = _mm_loadl_epi64((__m128i *) inptr2);
|
||||
|
||||
/* Convert bytes to words */
|
||||
|
||||
row_Y = _mm_unpacklo_epi8(row_Y, zeroes);
|
||||
row_cb = _mm_unpacklo_epi8(row_cb, zeroes);
|
||||
row_cr = _mm_unpacklo_epi8(row_cr, zeroes);
|
||||
|
||||
/* Compute B and R */
|
||||
|
||||
tmp0 = _mm_sub_epi16(row_cb, *((__m128i *) color_constants_1)); /* cb - 128 */
|
||||
tmp1 = _mm_sub_epi16(row_cr, *((__m128i *) color_constants_1)); /* cr - 128 */
|
||||
tmp2 = _mm_slli_epi16(tmp0, 8); /* (cb - 128) << 8 */
|
||||
tmp3 = _mm_slli_epi16(tmp1, 8); /* (cr - 128) << 8 */
|
||||
tmp2 = _mm_mulhi_epi16(tmp2, *((__m128i *) color_constants_5)); /* (454 * ((cb - 128) << 8)) >> 16 */
|
||||
tmp3 = _mm_mulhi_epi16(tmp3, *((__m128i *) color_constants_2)); /* (359 * ((cr - 128) << 8)) >> 16 */
|
||||
row_B = _mm_add_epi16(tmp2, row_Y); /* B = y + (454 * ((cb - 128) << 8)) >> 16 */
|
||||
row_R = _mm_add_epi16(tmp3, row_Y); /* R = y + (359 * ((cr - 128) << 8)) >> 16 */
|
||||
|
||||
/* Compute G */
|
||||
|
||||
tmp4 = _mm_mullo_epi16(row_cb, *((__m128i *) color_constants_3)); /* 88 * cb */
|
||||
tmp5 = _mm_mullo_epi16(tmp1, *((__m128i *) color_constants_4)); /* 183 * (cr - 128) */
|
||||
tmp4 = _mm_sub_epi16(tmp4, *((__m128i *) color_constants_6)); /* 88 * cb - (34806 - 128*256) */
|
||||
tmp4 = _mm_add_epi16(tmp4, tmp5); /* (88 * cb) + 183 * (cr - 128) - (34806 - 128*256) */
|
||||
tmp4 = _mm_srai_epi16(tmp4, 8); /* Shift the above right by 8 to divide by 256 */
|
||||
row_G = _mm_sub_epi16(row_Y, tmp4); /* G */
|
||||
|
||||
/* Pack word to byte with unsigned saturation which range limits too */
|
||||
|
||||
row_R = _mm_packus_epi16(row_R, zeroes);
|
||||
row_G = _mm_packus_epi16(row_G, zeroes);
|
||||
row_B = _mm_packus_epi16(row_B, zeroes);
|
||||
|
||||
/* Pack horizontally */
|
||||
|
||||
tmp0 = _mm_unpacklo_epi8(row_B, row_G); /* ABAB ABAB ABAB ABAB */
|
||||
tmp1 = _mm_unpacklo_epi8(row_R, ones); /* GRGR GRGR GRGR GRGR */
|
||||
tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); /* ARGB ARGB ARGB ARGB (low) */
|
||||
tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); /* ARGB ARGB ARGB ARGB (high) */
|
||||
|
||||
/* Write out 32 bytes of ARGB */
|
||||
|
||||
_mm_storel_epi64((__m128i *) outptr, tmp2);
|
||||
_mm_storel_epi64((__m128i *) (outptr + 8), _mm_srli_si128(tmp2, 8));
|
||||
_mm_storel_epi64((__m128i *) (outptr + 16), tmp3);
|
||||
_mm_storel_epi64((__m128i *) (outptr + 24), _mm_srli_si128(tmp3, 8));
|
||||
|
||||
/* Slide the pointers */
|
||||
|
||||
/* inptr0 += 8; */
|
||||
inptr1 += 8;
|
||||
inptr2 += 8;
|
||||
outptr += 32;
|
||||
}
|
||||
|
||||
/* Process remainder using scalar code */
|
||||
for (; col < num_cols; col++) {
|
||||
int y = GETJSAMPLE(inptr0[col]);
|
||||
int cb = GETJSAMPLE(inptr1[0]);
|
||||
int cr = GETJSAMPLE(inptr2[0]);
|
||||
inptr1++;
|
||||
inptr2++;
|
||||
JSAMPLE * range_limit_y = range_limit + y;
|
||||
/* Range-limiting is essential due to noise introduced by DCT losses. */
|
||||
*((PRUint32 *) outptr) = 0xFF000000 |
|
||||
( range_limit_y[Cr_r_tab[cr]] << 16 ) |
|
||||
( range_limit_y[((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr], SCALEBITS))] << 8 ) |
|
||||
( range_limit_y[Cb_b_tab[cb]] );
|
||||
outptr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
} else
|
||||
#endif /* GFX_SSE2_POSSIBLE */
|
||||
|
||||
/* This is used if we don't have SSE2 */
|
||||
{
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
JSAMPROW inptr0 = input_buf[0][input_row];
|
||||
JSAMPROW inptr1 = input_buf[1][input_row];
|
||||
@ -1421,7 +1292,7 @@ ycc_rgb_convert_argb (j_decompress_ptr cinfo,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**************** Inverted CMYK -> RGB conversion **************/
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user