Bug 512865. qcms: Improve SSE2 performance, add SSE support. r=jrmuizel

This patch greatly improves the performance of QCMS transformations on x86 &
x86_64 systems.  Some notes:

0. On 32-bit x86 systems it does runtime selection between non-SIMD, SSE, and
SSE2 code paths.

1. On x86_64 systems the SSE2 code path is always taken.  The non-SIMD and SSE
code paths are left intact, but contemporary versions of the GCC and MSVC
compilers will see that they cannot be reached and optimize them away.

2. The execution time of the SSE2 code path is reduced by 67%, relative to the
original Intel/Microsoft formatted ASM code.  The relative performance was
measured on a Pentium4 (Northwood) 2.4GHz CPU with DDR1 RAM.

3. The SSE code path provides an 80% reduction in execution time, relative to
the non-SIMD code path.  The relative performance was measured on a Pentium3
(Coppermine) 1.26GHz CPU with SDRAM.

4. The code has been split out into separate files so that each can be built
with different cflags (-msse and -msse2) when using gcc.
This commit is contained in:
Steve Snyder 2009-09-24 11:58:24 -04:00
parent d2addc16d5
commit 0ed3ad35a2
5 changed files with 578 additions and 368 deletions

View File

@ -15,6 +15,17 @@ EXPORTS = qcms.h qcmstypes.h
CSRCS = iccread.c transform.c
# Build the SIMD transform sources only when OS_TEST contains "86".
ifeq (86,$(findstring 86,$(OS_TEST)))
CSRCS += transform-sse2.c transform-sse1.c
# Per-file instruction-set flags; they are only set for GNU_CC builds and
# are left empty for every other compiler.
ifdef GNU_CC
SSE1_FLAGS=-msse
SSE2_FLAGS=-msse2
else
SSE1_FLAGS=
SSE2_FLAGS=
endif
endif
FORCE_STATIC_LIB = 1
# This library is used by other shared libs
FORCE_USE_PIC = 1
@ -22,3 +33,15 @@ FORCE_USE_PIC = 1
include $(topsrcdir)/config/rules.mk
CFLAGS += -DMOZ_QCMS
# Special rules for transform-sse*.c to get the right cflags: each object is
# compiled with the normal $(COMPILE_CFLAGS) plus its own $(SSE1_FLAGS) or
# $(SSE2_FLAGS). (taken from pixman/src/Makefile.in)
transform-sse1.$(OBJ_SUFFIX): transform-sse1.c Makefile Makefile.in
$(REPORT_BUILD)
@$(MAKE_DEPS_AUTO_CC)
$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE1_FLAGS) $(_VPATH_SRCS)
transform-sse2.$(OBJ_SUFFIX): transform-sse2.c Makefile Makefile.in
$(REPORT_BUILD)
@$(MAKE_DEPS_AUTO_CC)
$(ELOG) $(CC) $(OUTOPTION)$@ -c $(COMPILE_CFLAGS) $(SSE2_FLAGS) $(_VPATH_SRCS)

View File

@ -141,3 +141,20 @@ static inline s15Fixed16Number double_to_s15Fixed16Number(double v)
void precache_release(struct precache_output *p);
qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
/* SIMD transform entry points. All four share one signature:
 *   transform - the transform whose matrix, input gamma tables and output
 *               tables are applied
 *   src       - source pixels (3 bytes per pixel for rgb, 4 for rgba)
 *   dest      - destination pixels, same layout as src
 *   length    - number of pixels to convert
 * The _sse2 variants use SSE2 intrinsics; the _sse1 variants use SSE1
 * intrinsics only. */
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
unsigned char *src,
unsigned char *dest,
size_t length);

253
gfx/qcms/transform-sse1.c Normal file
View File

@ -0,0 +1,253 @@
#include <xmmintrin.h>
#include "qcmsint.h"
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
/* FLOATSCALE: factor applied to the clamped result before it is converted
 * to an integer index into the output tables. */
#define FLOATSCALE 65536.0f
/* CLAMPMAXVAL: upper clamp, 65535/65536, so the scaled value tops out at 65535. */
#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
/* Both arrays are read with _mm_load_ps; ALIGN is not defined in this file
 * and is presumably a 16-byte alignment attribute - TODO confirm. */
static const ALIGN float floatScaleX4[4] =
{ FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
{ CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
/*
 * Convert packed 8-bit RGB pixels using SSE1 intrinsics.
 *
 * For every pixel the three channel bytes are looked up in the transform's
 * input gamma tables, multiplied by the transform matrix, clamped to
 * [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to integers and used as
 * indices into the transform's output tables.
 *
 * transform - supplies matrix, input_gamma_table_{r,g,b} and
 *             output_table_{r,g,b}; the matrix rows are read with
 *             _mm_load_ps and therefore must be 16-byte aligned
 * src       - length pixels, 3 bytes each (R, G, B)
 * dest      - receives length pixels, 3 bytes each
 * length    - number of pixels; zero is a no-op
 *
 * The float->int conversion goes through MMX (__m64), so _mm_empty() is
 * issued before returning.
 */
void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    /* size_t, not unsigned int: the counter must be able to reach any
     * value of length, otherwise it wraps and the loop never terminates */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];

    /* Carve a 16-byte aligned, 16-byte long scratch slot out of input_back
     * regardless of the original stack alignment (alignment attributes on
     * stack variables are not reliable with the compilers supported here;
     * see gcc bug #16660). It receives the converted table indices. */
    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* One pixel is handled outside of the loop: every loop iteration
     * preloads the NEXT pixel, so the loop may only run while a next
     * pixel exists. */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */
    for (i = 0; i < length; i++) {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output table indices: SSE1 converts only two lanes
         * at a time, so do the low pair, then move the high pair down */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel; same math, but no preload */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];

    /* leave MMX state so later x87 floating point code works */
    _mm_empty();
}
/*
 * Convert packed 8-bit RGBA pixels using SSE1 intrinsics.
 *
 * The R, G and B bytes of every pixel are looked up in the transform's
 * input gamma tables, multiplied by the transform matrix, clamped to
 * [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to integers and used as
 * indices into the transform's output tables. The alpha byte is copied
 * through unchanged.
 *
 * transform - supplies matrix, input_gamma_table_{r,g,b} and
 *             output_table_{r,g,b}; the matrix rows are read with
 *             _mm_load_ps and therefore must be 16-byte aligned
 * src       - length pixels, 4 bytes each (R, G, B, A)
 * dest      - receives length pixels, 4 bytes each
 * length    - number of pixels; zero is a no-op
 *
 * The float->int conversion goes through MMX (__m64), so _mm_empty() is
 * issued before returning.
 */
void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    /* size_t, not unsigned int: the counter must be able to reach any
     * value of length, otherwise it wraps and the loop never terminates */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];

    /* Carve a 16-byte aligned, 16-byte long scratch slot out of input_back
     * regardless of the original stack alignment (alignment attributes on
     * stack variables are not reliable with the compilers supported here;
     * see gcc bug #16660). It receives the converted table indices. */
    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* One pixel is handled outside of the loop: every loop iteration
     * preloads the NEXT pixel, so the loop may only run while a next
     * pixel exists. */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */
    for (i = 0; i < length; i++) {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next
         * (src already points at the next pixel) */
        dest[3] = alpha;
        alpha = src[3];

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output table indices: SSE1 converts only two lanes
         * at a time, so do the low pair, then move the high pair down */
        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
        result = _mm_movehl_ps(result, result);
        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel; same math, but no preload */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
    result = _mm_movehl_ps(result, result);
    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];

    /* leave MMX state so later x87 floating point code works */
    _mm_empty();
}

243
gfx/qcms/transform-sse2.c Normal file
View File

@ -0,0 +1,243 @@
#include <emmintrin.h>
#include "qcmsint.h"
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
/* FLOATSCALE: factor applied to the clamped result before it is converted
 * to an integer index into the output tables. */
#define FLOATSCALE 65536.0f
/* CLAMPMAXVAL: upper clamp, 65535/65536, so the scaled value tops out at 65535. */
#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
/* Both arrays are read with _mm_load_ps; ALIGN is not defined in this file
 * and is presumably a 16-byte alignment attribute - TODO confirm. */
static const ALIGN float floatScaleX4[4] =
{ FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
{ CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
/*
 * Convert packed 8-bit RGB pixels using SSE2 intrinsics.
 *
 * For every pixel the three channel bytes are looked up in the transform's
 * input gamma tables, multiplied by the transform matrix, clamped to
 * [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to integers and used as
 * indices into the transform's output tables.
 *
 * transform - supplies matrix, input_gamma_table_{r,g,b} and
 *             output_table_{r,g,b}; the matrix rows are read with
 *             _mm_load_ps and therefore must be 16-byte aligned
 * src       - length pixels, 3 bytes each (R, G, B)
 * dest      - receives length pixels, 3 bytes each
 * length    - number of pixels; zero is a no-op
 */
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    /* size_t, not unsigned int: on 64-bit targets an unsigned int counter
     * would wrap before reaching a length above UINT_MAX and the loop
     * would never terminate */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];

    /* Carve a 16-byte aligned, 16-byte long scratch slot out of input_back
     * regardless of the original stack alignment (alignment attributes on
     * stack variables are not reliable with the compilers supported here;
     * see gcc bug #16660). It receives the converted table indices. */
    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* One pixel is handled outside of the loop: every loop iteration
     * preloads the NEXT pixel, so the loop may only run while a next
     * pixel exists. */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */
    for (i = 0; i < length; i++) {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output table indices */
        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel; same math, but no preload */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];
}
/*
 * Convert packed 8-bit RGBA pixels using SSE2 intrinsics.
 *
 * The R, G and B bytes of every pixel are looked up in the transform's
 * input gamma tables, multiplied by the transform matrix, clamped to
 * [0, CLAMPMAXVAL], scaled by FLOATSCALE, converted to integers and used as
 * indices into the transform's output tables. The alpha byte is copied
 * through unchanged.
 *
 * transform - supplies matrix, input_gamma_table_{r,g,b} and
 *             output_table_{r,g,b}; the matrix rows are read with
 *             _mm_load_ps and therefore must be 16-byte aligned
 * src       - length pixels, 4 bytes each (R, G, B, A)
 * dest      - receives length pixels, 4 bytes each
 * length    - number of pixels; zero is a no-op
 */
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    /* size_t, not unsigned int: on 64-bit targets an unsigned int counter
     * would wrap before reaching a length above UINT_MAX and the loop
     * would never terminate */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];

    /* Carve a 16-byte aligned, 16-byte long scratch slot out of input_back
     * regardless of the original stack alignment (alignment attributes on
     * stack variables are not reliable with the compilers supported here;
     * see gcc bug #16660). It receives the converted table indices. */
    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* nothing to do for an empty buffer */
    if (!length)
        return;

    /* One pixel is handled outside of the loop: every loop iteration
     * preloads the NEXT pixel, so the loop may only run while a next
     * pixel exists. */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */
    for (i = 0; i < length; i++) {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next
         * (src already points at the next pixel) */
        dest[3] = alpha;
        alpha = src[3];

        /* sum the three products, clamp, scale to table-index range */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output table indices */
        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[0] = otdata_r[output[0]];
        dest[1] = otdata_g[output[1]];
        dest[2] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel; same math, but no preload */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

    dest[0] = otdata_r[output[0]];
    dest[1] = otdata_g[output[1]];
    dest[2] = otdata_b[output[2]];
}

View File

@ -25,9 +25,10 @@
#include <assert.h>
#include "qcmsint.h"
#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
/* for MSVC, GCC, and Intel compilers */
#if defined(_M_IX86) || defined(__i386__) || defined(_M_AMD64) || defined(__x86_64__)
#define X86
#endif
#endif /* _M_IX86 || __i386__ || _M_AMD64 || __x86_64__ */
//XXX: could use a bettername
typedef uint16_t uint16_fract_t;
@ -734,352 +735,6 @@ static void qcms_transform_data_graya_out_precache(qcms_transform *transform, un
}
}
static const ALIGN float floatScale = 65536.0f;
static const ALIGN float * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
static const ALIGN float clampMaxValue = ((float) (65536 - 1)) / 65536.0f;
#ifdef X86
#if 0
#include <emmintrin.h>
/*
 * Intrinsics rendition of the RGB SSE2 transform.
 *
 * For each of the length 3-byte pixels in src: look up each channel in its
 * own input gamma table, multiply by the transform matrix, clamp to
 * [0, clampMaxValue], scale by floatScale, convert to integers and use
 * those as indices into the output tables, writing 3 bytes to dest.
 */
void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
    /* size_t to match length (was int: signed/unsigned comparison) */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t* output = (uint32_t*)input;
    for (i = 0; i < length; i++) {
        const float *clampMax = &clampMaxValue;
        unsigned char device_r = *src++;
        unsigned char device_g = *src++;
        unsigned char device_b = *src++;
        __m128 xmm1 = _mm_load_ps(mat[0]);
        __m128 xmm2 = _mm_load_ps(mat[1]);
        __m128 xmm3 = _mm_load_ps(mat[2]);
        /* each channel uses its own gamma table (green and blue used to
         * be looked up in the red table) */
        __m128 vec_r = _mm_load_ss(&transform->input_gamma_table_r[device_r]);
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        __m128 vec_g = _mm_load_ss(&transform->input_gamma_table_g[device_g]);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        __m128 vec_b = _mm_load_ss(&transform->input_gamma_table_b[device_b]);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
        vec_r = _mm_mul_ps(vec_r, xmm1);
        vec_g = _mm_mul_ps(vec_g, xmm2);
        vec_b = _mm_mul_ps(vec_b, xmm3);
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        /* load the clamp value itself, not the address of the pointer to it */
        __m128 max = _mm_load_ss(clampMax);
        max = _mm_shuffle_ps(max, max, 0);
        __m128 min = _mm_setzero_ps();
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        __m128 scale = _mm_load_ss(&floatScale);
        scale = _mm_shuffle_ps(scale, scale, 0);
        __m128 result = _mm_mul_ps(vec_r, scale);
        __m128i out = _mm_cvtps_epi32(result);
        _mm_store_si128((__m128i*)input, out);
        *dest++ = transform->output_table_r->data[output[0]];
        *dest++ = transform->output_table_g->data[output[1]];
        *dest++ = transform->output_table_b->data[output[2]];
    }
}
#endif
#if defined(_MSC_VER) && defined(_M_AMD64)
#include <emmintrin.h>
#endif
/*
 * Transform length packed RGB pixels (3 bytes each) from src to dest.
 *
 * Per pixel: the three channel bytes are looked up in the transform's input
 * gamma tables and written to an aligned scratch vector; the vector is
 * multiplied by the transform matrix, clamped, scaled and converted to
 * integers in place; the resulting integers index the output tables.
 *
 * The vector math exists in three equivalent forms chosen at compile time:
 * GNU inline asm, MSVC 32-bit inline asm, and intrinsics for MSVC x64.
 * Any other compiler hits the #error.
 *
 * NOTE(review): the GNU asm block only lists the xmm registers as clobbers
 * when __SSE2__ is defined.
 */
static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
unsigned int i;
float (*mat)[4] = transform->matrix;
char input_back[32];
/* Ensure we have a buffer that's 16 byte aligned regardless of the original
* stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
* because they don't work on stack variables. gcc 4.4 does do the right thing
* on x86 but that's too new for us right now. For more info: gcc bug #16660 */
float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
/* share input and output locations to save having to keep the
* locations in separate registers */
uint32_t* output = (uint32_t*)input;
for (i = 0; i < length; i++) {
const float *clampMax = &clampMaxValue;
unsigned char device_r = *src++;
unsigned char device_g = *src++;
unsigned char device_b = *src++;
/* linearized channel values go into the scratch vector; input[3] is not written */
input[0] = transform->input_gamma_table_r[device_r];
input[1] = transform->input_gamma_table_g[device_g];
input[2] = transform->input_gamma_table_b[device_b];
#ifdef __GNUC__
__asm(
"movaps (%0), %%xmm1;\n\t" // Move the first matrix column to xmm1
"movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
"movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
"movaps (%3), %%xmm0;\n\t" // Move the vector to xmm0
// Note - We have to copy and then shuffle because of the weird
// semantics of shufps
//
"movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
"shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
"mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
"movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
"shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
"mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the second matrix column
"movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
"shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
"mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column
"addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
"addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns
"movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
"shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
"minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
"xorps %%xmm6, %%xmm6;\n\t" // get us cleared bit pattern, which is 0.0f
"maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
"movss (%2), %%xmm5;\n\t" // load the floating point scale factor
"shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
"mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
"cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
"movdqa %%xmm1, (%3);\n\t" // store
:
: "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
: "memory"
/* older versions of gcc don't know about these registers so only include them as constraints
if gcc knows about them */
#ifdef __SSE2__
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
mov eax, mat
mov ecx, clampMax
mov edx, floatScaleAddr
mov ebx, input
movaps xmm1, [eax]
movaps xmm2, [eax + 16]
movaps xmm3, [eax + 32]
movaps xmm0, [ebx]
movaps xmm4, xmm0
shufps xmm4, xmm4, 0
mulps xmm1, xmm4
movaps xmm5, xmm0
shufps xmm5, xmm5, 0x55
mulps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm6, xmm6, 0xAA
mulps xmm3, xmm6
addps xmm2, xmm3
addps xmm1, xmm2
movss xmm7, [ecx]
shufps xmm7, xmm7, 0
minps xmm1, xmm7
xorps xmm6, xmm6
maxps xmm1, xmm6
movss xmm5, [edx]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
cvtps2dq xmm1, xmm1
movdqa [ebx], xmm1
}
#elif defined(_MSC_VER) && defined(_M_AMD64)
{
/* same sequence as the asm above, expressed with intrinsics */
__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
xmm1 = _mm_load_ps((__m128*)mat);
xmm2 = _mm_load_ps(((__m128*)mat) + 1);
xmm3 = _mm_load_ps(((__m128*)mat) + 2);
xmm0 = _mm_load_ps((__m128*)input);
xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
xmm7 = _mm_load_ss(clampMax);
xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
xmm1 = _mm_min_ps(xmm1, xmm7);
xmm6 = _mm_xor_ps(xmm6, xmm6);
xmm1 = _mm_max_ps(xmm1, xmm6);
xmm5 = _mm_load_ss(&floatScale);
xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
xmm1 = _mm_mul_ps(xmm1, xmm5);
_mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
}
#else
#error "Unknown platform"
#endif
/* the scratch vector now holds integer indices (read through output) */
*dest++ = transform->output_table_r->data[output[0]];
*dest++ = transform->output_table_g->data[output[1]];
*dest++ = transform->output_table_b->data[output[2]];
}
}
/*
 * Transform length packed RGBA pixels (4 bytes each) from src to dest.
 *
 * Per pixel: the R, G and B bytes are looked up in the transform's input
 * gamma tables and written to an aligned scratch vector; the vector is
 * multiplied by the transform matrix, clamped, scaled and converted to
 * integers in place; the resulting integers index the output tables. The
 * alpha byte is copied to dest unchanged.
 *
 * The vector math exists in three equivalent forms chosen at compile time:
 * GNU inline asm, MSVC 32-bit inline asm, and intrinsics for MSVC x64.
 * Any other compiler hits the #error.
 *
 * NOTE(review): the GNU asm block only lists the xmm registers as clobbers
 * when __SSE2__ is defined.
 */
static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
unsigned int i;
float (*mat)[4] = transform->matrix;
char input_back[32];
/* align input on 16 byte boundary */
float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
/* share input and output locations to save having to keep the
* locations in separate registers */
uint32_t* output = (uint32_t*)input;
for (i = 0; i < length; i++) {
const float *clampMax = &clampMaxValue;
unsigned char device_r = *src++;
unsigned char device_g = *src++;
unsigned char device_b = *src++;
unsigned char alpha = *src++;
/* linearized channel values go into the scratch vector; input[3] is not written */
input[0] = transform->input_gamma_table_r[device_r];
input[1] = transform->input_gamma_table_g[device_g];
input[2] = transform->input_gamma_table_b[device_b];
#ifdef __GNUC__
__asm(
"movaps (%0), %%xmm1;\n\t" // Move the first matrix column to xmm1
"movaps 16(%0), %%xmm2;\n\t" // Move the second matrix column to xmm2
"movaps 32(%0), %%xmm3;\n\t" // move the third matrix column to xmm3
"movaps (%3), %%xmm0;\n\t" // Move the vector to xmm0
// Note - We have to copy and then shuffle because of the weird
// semantics of shufps
//
"movaps %%xmm0, %%xmm4;\n\t" // Copy the vector to xmm4
"shufps $0, %%xmm4, %%xmm4;\n\t" // Shuffle to repeat the first vector element repeated 4 times
"mulps %%xmm4, %%xmm1;\n\t" // Multiply the first vector element by the first matrix column
"movaps %%xmm0, %%xmm5; \n\t" // Copy the vector to xmm5
"shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
"mulps %%xmm5, %%xmm2;\n\t" // Multiply the second vector element by the second matrix column
"movaps %%xmm0, %%xmm6;\n\t" // Copy the vector to xmm6
"shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
"mulps %%xmm6, %%xmm3;\n\t" // Multiply the third vector element by the third matrix column
"addps %%xmm3, %%xmm2;\n\t" // Sum (second + third) columns
"addps %%xmm2, %%xmm1;\n\t" // Sum ((second + third) + first) columns
"movss (%1), %%xmm7;\n\t" // load the floating point representation of 65535/65536
"shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
"minps %%xmm7, %%xmm1;\n\t" // clamp the vector to 1.0 max
"xorps %%xmm6, %%xmm6;\n\t" // get us cleared bit pattern, which is 0.0f
"maxps %%xmm6, %%xmm1;\n\t" // clamp the vector to 0.0 min
"movss (%2), %%xmm5;\n\t" // load the floating point scale factor
"shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
"mulps %%xmm5, %%xmm1;\n\t" // multiply by the scale factor
"cvtps2dq %%xmm1, %%xmm1;\n\t" // convert to integers
"movdqa %%xmm1, (%3);\n\t" // store
:
: "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
: "memory"
/* older versions of gcc don't know about these registers so only include them as constraints
if gcc knows about them */
#ifdef __SSE2__
, "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#elif defined(_MSC_VER) && defined(_M_IX86)
__asm {
mov eax, mat
mov ecx, clampMax
mov edx, floatScaleAddr
mov ebx, input
movaps xmm1, [eax]
movaps xmm2, [eax + 16]
movaps xmm3, [eax + 32]
movaps xmm0, [ebx]
movaps xmm4, xmm0
shufps xmm4, xmm4, 0
mulps xmm1, xmm4
movaps xmm5, xmm0
shufps xmm5, xmm5, 0x55
mulps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm6, xmm6, 0xAA
mulps xmm3, xmm6
addps xmm2, xmm3
addps xmm1, xmm2
movss xmm7, [ecx]
shufps xmm7, xmm7, 0
minps xmm1, xmm7
xorps xmm6, xmm6
maxps xmm1, xmm6
movss xmm5, [edx]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
cvtps2dq xmm1, xmm1
movdqa [ebx], xmm1
}
#elif defined(_MSC_VER) && defined(_M_AMD64)
{
/* same sequence as the asm above, expressed with intrinsics */
__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
xmm1 = _mm_load_ps((__m128*)mat);
xmm2 = _mm_load_ps(((__m128*)mat) + 1);
xmm3 = _mm_load_ps(((__m128*)mat) + 2);
xmm0 = _mm_load_ps((__m128*)input);
xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
xmm7 = _mm_load_ss(clampMax);
xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
xmm1 = _mm_min_ps(xmm1, xmm7);
xmm6 = _mm_xor_ps(xmm6, xmm6);
xmm1 = _mm_max_ps(xmm1, xmm6);
xmm5 = _mm_load_ss(&floatScale);
xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
xmm1 = _mm_mul_ps(xmm1, xmm5);
_mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
}
#else
#error "Unknown platform"
#endif
/* the scratch vector now holds integer indices (read through output) */
*dest++ = transform->output_table_r->data[output[0]];
*dest++ = transform->output_table_g->data[output[1]];
*dest++ = transform->output_table_b->data[output[2]];
/* alpha passes through untouched */
*dest++ = alpha;
}
}
#endif
static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
unsigned int i;
@ -1380,7 +1035,7 @@ qcms_bool compute_precache(struct curveType *trc, uint8_t *output)
return true;
}
#ifdef X86
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
// mozilla/jpeg)
// -------------------------------------------------------------------------
@ -1423,31 +1078,43 @@ static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t
}
#endif
// -------------------------Runtime SSE2 Detection-----------------------------
// -------------------------Runtime SSEx Detection-----------------------------
/* MMX is always supported per
* Gecko v1.9.1 minimum CPU requirements */
#define SSE1_EDX_MASK (1UL << 25)
#define SSE2_EDX_MASK (1UL << 26)
static qcms_bool sse2_available(void)
#define SSE3_ECX_MASK (1UL << 0)
static int sse_version_available(void)
{
#if defined(__x86_64__) || defined(_M_AMD64)
return true;
/* we know at build time that 64-bit CPUs always have SSE2;
* this tells the compiler that non-SSE2 branches will never be
* taken (i.e. OK to optimize away the SSE1 and non-SIMD code) */
return 2;
#elif defined(HAS_CPUID)
static int has_sse2 = -1;
uint32_t a, b, c, d;
uint32_t function = 0x00000001;
static int sse_version = -1;
uint32_t a, b, c, d;
uint32_t function = 0x00000001;
if (has_sse2 == -1) {
has_sse2 = 0;
cpuid(function, &a, &b, &c, &d);
if (d & SSE2_EDX_MASK)
has_sse2 = 1;
else
has_sse2 = 0;
}
if (sse_version == -1) {
sse_version = 0;
cpuid(function, &a, &b, &c, &d);
if (c & SSE3_ECX_MASK)
sse_version = 3;
else if (d & SSE2_EDX_MASK)
sse_version = 2;
else if (d & SSE1_EDX_MASK)
sse_version = 1;
}
return has_sse2;
return sse_version;
#else
return 0;
#endif
return false;
}
#endif
void build_output_lut(struct curveType *trc,
uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
@ -1553,11 +1220,18 @@ qcms_transform* qcms_transform_create(
}
if (precache) {
#ifdef X86
if (sse2_available()) {
if (sse_version_available() >= 2) {
if (in_type == QCMS_DATA_RGB_8)
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse;
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
else
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse;
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
} else
if (sse_version_available() >= 1) {
if (in_type == QCMS_DATA_RGB_8)
transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
else
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
} else
#endif