gecko/gfx/ycbcr/yuv_row_win64.cpp

// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "yuv_row.h"

extern "C" {

// The x64 compiler supports neither MMX nor inline assembly, so this file uses
// SSE2 intrinsics instead.

// Byte-offset views into the kCoefficientsRgbY table. The functions below
// index each view with 8 * sample for an 8-bit sample, i.e. 256 entries of
// 8 bytes (2048 bytes) per view, so the U view starts at byte 2048 and the
// V view at byte 4096.
// NOTE(review): kCoefficientsRgbY is declared outside this file (presumably in
// yuv_row.h); confirm its layout really is Y, then U, then V at these offsets.
#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)

#include <emmintrin.h>

static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128  xmmY;

  while (width >= 2) {
    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                          0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}

static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width,
                                    int source_dx) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128  xmmY;
  uint8 u, v, y;
  int x = 0;

  while (width >= 2) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];
    x += source_dx;

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y = y_buf[x >> 16];
    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                          0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}

static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width,
                                          int source_dx) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128  xmmY;
  uint8 u0, u1, v0, v1, y0, y1;
  uint32 uv_frac, y_frac, u, v, y;
  int x = 0;

  if (source_dx >= 0x20000) {
    x = 32768;
  }

  while(width >= 2) {
    u0 = u_buf[x >> 17];
    u1 = u_buf[(x >> 17) + 1];
    v0 = v_buf[x >> 17];
    v1 = v_buf[(x >> 17) + 1];
    y0 = y_buf[x >> 16];
    y1 = y_buf[(x >> 16) + 1];
    uv_frac = (x & 0x1fffe);
    y_frac = (x & 0xffff);
    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    x += source_dx;

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y0 = y_buf[x >> 16];
    y1 = y_buf[(x >> 16) + 1];
    y_frac = (x & 0xffff);
    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                          0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}

// Externally visible (C linkage) entry point for unscaled row conversion.
// Forwards every argument unchanged to FastConvertYUVToRGB32Row_SSE2, which
// is the only implementation in this file.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
}

// Externally visible (C linkage) entry point for point-sampled scaled row
// conversion. Forwards every argument unchanged to ScaleYUVToRGB32Row_SSE2,
// which is the only implementation in this file.
void ScaleYUVToRGB32Row(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width,
                        int source_dx) {
  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
}

// Externally visible (C linkage) entry point for linearly interpolated scaled
// row conversion. Forwards every argument unchanged to
// LinearScaleYUVToRGB32Row_SSE2, which is the only implementation in this
// file.
void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width,
                              int source_dx) {
  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
                                source_dx);
}

} // extern "C"
Bug 571739 - Use SIMD for conversion from RGB to YUV on Win64. r=kinetik 2011-04-17 17:51:18 -07:00			`// Copyright (c) 2010 The Chromium Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file.`

			`#include "yuv_row.h"`

			`extern "C" {`

			`// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.`

			`#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)`
			`#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)`

			`#include <emmintrin.h>`

			`static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width) {`
			`__m128i xmm0, xmmY1, xmmY2;`
			`__m128 xmmY;`

			`while (width >= 2) {`
			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 *u_buf++)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 *v_buf++)));`

			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * *y_buf++));`
			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`

			`xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * *y_buf++));`
			`xmmY2 = _mm_adds_epi16(xmmY2, xmm0);`

			`xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),`
			`0x44);`
			`xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`

			`_mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);`
			`rgb_buf += 8;`
			`width -= 2;`
			`}`

			`if (width) {`
			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 *u_buf)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 *v_buf)));`
			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * *y_buf));`
			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`
			`xmmY1 = _mm_srai_epi16(xmmY1, 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`
			`reinterpret_cast<uint32>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);`
			`}`
			`}`

			`static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width,`
			`int source_dx) {`
			`__m128i xmm0, xmmY1, xmmY2;`
			`__m128 xmmY;`
			`uint8 u, v, y;`
			`int x = 0;`

			`while (width >= 2) {`
			`u = u_buf[x >> 17];`
			`v = v_buf[x >> 17];`
			`y = y_buf[x >> 16];`
			`x += source_dx;`

			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 u)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 v)));`
			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`
			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`

			`y = y_buf[x >> 16];`
			`x += source_dx;`

			`xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`
			`xmmY2 = _mm_adds_epi16(xmmY2, xmm0);`

			`xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),`
			`0x44);`
			`xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`

			`_mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);`
			`rgb_buf += 8;`
			`width -= 2;`
			`}`

			`if (width) {`
			`u = u_buf[x >> 17];`
			`v = v_buf[x >> 17];`
			`y = y_buf[x >> 16];`

			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 u)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 v)));`
			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`
			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`
			`xmmY1 = _mm_srai_epi16(xmmY1, 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`
			`reinterpret_cast<uint32>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);`
			`}`
			`}`

			`static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width,`
			`int source_dx) {`
			`__m128i xmm0, xmmY1, xmmY2;`
			`__m128 xmmY;`
			`uint8 u0, u1, v0, v1, y0, y1;`
			`uint32 uv_frac, y_frac, u, v, y;`
			`int x = 0;`

			`if (source_dx >= 0x20000) {`
			`x = 32768;`
			`}`

			`while(width >= 2) {`
			`u0 = u_buf[x >> 17];`
			`u1 = u_buf[(x >> 17) + 1];`
			`v0 = v_buf[x >> 17];`
			`v1 = v_buf[(x >> 17) + 1];`
			`y0 = y_buf[x >> 16];`
			`y1 = y_buf[(x >> 16) + 1];`
			`uv_frac = (x & 0x1fffe);`
			`y_frac = (x & 0xffff);`
			`u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;`
			`v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;`
			`y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;`
			`x += source_dx;`

			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 u)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 v)));`
			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`
			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`

			`y0 = y_buf[x >> 16];`
			`y1 = y_buf[(x >> 16) + 1];`
			`y_frac = (x & 0xffff);`
			`y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;`
			`x += source_dx;`

			`xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`
			`xmmY2 = _mm_adds_epi16(xmmY2, xmm0);`

			`xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),`
			`0x44);`
			`xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`

			`_mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);`
			`rgb_buf += 8;`
			`width -= 2;`
			`}`

			`if (width) {`
			`u = u_buf[x >> 17];`
			`v = v_buf[x >> 17];`
			`y = y_buf[x >> 16];`

			`xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbU + 8 u)),`
			`_mm_loadl_epi64(reinterpret_cast<__m128i>(kCoefficientsRgbV + 8 v)));`
			`xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i>(reinterpret_cast<uint8>(kCoefficientsRgbY) + 8 * y));`

			`xmmY1 = _mm_adds_epi16(xmmY1, xmm0);`
			`xmmY1 = _mm_srai_epi16(xmmY1, 6);`
			`xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);`
			`reinterpret_cast<uint32>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);`
			`}`
			`}`

			`void FastConvertYUVToRGB32Row(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width) {`
			`FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);`
			`}`

			`void ScaleYUVToRGB32Row(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width,`
			`int source_dx) {`
			`ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);`
			`}`

			`void LinearScaleYUVToRGB32Row(const uint8* y_buf,`
			`const uint8* u_buf,`
			`const uint8* v_buf,`
			`uint8* rgb_buf,`
			`int width,`
			`int source_dx) {`
			`LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,`
			`source_dx);`
			`}`

			`} // extern "C"`