Backout e89f1fce980d (bug 509052) for an assortment of timeouts and crashes

Ed Morley 2012-11-07 10:47:27 +00:00
parent af422317a5
commit b9ac8afd08
7 changed files with 43 additions and 565 deletions

View File: gfx/2d/Blur.cpp

@@ -12,9 +12,6 @@
#include "mozilla/Constants.h"
#include "mozilla/Util.h"
#include "2D.h"
#include "Tools.h"
using namespace std;
namespace mozilla {
@@ -314,8 +311,8 @@ SpreadVertical(unsigned char* aInput,
}
}
CheckedInt<int32_t>
AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
static CheckedInt<int32_t>
RoundUpToMultipleOf4(int32_t aVal)
{
CheckedInt<int32_t> val(aVal);
@@ -381,9 +378,10 @@ AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
if (stride.isValid()) {
mStride = stride.value();
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height;
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height *
sizeof(unsigned char);
if (size.isValid()) {
mData = new uint8_t[size.value()];
mData = static_cast<unsigned char*>(malloc(size.value()));
memset(mData, 0, size.value());
}
}
@@ -407,7 +405,7 @@ AlphaBoxBlur::AlphaBoxBlur(uint8_t* aData,
AlphaBoxBlur::~AlphaBoxBlur()
{
if (mFreeData) {
delete [] mData;
free(mData);
}
}
@@ -457,236 +455,42 @@ AlphaBoxBlur::Blur()
if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
int32_t stride = GetStride();
IntSize size = GetSize();
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * GetSize().height * sizeof(unsigned char);
unsigned char* tmpData = static_cast<unsigned char*>(malloc(szB));
if (!tmpData)
return; // OOM
memset(tmpData, 0, szB);
if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * size.height;
unsigned char* tmpData = new uint8_t[szB];
memset(tmpData, 0, szB);
SpreadHorizontal(mData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
SpreadVertical(tmpData, mData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);
delete [] tmpData;
}
int32_t horizontalLobes[3][2];
ComputeLobes(mBlurRadius.width, horizontalLobes);
int32_t verticalLobes[3][2];
ComputeLobes(mBlurRadius.height, verticalLobes);
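// (Not shown in this diff: applying three successive box blurs closely
// approximates a Gaussian blur, and ComputeLobes presumably splits the
// blur radius into the left/right extents of those three boxes.)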
// We want to allow for some extra space on the left for alignment reasons.
int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
#ifdef IS_BIG_ENDIAN
const bool cIsBigEndian = true;
#else
const bool cIsBigEndian = false;
#endif
if (cIsBigEndian || (integralImageSize.width * integralImageSize.height) > (1 << 24)) {
// Fall back to the old blurring code when the surface is so large it
// may overflow our integral image!
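// (Each integral-image entry is a 32-bit running sum of 8-bit alpha
// values, so beyond 1 << 24 pixels a maximum alpha of 255 could push the
// total past 2^32; that is presumably the reason for this threshold.)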
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * size.height;
unsigned char* tmpData = new uint8_t[szB];
memset(tmpData, 0, szB);
if (mBlurRadius.width > 0) {
BoxBlurHorizontal(mData, tmpData, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(tmpData, mData, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(mData, tmpData, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
} else {
uint8_t *tmp = mData;
mData = tmpData;
tmpData = tmp;
}
if (mBlurRadius.height > 0) {
BoxBlurVertical(tmpData, mData, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(mData, tmpData, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(tmpData, mData, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
} else {
uint8_t *tmp = mData;
mData = tmpData;
tmpData = tmp;
}
delete [] tmpData;
if (mBlurRadius.width > 0) {
int32_t lobes[3][2];
ComputeLobes(mBlurRadius.width, lobes);
BoxBlurHorizontal(mData, tmpData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(tmpData, mData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(mData, tmpData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
} else {
size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);
AlignedArray<uint32_t> integralImage((integralImageStride / 4) * integralImageSize.height);
#ifdef USE_SSE2
if (Factory::HasSSE2()) {
BoxBlur_SSE2(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
verticalLobes[0][1], integralImage, integralImageStride);
BoxBlur_SSE2(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
verticalLobes[1][1], integralImage, integralImageStride);
BoxBlur_SSE2(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
verticalLobes[2][1], integralImage, integralImageStride);
} else
#endif
{
BoxBlur_C(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
verticalLobes[0][1], integralImage, integralImageStride);
BoxBlur_C(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
verticalLobes[1][1], integralImage, integralImageStride);
BoxBlur_C(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
verticalLobes[2][1], integralImage, integralImageStride);
}
memcpy(tmpData, mData, stride * GetSize().height);
}
}
}
MOZ_ALWAYS_INLINE void
GenerateIntegralRow(uint32_t *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
{
uint32_t currentRowSum = 0;
uint32_t pixel = aSource[0];
for (uint32_t x = 0; x < aLeftInflation; x++) {
currentRowSum += pixel;
*aDest++ = currentRowSum + *aPreviousRow++;
}
for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x += 4) {
uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
}
pixel = aSource[aSourceWidth - 1];
for (uint32_t x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
currentRowSum += pixel;
*aDest++ = currentRowSum + *aPreviousRow++;
}
}
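// A worked sketch of the recurrence above: each output entry is the
// running sum of this source row plus the entry directly above it in
// aPreviousRow, so for a source row { 1, 2, 3, 4 } with no inflation and
// a zeroed previous row the integral row becomes { 1, 3, 6, 10 }. The
// inflation loops clamp to the edge pixels aSource[0] and
// aSource[aSourceWidth - 1], and the unrolled middle loop reads whole
// 32-bit words, which assumes the row is padded to a multiple of 4.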
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
int32_t aTopInflation, int32_t aBottomInflation,
uint32_t *aIntegralImage, size_t aIntegralImageStride,
uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
uint32_t stride32bit = aIntegralImageStride / 4;
IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
aSize.height + aTopInflation + aBottomInflation);
memset(aIntegralImage, 0, aIntegralImageStride);
GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
aSize.width, aLeftInflation, aRightInflation);
for (int y = 1; y < aTopInflation + 1; y++) {
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint32_t *intFirstRow = aIntegralImage;
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource, aIntegralImage + (y - 1) * stride32bit,
aSize.width, aLeftInflation, aRightInflation);
}
for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation),
aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation);
}
if (aBottomInflation) {
for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) {
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride),
aIntegralImage + (y - 1) * stride32bit,
aSize.width, aLeftInflation, aRightInflation);
if (mBlurRadius.height > 0) {
int32_t lobes[3][2];
ComputeLobes(mBlurRadius.height, lobes);
BoxBlurVertical(tmpData, mData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(mData, tmpData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(tmpData, mData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
} else {
memcpy(mData, tmpData, stride * GetSize().height);
}
}
}
/**
* Attempt to do an in-place box blur using an integral image.
*/
void
AlphaBoxBlur::BoxBlur_C(int32_t aLeftLobe,
int32_t aRightLobe,
int32_t aTopLobe,
int32_t aBottomLobe,
uint32_t *aIntegralImage,
size_t aIntegralImageStride)
{
IntSize size = GetSize();
MOZ_ASSERT(size.width > 0);
// Our 'left' or 'top' lobe will include the current pixel, i.e. when
// looking at an integral image, the value for a pixel at (x, y) is
// calculated from the integral-image values above/below that position.
aLeftLobe++;
aTopLobe++;
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
MOZ_ASSERT(boxSize > 0);
if (boxSize == 1) {
return;
free(tmpData);
}
uint32_t stride32bit = aIntegralImageStride / 4;
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
aIntegralImage, aIntegralImageStride, mData,
mStride, size);
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
// Storing these locally makes this about 30% faster! Presumably the compiler
// can't be sure we're not altering the member variables in this loop.
IntRect skipRect = mSkipRect;
uint8_t *data = mData;
int32_t stride = mStride;
for (int32_t y = 0; y < size.height; y++) {
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
for (int32_t x = 0; x < size.width; x++) {
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
x = skipRect.XMost() - 1;
// Make the test above fail fast for the rest of this scanline; the flag
// is recomputed for the next scanline anyway.
inSkipRectY = false;
continue;
}
int32_t topLeft = topLeftBase[x];
int32_t topRight = topRightBase[x];
int32_t bottomRight = bottomRightBase[x];
int32_t bottomLeft = bottomLeftBase[x];
uint32_t value = bottomRight - topRight - bottomLeft;
value += topLeft;
data[stride * y + x] = (uint64_t(reciprocal) * value) >> 32;
}
}
}
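// A minimal sketch of the two tricks the loop above relies on; the helper
// name is illustrative and not part of the original code.
//
// 1. Summed-area table: if I(x, y) holds the sum of all source pixels
//    above and to the left of (x, y), the sum over any box needs only
//    four lookups: I(r, b) - I(r, t) - I(l, b) + I(l, t).
// 2. Reciprocal division: value / boxSize becomes a multiply and a shift,
//    since reciprocal = 2^32 / boxSize makes
//    (uint64_t(reciprocal) * value) >> 32 a close approximation of
//    value / boxSize for the sums involved.
static inline uint32_t
BoxSumFromIntegral(const uint32_t *aIntegral, size_t aStride32,
                   int32_t aLeft, int32_t aTop,
                   int32_t aRight, int32_t aBottom)
{
  // Four-corner lookup, matching the topLeft/topRight/bottomLeft/
  // bottomRight reads in the loop above.
  return aIntegral[aBottom * aStride32 + aRight]
       - aIntegral[aTop * aStride32 + aRight]
       - aIntegral[aBottom * aStride32 + aLeft]
       + aIntegral[aTop * aStride32 + aLeft];
}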
/**

View File: gfx/2d/Blur.h

@@ -7,7 +7,6 @@
#include "mozilla/gfx/Rect.h"
#include "mozilla/gfx/Point.h"
#include "mozilla/CheckedInt.h"
namespace mozilla {
namespace gfx {
@@ -115,13 +114,6 @@ public:
private:
void BoxBlur_C(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
void BoxBlur_SSE2(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
static CheckedInt<int32_t> RoundUpToMultipleOf4(int32_t aVal);
/**
* A rect indicating the area where blurring is unnecessary, and the blur
* algorithm should skip over it.

View File: gfx/2d/BlurSSE2.cpp

@@ -1,250 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Blur.h"
#include "SSEHelpers.h"
#include <string.h>
namespace mozilla {
namespace gfx {
MOZ_ALWAYS_INLINE
uint32_t DivideAndPack(__m128i aValues, __m128i aDivisor, __m128i aMask)
{
__m128i multiplied = _mm_srli_epi64(_mm_mul_epu32(aValues, aDivisor), 32); // 00p300p1
multiplied = _mm_or_si128(multiplied, _mm_and_si128(_mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor),
aMask)); // p4p3p2p1
__m128i final = _mm_packus_epi16(_mm_packs_epi32(multiplied, _mm_setzero_si128()), _mm_setzero_si128());
return _mm_cvtsi128_si32(final);
}
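// (A scalar view of one lane, purely illustrative: each 32-bit box sum
// "value" is divided by boxSize via the precomputed 2^32 / boxSize
// reciprocal, i.e.
//   uint8_t lane = uint8_t((uint64_t(reciprocal) * value) >> 32);
// the SSE2 version multiplies the even lanes with _mm_mul_epu32, handles
// the odd lanes via the 64-bit shift, merges them with aMask, and packs
// the four results down to bytes.)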
MOZ_ALWAYS_INLINE
void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
int32_t aSourceWidth, int32_t aLeftInflation,
int32_t aRightInflation)
{
int32_t currentRowSum = 0;
for (int x = 0; x < aLeftInflation; x++) {
currentRowSum += aSource[0];
aDest[x] = currentRowSum;
}
for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
currentRowSum += aSource[(x - aLeftInflation)];
aDest[x] = currentRowSum;
}
for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
currentRowSum += aSource[aSourceWidth - 1];
aDest[x] = currentRowSum;
}
}
// This function calculates an integral (a running prefix sum) of the four
// pixels stored in the four 32-bit lanes of aPixels. i.e. for
// { 30, 50, 80, 100 } this returns { 30, 80, 160, 260 }. This seems to be
// the fastest way to do this after much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
__m128i sumPixels = aPixels;
__m128i currentPixels = _mm_slli_si128(aPixels, 4);
sumPixels = _mm_add_epi32(sumPixels, currentPixels);
currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);
return _mm_add_epi32(sumPixels, currentPixels);
}
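// Tracing the { 30, 50, 80, 100 } example above through the two steps,
// lanes written low to high:
//   start:                    { 30,  50,  80, 100 }
//   after the 4-byte shift:   { 30,  80, 130, 180 }  (added { 0, 30, 50, 80 })
//   after the low-64 unpack:  { 30,  80, 160, 260 }  (added { 0,  0, 30, 80 })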
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
int32_t aTopInflation, int32_t aBottomInflation,
uint32_t *aIntegralImage, size_t aIntegralImageStride,
uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
MOZ_ASSERT(!(aLeftInflation & 3));
uint32_t stride32bit = aIntegralImageStride / 4;
IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
aSize.height + aTopInflation + aBottomInflation);
LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
for (int y = 1; y < aTopInflation + 1; y++) {
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint32_t *intFirstRow = aIntegralImage;
for (int x = 0; x < integralImageSize.width; x += 4) {
__m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
__m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
}
}
for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
__m128i currentRowSum = _mm_setzero_si128();
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
uint32_t pixel = sourceRow[0];
for (int x = 0; x < aLeftInflation; x += 4) {
__m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
// It's important to shuffle here. When we exit this loop currentRowSum
// has to be set to sumPixels, so that the following loop can get the
// correct pixel for the currentRowSum. The highest order pixel in
// currentRowSum could've originated from accumulation in the stride.
currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
__m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = sumPixels;
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
pixel = sourceRow[aSize.width - 1];
int x = (aSize.width + aLeftInflation);
if ((aSize.width & 3)) {
// Deal with unaligned portion. Get the correct pixel from currentRowSum,
// see explanation above.
uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
for (; x < integralImageSize.width; x++) {
// We could be unaligned here!
if (!(x & 3)) {
// aligned!
currentRowSum = _mm_set1_epi32(intCurrentRowSum);
break;
}
intCurrentRowSum += pixel;
intRow[x] = intPrevRow[x] + intCurrentRowSum;
}
} else {
currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
}
for (; x < integralImageSize.width; x += 4) {
__m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
}
if (aBottomInflation) {
// Store the last valid row of our source image in the last row of
// our integral image. This will be overwritten with the correct values
// in the upcoming loop.
LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
__m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
__m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
__m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
for (int x = 0; x < integralImageSize.width; x += 4) {
_mm_store_si128(intRow + (x / 4),
_mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
_mm_load_si128(intPrevRow + (x / 4))));
}
}
}
}
/**
* Attempt to do an in-place box blur using an integral image.
*/
void
AlphaBoxBlur::BoxBlur_SSE2(int32_t aLeftLobe,
int32_t aRightLobe,
int32_t aTopLobe,
int32_t aBottomLobe,
uint32_t *aIntegralImage,
size_t aIntegralImageStride)
{
IntSize size = GetSize();
MOZ_ASSERT(size.height > 0);
// Our 'left' or 'top' lobe will include the current pixel, i.e. when
// looking at an integral image, the value for a pixel at (x, y) is
// calculated from the integral-image values above/below that position.
aLeftLobe++;
aTopLobe++;
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
MOZ_ASSERT(boxSize > 0);
if (boxSize == 1) {
return;
}
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
uint32_t stride32bit = aIntegralImageStride / 4;
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
aIntegralImage, aIntegralImageStride, mData,
mStride, size);
__m128i divisor = _mm_set1_epi32(reciprocal);
__m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
// This points to the start of the rectangle within the IntegralImage that overlaps
// the surface being blurred.
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
IntRect skipRect = mSkipRect;
int32_t stride = mStride;
uint8_t *data = mData;
for (int32_t y = 0; y < size.height; y++) {
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
for (int32_t x = 0; x < size.width; x += 4) {
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
x = skipRect.XMost() - 4;
// Make the test above fail fast for the rest of this scanline; the flag
// is recomputed for the next scanline anyway.
inSkipRectY = false;
continue;
}
__m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
__m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
__m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
__m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
__m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(bottomRight, topRight), bottomLeft), topLeft);
*(uint32_t*)(data + stride * y + x) = DivideAndPack(values, divisor, mask);
}
}
}
}
}

View File: gfx/2d/ImageScalingSSE2.cpp

@@ -6,7 +6,8 @@
#include "ImageScaling.h"
#include "mozilla/Attributes.h"
#include "SSEHelpers.h"
#include <xmmintrin.h>
#include <emmintrin.h>
/* The functions below use the following system for averaging 4 pixels:
*
@@ -107,6 +108,17 @@ MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
/* Before Nehalem, _mm_loadu_si128 could be very slow; this trick is a
 * little faster. Once enough people are on architectures where
 * _mm_loadu_si128 is fast, we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
// Yes! We use uninitialized memory here, but we'll overwrite it.
__m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
}
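// (Functionally this matches _mm_loadu_si128 on aSource: the
// _mm_loadl_pi/_mm_loadh_pi pair fills the low and high 64 bits with two
// 8-byte loads, which pre-Nehalem cores handled faster than one
// unaligned 16-byte load.)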
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
uint32_t sum = a ^ b ^ c;

View File: gfx/2d/Makefile.in

@@ -116,10 +116,7 @@ endif
ifneq (,$(INTEL_ARCHITECTURE))
# VC2005 doesn't support _mm_castsi128_ps, so SSE2 is turned off
ifneq (1400,$(_MSC_VER))
CPPSRCS += \
ImageScalingSSE2.cpp \
BlurSSE2.cpp \
$(NULL)
CPPSRCS += ImageScalingSSE2.cpp
DEFINES += -DUSE_SSE2
endif
endif
@@ -164,12 +161,10 @@ DEFINES := $(filter-out -DUNICODE -D_UNICODE,$(DEFINES))
ifneq (,$(INTEL_ARCHITECTURE))
ifdef GNU_CC
ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
endif
ifdef SOLARIS_SUNPRO_CXX
ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
endif
endif

View File: gfx/2d/SSEHelpers.h

@@ -1,17 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <xmmintrin.h>
#include <emmintrin.h>
/* Before Nehalem, _mm_loadu_si128 could be very slow; this trick is a
 * little faster. Once enough people are on architectures where
 * _mm_loadu_si128 is fast, we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
// Yes! We use uninitialized memory here, but we'll overwrite it.
__m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
}

View File: gfx/2d/Tools.h

@@ -81,64 +81,6 @@ BytesPerPixel(SurfaceFormat aFormat)
}
}
template<typename T, int alignment = 16>
struct AlignedArray
{
AlignedArray()
: mStorage(nullptr)
, mPtr(nullptr)
{
}
MOZ_ALWAYS_INLINE AlignedArray(size_t aSize)
: mStorage(nullptr)
{
Realloc(aSize);
}
MOZ_ALWAYS_INLINE ~AlignedArray()
{
delete [] mStorage;
}
void Dealloc()
{
delete [] mStorage;
mStorage = mPtr = nullptr;
}
MOZ_ALWAYS_INLINE void Realloc(size_t aSize)
{
delete [] mStorage;
mStorage = new T[aSize + (alignment - 1)];
if (uintptr_t(mStorage) % alignment) {
// Our storage does not start at a <alignment>-byte boundary. Make sure mPtr does!
mPtr = (T*)(uintptr_t(mStorage) +
(alignment - (uintptr_t(mStorage) % alignment)));
} else {
mPtr = mStorage;
}
}
MOZ_ALWAYS_INLINE operator T*()
{
return mPtr;
}
T *mStorage;
T *mPtr;
};
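// (A usage sketch, mirroring how Blur.cpp used this class before the
// backout: the conversion operator hands out the aligned pointer, so the
// array can be passed wherever a raw T* is expected.
//   AlignedArray<uint32_t> integralImage((stride / 4) * height);
//   uint32_t *row = integralImage; // 16-byte aligned
// "stride" and "height" here are illustrative names.)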
template<int alignment>
int32_t GetAlignedStride(int32_t aStride)
{
if (aStride % alignment) {
return aStride + (alignment - (aStride % alignment));
}
return aStride;
}
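// For example, GetAlignedStride<16>(100) returns 112, the next multiple
// of 16, while GetAlignedStride<16>(96) returns 96 unchanged.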
}
}