Backout e89f1fce980d (bug 509052) for an assortment of timeouts and crashes

Ed Morley 2012-11-07 10:47:27 +00:00
parent af422317a5
commit b9ac8afd08
7 changed files with 43 additions and 565 deletions

View File: gfx/2d/Blur.cpp

@@ -12,9 +12,6 @@
#include "mozilla/Constants.h"
#include "mozilla/Util.h"
#include "2D.h"
#include "Tools.h"
using namespace std;
namespace mozilla {
@@ -314,8 +311,8 @@ SpreadVertical(unsigned char* aInput,
}
}
CheckedInt<int32_t>
AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
static CheckedInt<int32_t>
RoundUpToMultipleOf4(int32_t aVal)
{
CheckedInt<int32_t> val(aVal);
@@ -381,9 +378,10 @@ AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
if (stride.isValid()) {
mStride = stride.value();
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height;
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height *
sizeof(unsigned char);
if (size.isValid()) {
mData = new uint8_t[size.value()];
mData = static_cast<unsigned char*>(malloc(size.value()));
memset(mData, 0, size.value());
}
}
@@ -407,7 +405,7 @@ AlphaBoxBlur::AlphaBoxBlur(uint8_t* aData,
AlphaBoxBlur::~AlphaBoxBlur()
{
if (mFreeData) {
delete [] mData;
free(mData);
}
}
@@ -457,236 +455,42 @@ AlphaBoxBlur::Blur()
if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
int32_t stride = GetStride();
IntSize size = GetSize();
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * GetSize().height * sizeof(unsigned char);
unsigned char* tmpData = static_cast<unsigned char*>(malloc(szB));
if (!tmpData)
return; // OOM
memset(tmpData, 0, szB);
if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * size.height;
unsigned char* tmpData = new uint8_t[szB];
memset(tmpData, 0, szB);
SpreadHorizontal(mData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
SpreadVertical(tmpData, mData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);
delete [] tmpData;
}
int32_t horizontalLobes[3][2];
ComputeLobes(mBlurRadius.width, horizontalLobes);
int32_t verticalLobes[3][2];
ComputeLobes(mBlurRadius.height, verticalLobes);
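// (Not shown in this diff: applying three successive box blurs closely
// approximates a Gaussian blur, and ComputeLobes presumably splits the
// blur radius into the left/right extents of those three boxes.)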
// We want to allow for some extra space on the left for alignment reasons.
int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
#ifdef IS_BIG_ENDIAN
const bool cIsBigEndian = true;
#else
const bool cIsBigEndian = false;
#endif
if (cIsBigEndian || (integralImageSize.width * integralImageSize.height) > (1 << 24)) {
// Fall back to the old blurring code when the surface is so large it
// may overflow our integral image!
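// (Each integral-image entry is a 32-bit running sum of 8-bit alpha
// values, so beyond 1 << 24 pixels a maximum alpha of 255 could push the
// total past 2^32; that is presumably the reason for this threshold.)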
// No need to use CheckedInt here - we have validated it in the constructor.
size_t szB = stride * size.height;
unsigned char* tmpData = new uint8_t[szB];
memset(tmpData, 0, szB);
if (mBlurRadius.width > 0) {
BoxBlurHorizontal(mData, tmpData, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(tmpData, mData, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(mData, tmpData, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
} else {
uint8_t *tmp = mData;
mData = tmpData;
tmpData = tmp;
}
if (mBlurRadius.height > 0) {
BoxBlurVertical(tmpData, mData, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(mData, tmpData, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(tmpData, mData, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
} else {
uint8_t *tmp = mData;
mData = tmpData;
tmpData = tmp;
}
delete [] tmpData;
if (mBlurRadius.width > 0) {
int32_t lobes[3][2];
ComputeLobes(mBlurRadius.width, lobes);
BoxBlurHorizontal(mData, tmpData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(tmpData, mData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurHorizontal(mData, tmpData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
} else {
size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);
AlignedArray<uint32_t> integralImage((integralImageStride / 4) * integralImageSize.height);
#ifdef USE_SSE2
if (Factory::HasSSE2()) {
BoxBlur_SSE2(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
verticalLobes[0][1], integralImage, integralImageStride);
BoxBlur_SSE2(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
verticalLobes[1][1], integralImage, integralImageStride);
BoxBlur_SSE2(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
verticalLobes[2][1], integralImage, integralImageStride);
} else
#endif
{
BoxBlur_C(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
verticalLobes[0][1], integralImage, integralImageStride);
BoxBlur_C(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
verticalLobes[1][1], integralImage, integralImageStride);
BoxBlur_C(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
verticalLobes[2][1], integralImage, integralImageStride);
}
memcpy(tmpData, mData, stride * GetSize().height);
}
}
}
MOZ_ALWAYS_INLINE void
GenerateIntegralRow(uint32_t *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
{
uint32_t currentRowSum = 0;
uint32_t pixel = aSource[0];
for (uint32_t x = 0; x < aLeftInflation; x++) {
currentRowSum += pixel;
*aDest++ = currentRowSum + *aPreviousRow++;
}
for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x += 4) {
uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
alphaValues >>= 8;
currentRowSum += alphaValues & 0xff;
*aDest++ = *aPreviousRow++ + currentRowSum;
}
pixel = aSource[aSourceWidth - 1];
for (uint32_t x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
currentRowSum += pixel;
*aDest++ = currentRowSum + *aPreviousRow++;
}
}
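// A worked sketch of the recurrence above: each output entry is the
// running sum of this source row plus the entry directly above it in
// aPreviousRow, so for a source row { 1, 2, 3, 4 } with no inflation and
// a zeroed previous row the integral row becomes { 1, 3, 6, 10 }. The
// inflation loops clamp to the edge pixels aSource[0] and
// aSource[aSourceWidth - 1], and the unrolled middle loop reads whole
// 32-bit words, which assumes the row is padded to a multiple of 4.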
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
int32_t aTopInflation, int32_t aBottomInflation,
uint32_t *aIntegralImage, size_t aIntegralImageStride,
uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
uint32_t stride32bit = aIntegralImageStride / 4;
IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
aSize.height + aTopInflation + aBottomInflation);
memset(aIntegralImage, 0, aIntegralImageStride);
GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
aSize.width, aLeftInflation, aRightInflation);
for (int y = 1; y < aTopInflation + 1; y++) {
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint32_t *intFirstRow = aIntegralImage;
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource, aIntegralImage + (y - 1) * stride32bit,
aSize.width, aLeftInflation, aRightInflation);
}
for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation),
aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation);
}
if (aBottomInflation) {
for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) {
GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride),
aIntegralImage + (y - 1) * stride32bit,
aSize.width, aLeftInflation, aRightInflation);
if (mBlurRadius.height > 0) {
int32_t lobes[3][2];
ComputeLobes(mBlurRadius.height, lobes);
BoxBlurVertical(tmpData, mData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(mData, tmpData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
BoxBlurVertical(tmpData, mData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
} else {
memcpy(mData, tmpData, stride * GetSize().height);
}
}
}
/**
* Attempt to do an in-place box blur using an integral image.
*/
void
AlphaBoxBlur::BoxBlur_C(int32_t aLeftLobe,
int32_t aRightLobe,
int32_t aTopLobe,
int32_t aBottomLobe,
uint32_t *aIntegralImage,
size_t aIntegralImageStride)
{
IntSize size = GetSize();
MOZ_ASSERT(size.width > 0);
// Our 'left' or 'top' lobe will include the current pixel, i.e. when
// looking at an integral image, the value for a pixel at (x, y) is
// calculated from the integral-image values above/below that position.
aLeftLobe++;
aTopLobe++;
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
MOZ_ASSERT(boxSize > 0);
if (boxSize == 1) {
return;
free(tmpData);
}
uint32_t stride32bit = aIntegralImageStride / 4;
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
aIntegralImage, aIntegralImageStride, mData,
mStride, size);
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
// Storing these locally makes this about 30% faster! Presumably the compiler
// can't be sure we're not altering the member variables in this loop.
IntRect skipRect = mSkipRect;
uint8_t *data = mData;
int32_t stride = mStride;
for (int32_t y = 0; y < size.height; y++) {
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
for (int32_t x = 0; x < size.width; x++) {
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
x = skipRect.XMost() - 1;
// Make the test above fail fast for the rest of this scanline; the flag
// is recomputed for the next scanline anyway.
inSkipRectY = false;
continue;
}
int32_t topLeft = topLeftBase[x];
int32_t topRight = topRightBase[x];
int32_t bottomRight = bottomRightBase[x];
int32_t bottomLeft = bottomLeftBase[x];
uint32_t value = bottomRight - topRight - bottomLeft;
value += topLeft;
data[stride * y + x] = (uint64_t(reciprocal) * value) >> 32;
}
}
}
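// A minimal sketch of the two tricks the loop above relies on; the helper
// name is illustrative and not part of the original code.
//
// 1. Summed-area table: if I(x, y) holds the sum of all source pixels
//    above and to the left of (x, y), the sum over any box needs only
//    four lookups: I(r, b) - I(r, t) - I(l, b) + I(l, t).
// 2. Reciprocal division: value / boxSize becomes a multiply and a shift,
//    since reciprocal = 2^32 / boxSize makes
//    (uint64_t(reciprocal) * value) >> 32 a close approximation of
//    value / boxSize for the sums involved.
static inline uint32_t
BoxSumFromIntegral(const uint32_t *aIntegral, size_t aStride32,
                   int32_t aLeft, int32_t aTop,
                   int32_t aRight, int32_t aBottom)
{
  // Four-corner lookup, matching the topLeft/topRight/bottomLeft/
  // bottomRight reads in the loop above.
  return aIntegral[aBottom * aStride32 + aRight]
       - aIntegral[aTop * aStride32 + aRight]
       - aIntegral[aBottom * aStride32 + aLeft]
       + aIntegral[aTop * aStride32 + aLeft];
}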
/**

View File: gfx/2d/Blur.h

@@ -7,7 +7,6 @@
#include "mozilla/gfx/Rect.h"
#include "mozilla/gfx/Point.h"
#include "mozilla/CheckedInt.h"
namespace mozilla {
namespace gfx {
@@ -115,13 +114,6 @@ public:
private:
void BoxBlur_C(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
void BoxBlur_SSE2(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
static CheckedInt<int32_t> RoundUpToMultipleOf4(int32_t aVal);
/**
* A rect indicating the area where blurring is unnecessary, and the blur
* algorithm should skip over it.

View File: gfx/2d/BlurSSE2.cpp

@@ -1,250 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Blur.h"
#include "SSEHelpers.h"
#include <string.h>
namespace mozilla {
namespace gfx {
MOZ_ALWAYS_INLINE
uint32_t DivideAndPack(__m128i aValues, __m128i aDivisor, __m128i aMask)
{
__m128i multiplied = _mm_srli_epi64(_mm_mul_epu32(aValues, aDivisor), 32); // 00p300p1
multiplied = _mm_or_si128(multiplied, _mm_and_si128(_mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor),
aMask)); // p4p3p2p1
__m128i final = _mm_packus_epi16(_mm_packs_epi32(multiplied, _mm_setzero_si128()), _mm_setzero_si128());
return _mm_cvtsi128_si32(final);
}
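// (A scalar view of one lane, purely illustrative: each 32-bit box sum
// "value" is divided by boxSize via the precomputed 2^32 / boxSize
// reciprocal, i.e.
//   uint8_t lane = uint8_t((uint64_t(reciprocal) * value) >> 32);
// the SSE2 version multiplies the even lanes with _mm_mul_epu32, handles
// the odd lanes via the 64-bit shift, merges them with aMask, and packs
// the four results down to bytes.)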
MOZ_ALWAYS_INLINE
void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
int32_t aSourceWidth, int32_t aLeftInflation,
int32_t aRightInflation)
{
int32_t currentRowSum = 0;
for (int x = 0; x < aLeftInflation; x++) {
currentRowSum += aSource[0];
aDest[x] = currentRowSum;
}
for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
currentRowSum += aSource[(x - aLeftInflation)];
aDest[x] = currentRowSum;
}
for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
currentRowSum += aSource[aSourceWidth - 1];
aDest[x] = currentRowSum;
}
}
// This function calculates an integral (a running prefix sum) of the four
// pixels stored in the four 32-bit lanes of aPixels. i.e. for
// { 30, 50, 80, 100 } this returns { 30, 80, 160, 260 }. This seems to be
// the fastest way to do this after much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
__m128i sumPixels = aPixels;
__m128i currentPixels = _mm_slli_si128(aPixels, 4);
sumPixels = _mm_add_epi32(sumPixels, currentPixels);
currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);
return _mm_add_epi32(sumPixels, currentPixels);
}
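// Tracing the { 30, 50, 80, 100 } example above through the two steps,
// lanes written low to high:
//   start:                    { 30,  50,  80, 100 }
//   after the 4-byte shift:   { 30,  80, 130, 180 }  (added { 0, 30, 50, 80 })
//   after the low-64 unpack:  { 30,  80, 160, 260 }  (added { 0,  0, 30, 80 })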
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
int32_t aTopInflation, int32_t aBottomInflation,
uint32_t *aIntegralImage, size_t aIntegralImageStride,
uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
MOZ_ASSERT(!(aLeftInflation & 3));
uint32_t stride32bit = aIntegralImageStride / 4;
IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
aSize.height + aTopInflation + aBottomInflation);
LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
for (int y = 1; y < aTopInflation + 1; y++) {
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint32_t *intFirstRow = aIntegralImage;
for (int x = 0; x < integralImageSize.width; x += 4) {
__m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
__m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
}
}
for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
__m128i currentRowSum = _mm_setzero_si128();
uint32_t *intRow = aIntegralImage + (y * stride32bit);
uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
uint32_t pixel = sourceRow[0];
for (int x = 0; x < aLeftInflation; x += 4) {
__m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
// It's important to shuffle here. When we exit this loop currentRowSum
// has to be set to sumPixels, so that the following loop can get the
// correct pixel for the currentRowSum. The highest order pixel in
// currentRowSum could've originated from accumulation in the stride.
currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
__m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = sumPixels;
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
pixel = sourceRow[aSize.width - 1];
int x = (aSize.width + aLeftInflation);
if ((aSize.width & 3)) {
// Deal with unaligned portion. Get the correct pixel from currentRowSum,
// see explanation above.
uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
for (; x < integralImageSize.width; x++) {
// We could be unaligned here!
if (!(x & 3)) {
// aligned!
currentRowSum = _mm_set1_epi32(intCurrentRowSum);
break;
}
intCurrentRowSum += pixel;
intRow[x] = intPrevRow[x] + intCurrentRowSum;
}
} else {
currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
}
for (; x < integralImageSize.width; x += 4) {
__m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
_mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
}
}
if (aBottomInflation) {
// Store the last valid row of our source image in the last row of
// our integral image. This will be overwritten with the correct values
// in the upcoming loop.
LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
__m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
__m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
__m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
for (int x = 0; x < integralImageSize.width; x += 4) {
_mm_store_si128(intRow + (x / 4),
_mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
_mm_load_si128(intPrevRow + (x / 4))));
}
}
}
}
/**
* Attempt to do an in-place box blur using an integral image.
*/
void
AlphaBoxBlur::BoxBlur_SSE2(int32_t aLeftLobe,
int32_t aRightLobe,
int32_t aTopLobe,
int32_t aBottomLobe,
uint32_t *aIntegralImage,
size_t aIntegralImageStride)
{
IntSize size = GetSize();
MOZ_ASSERT(size.height > 0);
// Our 'left' or 'top' lobe will include the current pixel, i.e. when
// looking at an integral image, the value for a pixel at (x, y) is
// calculated from the integral-image values above/below that position.
aLeftLobe++;
aTopLobe++;
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
MOZ_ASSERT(boxSize > 0);
if (boxSize == 1) {
return;
}
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
uint32_t stride32bit = aIntegralImageStride / 4;
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
aIntegralImage, aIntegralImageStride, mData,
mStride, size);
__m128i divisor = _mm_set1_epi32(reciprocal);
__m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
// This points to the start of the rectangle within the IntegralImage that overlaps
// the surface being blurred.
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
IntRect skipRect = mSkipRect;
int32_t stride = mStride;
uint8_t *data = mData;
for (int32_t y = 0; y < size.height; y++) {
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
for (int32_t x = 0; x < size.width; x += 4) {
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
x = skipRect.XMost() - 4;
// Make the test above fail fast for the rest of this scanline; the flag
// is recomputed for the next scanline anyway.
inSkipRectY = false;
continue;
}
__m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
__m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
__m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
__m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
__m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(bottomRight, topRight), bottomLeft), topLeft);
*(uint32_t*)(data + stride * y + x) = DivideAndPack(values, divisor, mask);
}
}
}
}
}

View File: gfx/2d/ImageScalingSSE2.cpp

@@ -6,7 +6,8 @@
#include "ImageScaling.h"
#include "mozilla/Attributes.h"
#include "SSEHelpers.h"
#include <xmmintrin.h>
#include <emmintrin.h>
/* The functions below use the following system for averaging 4 pixels:
*
@@ -107,6 +108,17 @@ MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
/* Before Nehalem, _mm_loadu_si128 could be very slow; this trick is a
 * little faster. Once enough people are on architectures where
 * _mm_loadu_si128 is fast, we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
// Yes! We use uninitialized memory here, but we'll overwrite it.
__m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
}
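// (Functionally this matches _mm_loadu_si128 on aSource: the
// _mm_loadl_pi/_mm_loadh_pi pair fills the low and high 64 bits with two
// 8-byte loads, which pre-Nehalem cores handled faster than one
// unaligned 16-byte load.)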
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
uint32_t sum = a ^ b ^ c;

View File: gfx/2d/Makefile.in

@@ -116,10 +116,7 @@ endif
ifneq (,$(INTEL_ARCHITECTURE))
# VC2005 doesn't support _mm_castsi128_ps, so SSE2 is turned off
ifneq (1400,$(_MSC_VER))
CPPSRCS += \
ImageScalingSSE2.cpp \
BlurSSE2.cpp \
$(NULL)
CPPSRCS += ImageScalingSSE2.cpp
DEFINES += -DUSE_SSE2
endif
endif
@@ -164,12 +161,10 @@ DEFINES := $(filter-out -DUNICODE -D_UNICODE,$(DEFINES))
ifneq (,$(INTEL_ARCHITECTURE))
ifdef GNU_CC
ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
endif
ifdef SOLARIS_SUNPRO_CXX
ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
endif
endif

View File: gfx/2d/SSEHelpers.h

@@ -1,17 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <xmmintrin.h>
#include <emmintrin.h>
/* Before Nehalem, _mm_loadu_si128 could be very slow; this trick is a
 * little faster. Once enough people are on architectures where
 * _mm_loadu_si128 is fast, we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
// Yes! We use uninitialized memory here, but we'll overwrite it.
__m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
}

View File: gfx/2d/Tools.h

@@ -81,64 +81,6 @@ BytesPerPixel(SurfaceFormat aFormat)
}
}
template<typename T, int alignment = 16>
struct AlignedArray
{
AlignedArray()
: mStorage(nullptr)
, mPtr(nullptr)
{
}
MOZ_ALWAYS_INLINE AlignedArray(size_t aSize)
: mStorage(nullptr)
{
Realloc(aSize);
}
MOZ_ALWAYS_INLINE ~AlignedArray()
{
delete [] mStorage;
}
void Dealloc()
{
delete [] mStorage;
mStorage = mPtr = nullptr;
}
MOZ_ALWAYS_INLINE void Realloc(size_t aSize)
{
delete [] mStorage;
mStorage = new T[aSize + (alignment - 1)];
if (uintptr_t(mStorage) % alignment) {
// Our storage does not start at a <alignment>-byte boundary. Make sure mPtr does!
mPtr = (T*)(uintptr_t(mStorage) +
(alignment - (uintptr_t(mStorage) % alignment)));
} else {
mPtr = mStorage;
}
}
MOZ_ALWAYS_INLINE operator T*()
{
return mPtr;
}
T *mStorage;
T *mPtr;
};
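// (A usage sketch, mirroring how Blur.cpp used this class before the
// backout: the conversion operator hands out the aligned pointer, so the
// array can be passed wherever a raw T* is expected.
//   AlignedArray<uint32_t> integralImage((stride / 4) * height);
//   uint32_t *row = integralImage; // 16-byte aligned
// "stride" and "height" here are illustrative names.)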
template<int alignment>
int32_t GetAlignedStride(int32_t aStride)
{
if (aStride % alignment) {
return aStride + (alignment - (aStride % alignment));
}
return aStride;
}
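// For example, GetAlignedStride<16>(100) returns 112, the next multiple
// of 16, while GetAlignedStride<16>(96) returns 96 unchanged.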
}
}