Bug 924102 - Add filter processing code for many SVG filters. r=Bas

2024-09-13 09:24:08 -07:00 · 2013-11-27 12:22:27 +01:00 · 2013-11-27 12:22:27 +01:00 · 4c57389fc2
commit 4c57389fc2
parent 410c0c2218
7 changed files with 1880 additions and 0 deletions
--- a/gfx/2d/FilterProcessing.cpp
+++ b/gfx/2d/FilterProcessing.cpp
@ -0,0 +1,224 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "FilterProcessing.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Creates a new FORMAT_A8 surface of aSource's size and fills it with the
+// alpha values extracted from aSource, using the SSE2 implementation when it
+// is compiled in and the CPU supports it and the scalar one otherwise.
+// Returns nullptr if the A8 surface could not be created.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ExtractAlpha(DataSourceSurface* aSource)
+{
+  IntSize size = aSource->GetSize();
+  RefPtr<DataSourceSurface> alpha = Factory::CreateDataSourceSurface(size, FORMAT_A8);
+  if (!alpha) {
+    // Surface creation can fail; don't dereference a null surface below.
+    return nullptr;
+  }
+
+  uint8_t* sourceData = aSource->GetData();
+  int32_t sourceStride = aSource->Stride();
+  uint8_t* alphaData = alpha->GetData();
+  int32_t alphaStride = alpha->Stride();
+
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    ExtractAlpha_SSE2(size, sourceData, sourceStride, alphaData, alphaStride);
+    return alpha;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the output is never left unwritten.
+  ExtractAlpha_Scalar(size, sourceData, sourceStride, alphaData, alphaStride);
+  return alpha;
+}
+
+// Converts aSurface to a B8G8R8A8 data surface. Uses the SSE2 implementation
+// when it is compiled in (USE_SSE2) and the CPU supports it, otherwise the
+// scalar implementation.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ConvertToB8G8R8A8(SourceSurface* aSurface)
+{
+  // The whole SSE2 branch lives inside the #ifdef so that builds without
+  // USE_SSE2 neither query the CPU nor compile an empty if-block.
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    return ConvertToB8G8R8A8_SSE2(aSurface);
+  }
+#endif
+  return ConvertToB8G8R8A8_Scalar(aSurface);
+}
+
+// Blends aInput1 and aInput2 with aBlendMode and returns the result as a new
+// surface. Uses the SSE2 implementation when it is compiled in (USE_SSE2) and
+// the CPU supports it, otherwise the scalar implementation.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyBlending(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
+                                BlendMode aBlendMode)
+{
+  // The whole SSE2 branch lives inside the #ifdef so that builds without
+  // USE_SSE2 neither query the CPU nor compile an empty if-block.
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    return ApplyBlending_SSE2(aInput1, aInput2, aBlendMode);
+  }
+#endif
+  return ApplyBlending_Scalar(aInput1, aInput2, aBlendMode);
+}
+
+// Runs the horizontal morphology pass (aOp with radius aRadius) from
+// aSourceData into aDestData over aDestRect. Uses the SSE2 implementation
+// when it is compiled in (USE_SSE2) and the CPU supports it, otherwise the
+// scalar implementation.
+void
+FilterProcessing::ApplyMorphologyHorizontal(uint8_t* aSourceData, int32_t aSourceStride,
+                                            uint8_t* aDestData, int32_t aDestStride,
+                                            const IntRect& aDestRect, int32_t aRadius,
+                                            MorphologyOperator aOp)
+{
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    ApplyMorphologyHorizontal_SSE2(
+      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius, aOp);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the destination is never left unwritten.
+  ApplyMorphologyHorizontal_Scalar(
+    aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius, aOp);
+}
+
+// Runs the vertical morphology pass (aOp with radius aRadius) from
+// aSourceData into aDestData over aDestRect. Uses the SSE2 implementation
+// when it is compiled in (USE_SSE2) and the CPU supports it, otherwise the
+// scalar implementation.
+void
+FilterProcessing::ApplyMorphologyVertical(uint8_t* aSourceData, int32_t aSourceStride,
+                                          uint8_t* aDestData, int32_t aDestStride,
+                                          const IntRect& aDestRect, int32_t aRadius,
+                                          MorphologyOperator aOp)
+{
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    ApplyMorphologyVertical_SSE2(
+      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius, aOp);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the destination is never left unwritten.
+  ApplyMorphologyVertical_Scalar(
+    aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius, aOp);
+}
+
+// Applies the colour matrix aMatrix to aInput and returns the result as a new
+// surface. Uses the SSE2 implementation when it is compiled in (USE_SSE2) and
+// the CPU supports it, otherwise the scalar implementation.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyColorMatrix(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
+{
+  // The whole SSE2 branch lives inside the #ifdef so that builds without
+  // USE_SSE2 neither query the CPU nor compile an empty if-block.
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    return ApplyColorMatrix_SSE2(aInput, aMatrix);
+  }
+#endif
+  return ApplyColorMatrix_Scalar(aInput, aMatrix);
+}
+
+// Composites aSource with aDest using aOperator. Uses the SSE2 implementation
+// when it is compiled in (USE_SSE2) and the CPU supports it, otherwise the
+// scalar implementation.
+void
+FilterProcessing::ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest,
+                                   CompositeOperator aOperator)
+{
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    ApplyComposition_SSE2(aSource, aDest, aOperator);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the composition is never silently skipped.
+  ApplyComposition_Scalar(aSource, aDest, aOperator);
+}
+
+// Splits the four byte components of each pixel of aSource into four newly
+// created FORMAT_A8 surfaces, returned through aChannel0..aChannel3 (byte 0
+// of each pixel goes to aChannel0, byte 1 to aChannel1, and so on).
+// If any of the four surfaces cannot be created, returns without writing
+// pixel data; callers can detect this by checking the out-parameters for null.
+void
+FilterProcessing::SeparateColorChannels(DataSourceSurface* aSource,
+                                        RefPtr<DataSourceSurface>& aChannel0,
+                                        RefPtr<DataSourceSurface>& aChannel1,
+                                        RefPtr<DataSourceSurface>& aChannel2,
+                                        RefPtr<DataSourceSurface>& aChannel3)
+{
+  IntSize size = aSource->GetSize();
+  aChannel0 = Factory::CreateDataSourceSurface(size, FORMAT_A8);
+  aChannel1 = Factory::CreateDataSourceSurface(size, FORMAT_A8);
+  aChannel2 = Factory::CreateDataSourceSurface(size, FORMAT_A8);
+  aChannel3 = Factory::CreateDataSourceSurface(size, FORMAT_A8);
+  if (!aChannel0 || !aChannel1 || !aChannel2 || !aChannel3) {
+    // Surface creation can fail; don't dereference a null surface below.
+    return;
+  }
+
+  uint8_t* sourceData = aSource->GetData();
+  int32_t sourceStride = aSource->Stride();
+  uint8_t* channel0Data = aChannel0->GetData();
+  uint8_t* channel1Data = aChannel1->GetData();
+  uint8_t* channel2Data = aChannel2->GetData();
+  uint8_t* channel3Data = aChannel3->GetData();
+  // One stride is used for all four channels; they were created with the same
+  // size and format, so presumably their strides match -- TODO confirm.
+  int32_t channelStride = aChannel0->Stride();
+
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    SeparateColorChannels_SSE2(size, sourceData, sourceStride, channel0Data, channel1Data, channel2Data, channel3Data, channelStride);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the channel surfaces are never left unwritten.
+  SeparateColorChannels_Scalar(size, sourceData, sourceStride, channel0Data, channel1Data, channel2Data, channel3Data, channelStride);
+}
+
+// Interleaves four single-channel surfaces into a new FORMAT_B8G8R8A8 surface
+// (aChannel0 supplies byte 0 of each pixel, aChannel1 byte 1, and so on).
+// The size and stride of aChannel0 are used for all four inputs, so the
+// channels are assumed to share the same dimensions and stride -- TODO
+// confirm against callers.
+// Returns nullptr if the result surface could not be created.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::CombineColorChannels(DataSourceSurface* aChannel0, DataSourceSurface* aChannel1,
+                                       DataSourceSurface* aChannel2, DataSourceSurface* aChannel3)
+{
+  IntSize size = aChannel0->GetSize();
+  RefPtr<DataSourceSurface> result =
+    Factory::CreateDataSourceSurface(size, FORMAT_B8G8R8A8);
+  if (!result) {
+    // Surface creation can fail; don't dereference a null surface below.
+    return nullptr;
+  }
+
+  int32_t resultStride = result->Stride();
+  uint8_t* resultData = result->GetData();
+  int32_t channelStride = aChannel0->Stride();
+  uint8_t* channel0Data = aChannel0->GetData();
+  uint8_t* channel1Data = aChannel1->GetData();
+  uint8_t* channel2Data = aChannel2->GetData();
+  uint8_t* channel3Data = aChannel3->GetData();
+
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    CombineColorChannels_SSE2(size, resultStride, resultData, channelStride, channel0Data, channel1Data, channel2Data, channel3Data);
+    return result;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the result is never returned unwritten.
+  CombineColorChannels_Scalar(size, resultStride, resultData, channelStride, channel0Data, channel1Data, channel2Data, channel3Data);
+  return result;
+}
+
+// Runs the premultiplication calculation from aSourceData into aTargetData
+// for an image of aSize. Uses the SSE2 implementation when it is compiled in
+// (USE_SSE2) and the CPU supports it, otherwise the scalar implementation.
+void
+FilterProcessing::DoPremultiplicationCalculation(const IntSize& aSize,
+                                                 uint8_t* aTargetData, int32_t aTargetStride,
+                                                 uint8_t* aSourceData, int32_t aSourceStride)
+{
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    DoPremultiplicationCalculation_SSE2(
+      aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the target is never left unwritten.
+  DoPremultiplicationCalculation_Scalar(
+    aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+}
+
+// Runs the unpremultiplication calculation from aSourceData into aTargetData
+// for an image of aSize. Uses the SSE2 implementation when it is compiled in
+// (USE_SSE2) and the CPU supports it, otherwise the scalar implementation.
+void
+FilterProcessing::DoUnpremultiplicationCalculation(const IntSize& aSize,
+                                                   uint8_t* aTargetData, int32_t aTargetStride,
+                                                   uint8_t* aSourceData, int32_t aSourceStride)
+{
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    DoUnpremultiplicationCalculation_SSE2(
+      aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+    return;
+  }
+#endif
+  // Also reached when the CPU reports SSE2 but this build has no SSE2 code
+  // path, so the target is never left unwritten.
+  DoUnpremultiplicationCalculation_Scalar(
+    aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+}
+
+// Renders turbulence for the given parameters into a new surface. Uses the
+// SSE2 implementation when it is compiled in (USE_SSE2) and the CPU supports
+// it, otherwise the scalar implementation.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::RenderTurbulence(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
+                                   int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
+{
+  // The whole SSE2 branch lives inside the #ifdef so that builds without
+  // USE_SSE2 neither query the CPU nor compile an empty if-block.
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    return RenderTurbulence_SSE2(aSize, aOffset, aBaseFrequency, aSeed, aNumOctaves, aType, aStitch, aTileRect);
+  }
+#endif
+  return RenderTurbulence_Scalar(aSize, aOffset, aBaseFrequency, aSeed, aNumOctaves, aType, aStitch, aTileRect);
+}
+
+// Combines aInput1 and aInput2 arithmetically using the coefficients
+// aK1..aK4 and returns the result as a new surface. Uses the SSE2
+// implementation when it is compiled in (USE_SSE2) and the CPU supports it,
+// otherwise the scalar implementation.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyArithmeticCombine(DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1, Float aK2, Float aK3, Float aK4)
+{
+  // The whole SSE2 branch lives inside the #ifdef so that builds without
+  // USE_SSE2 neither query the CPU nor compile an empty if-block.
+#ifdef USE_SSE2
+  if (Factory::HasSSE2()) {
+    return ApplyArithmeticCombine_SSE2(aInput1, aInput2, aK1, aK2, aK3, aK4);
+  }
+#endif
+  return ApplyArithmeticCombine_Scalar(aInput1, aInput2, aK1, aK2, aK3, aK4);
+}
+
+} // namespace gfx
+} // namespace mozilla
--- a/gfx/2d/FilterProcessing.h
+++ b/gfx/2d/FilterProcessing.h
@ -0,0 +1,142 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef _MOZILLA_GFX_FILTERPROCESSING_H_
+#define _MOZILLA_GFX_FILTERPROCESSING_H_
+
+#include "2D.h"
+#include "Filters.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Byte offset of each component within one 4-byte B8G8R8A8 pixel, for use
+// when indexing raw pixel data (e.g. data[pixelIndex + ..._BYTEOFFSET_A]).
+const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_B = 0;
+const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_G = 1;
+const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_R = 2;
+const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_A = 3;
+
+// Collection of static pixel-processing routines used by the software filter
+// code. Each public operation has a *_Scalar implementation and, when
+// USE_SSE2 is defined, a *_SSE2 implementation with the same parameters.
+class FilterProcessing
+{
+public:
+
+  // Fast division by 255. It has the property that
+  // for all 0 <= v <= 255*255, FastDivideBy255(v) == v/255
+  // (i.e. it is exact on that range; outside it the result is only
+  // approximate). It only uses two adds and two shifts instead of an
+  // integer division (which is expensive on many processors).
+  // A is the (deduced) argument type, B the explicitly requested result type.
+  template<class B, class A>
+  static B FastDivideBy255(A v)
+  {
+    return ((v << 8) + v + 255) >> 16;
+  }
+
+  // Public entry points.
+  static TemporaryRef<DataSourceSurface> ExtractAlpha(DataSourceSurface* aSource);
+  static TemporaryRef<DataSourceSurface> ConvertToB8G8R8A8(SourceSurface* aSurface);
+  static TemporaryRef<DataSourceSurface> ApplyBlending(DataSourceSurface* aInput1, DataSourceSurface* aInput2, BlendMode aBlendMode);
+  static void ApplyMorphologyHorizontal(uint8_t* aSourceData, int32_t aSourceStride,
+                                          uint8_t* aDestData, int32_t aDestStride,
+                                          const IntRect& aDestRect, int32_t aRadius,
+                                          MorphologyOperator aOperator);
+  static void ApplyMorphologyVertical(uint8_t* aSourceData, int32_t aSourceStride,
+                                          uint8_t* aDestData, int32_t aDestStride,
+                                          const IntRect& aDestRect, int32_t aRadius,
+                                          MorphologyOperator aOperator);
+  static TemporaryRef<DataSourceSurface> ApplyColorMatrix(DataSourceSurface* aInput, const Matrix5x4 &aMatrix);
+  static void ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest, CompositeOperator aOperator);
+  // Output surfaces are returned through the aChannel0..aChannel3 references.
+  static void SeparateColorChannels(DataSourceSurface* aSource,
+                                    RefPtr<DataSourceSurface>& aChannel0,
+                                    RefPtr<DataSourceSurface>& aChannel1,
+                                    RefPtr<DataSourceSurface>& aChannel2,
+                                    RefPtr<DataSourceSurface>& aChannel3);
+  static TemporaryRef<DataSourceSurface>
+    CombineColorChannels(DataSourceSurface* aChannel0, DataSourceSurface* aChannel1,
+                         DataSourceSurface* aChannel2, DataSourceSurface* aChannel3);
+  static void DoPremultiplicationCalculation(const IntSize& aSize,
+                                        uint8_t* aTargetData, int32_t aTargetStride,
+                                        uint8_t* aSourceData, int32_t aSourceStride);
+  static void DoUnpremultiplicationCalculation(const IntSize& aSize,
+                                               uint8_t* aTargetData, int32_t aTargetStride,
+                                               uint8_t* aSourceData, int32_t aSourceStride);
+  static TemporaryRef<DataSourceSurface>
+    RenderTurbulence(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
+                     int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect);
+  static TemporaryRef<DataSourceSurface>
+    ApplyArithmeticCombine(DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1, Float aK2, Float aK3, Float aK4);
+
+protected:
+  // Scalar implementations; always declared.
+  static void ExtractAlpha_Scalar(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride);
+  static TemporaryRef<DataSourceSurface> ConvertToB8G8R8A8_Scalar(SourceSurface* aSurface);
+  static TemporaryRef<DataSourceSurface> ApplyBlending_Scalar(DataSourceSurface* aInput1, DataSourceSurface* aInput2, BlendMode aBlendMode);
+  static void ApplyMorphologyHorizontal_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                               uint8_t* aDestData, int32_t aDestStride,
+                                               const IntRect& aDestRect, int32_t aRadius,
+                                               MorphologyOperator aOperator);
+  static void ApplyMorphologyVertical_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                               uint8_t* aDestData, int32_t aDestStride,
+                                               const IntRect& aDestRect, int32_t aRadius,
+                                               MorphologyOperator aOperator);
+  static TemporaryRef<DataSourceSurface> ApplyColorMatrix_Scalar(DataSourceSurface* aInput, const Matrix5x4 &aMatrix);
+  static void ApplyComposition_Scalar(DataSourceSurface* aSource, DataSourceSurface* aDest, CompositeOperator aOperator);
+
+  static void SeparateColorChannels_Scalar(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data, int32_t channelStride);
+  static void CombineColorChannels_Scalar(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data);
+  static void DoPremultiplicationCalculation_Scalar(const IntSize& aSize,
+                                        uint8_t* aTargetData, int32_t aTargetStride,
+                                        uint8_t* aSourceData, int32_t aSourceStride);
+  static void DoUnpremultiplicationCalculation_Scalar(const IntSize& aSize,
+                                               uint8_t* aTargetData, int32_t aTargetStride,
+                                               uint8_t* aSourceData, int32_t aSourceStride);
+  static TemporaryRef<DataSourceSurface>
+    RenderTurbulence_Scalar(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
+                            int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect);
+  static TemporaryRef<DataSourceSurface>
+    ApplyArithmeticCombine_Scalar(DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1, Float aK2, Float aK3, Float aK4);
+
+  // SSE2 implementations; only declared when USE_SSE2 is defined, so every
+  // use of them must be guarded by the same macro.
+#ifdef USE_SSE2
+  static void ExtractAlpha_SSE2(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride);
+  static TemporaryRef<DataSourceSurface> ConvertToB8G8R8A8_SSE2(SourceSurface* aSurface);
+  static TemporaryRef<DataSourceSurface> ApplyBlending_SSE2(DataSourceSurface* aInput1, DataSourceSurface* aInput2, BlendMode aBlendMode);
+  static void ApplyMorphologyHorizontal_SSE2(uint8_t* aSourceData, int32_t aSourceStride,
+                                             uint8_t* aDestData, int32_t aDestStride,
+                                             const IntRect& aDestRect, int32_t aRadius,
+                                             MorphologyOperator aOperator);
+  static void ApplyMorphologyVertical_SSE2(uint8_t* aSourceData, int32_t aSourceStride,
+                                             uint8_t* aDestData, int32_t aDestStride,
+                                             const IntRect& aDestRect, int32_t aRadius,
+                                             MorphologyOperator aOperator);
+  static TemporaryRef<DataSourceSurface> ApplyColorMatrix_SSE2(DataSourceSurface* aInput, const Matrix5x4 &aMatrix);
+  static void ApplyComposition_SSE2(DataSourceSurface* aSource, DataSourceSurface* aDest, CompositeOperator aOperator);
+  static void SeparateColorChannels_SSE2(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data, int32_t channelStride);
+  static void CombineColorChannels_SSE2(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data);
+  static void DoPremultiplicationCalculation_SSE2(const IntSize& aSize,
+                                        uint8_t* aTargetData, int32_t aTargetStride,
+                                        uint8_t* aSourceData, int32_t aSourceStride);
+  static void DoUnpremultiplicationCalculation_SSE2(const IntSize& aSize,
+                                               uint8_t* aTargetData, int32_t aTargetStride,
+                                               uint8_t* aSourceData, int32_t aSourceStride);
+  static TemporaryRef<DataSourceSurface>
+    RenderTurbulence_SSE2(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
+                          int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect);
+  static TemporaryRef<DataSourceSurface>
+    ApplyArithmeticCombine_SSE2(DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1, Float aK2, Float aK3, Float aK4);
+#endif
+};
+
+// Maximum of two unsigned values, written without a conditional branch in
+// the source: the comparison is turned into an all-ones / all-zeros mask.
+static inline unsigned
+umax(unsigned a, unsigned b)
+{
+  // All bits set when b is the larger value, zero otherwise.
+  const unsigned pickB = 0u - static_cast<unsigned>(a < b);
+  // a - (a - b) == b when the mask is set; a - 0 == a otherwise.
+  const unsigned delta = (a - b) & pickB;
+  return a - delta;
+}
+
+// Minimum of two unsigned values, written without a conditional branch in
+// the source: the comparison is turned into an all-ones / all-zeros mask.
+static inline unsigned
+umin(unsigned a, unsigned b)
+{
+  // All bits set when b is the smaller value, zero otherwise.
+  const unsigned pickB = 0u - static_cast<unsigned>(a > b);
+  // a - (a - b) == b when the mask is set; a - 0 == a otherwise.
+  const unsigned delta = (a - b) & pickB;
+  return a - delta;
+}
+
+} // namespace gfx
+} // namespace mozilla
+
+#endif // _MOZILLA_GFX_FILTERPROCESSING_H_
--- a/gfx/2d/FilterProcessingSIMD-inl.h
+++ b/gfx/2d/FilterProcessingSIMD-inl.h
--- a/gfx/2d/FilterProcessingSSE2.cpp
+++ b/gfx/2d/FilterProcessingSSE2.cpp
@ -0,0 +1,112 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#define SIMD_COMPILE_SSE2
+
+#include "FilterProcessingSIMD-inl.h"
+
+// Build-configuration guard: this file defines the FilterProcessing::*_SSE2
+// methods, which FilterProcessing.h only declares when USE_SSE2 is defined.
+// Fail the build with a readable message if the two are out of sync.
+#ifndef USE_SSE2
+static_assert(false, "If this file is built, FilterProcessing.h should know about it!");
+#endif
+
+namespace mozilla {
+namespace gfx {
+
+// SSE2 build of ExtractAlpha: forwards to the shared SIMD template
+// instantiated with __m128i.
+void
+FilterProcessing::ExtractAlpha_SSE2(const IntSize& size,
+                                    uint8_t* sourceData, int32_t sourceStride,
+                                    uint8_t* alphaData, int32_t alphaStride)
+{
+  ExtractAlpha_SIMD<__m128i>(size,
+                             sourceData, sourceStride,
+                             alphaData, alphaStride);
+}
+
+// SSE2 build of ConvertToB8G8R8A8: forwards to the shared SIMD template
+// instantiated with __m128i.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ConvertToB8G8R8A8_SSE2(SourceSurface* aSurface)
+{
+  return ConvertToB8G8R8A8_SIMD<
+    __m128i
+  >(aSurface);
+}
+
+// SSE2 build of ApplyBlending: forwards to the shared SIMD template with all
+// three vector type parameters set to __m128i.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyBlending_SSE2(DataSourceSurface* aInput1,
+                                     DataSourceSurface* aInput2,
+                                     BlendMode aBlendMode)
+{
+  return ApplyBlending_SIMD<__m128i, __m128i, __m128i>(
+    aInput1, aInput2, aBlendMode);
+}
+
+// SSE2 build of ApplyMorphologyHorizontal: forwards to the shared SIMD
+// template with both vector type parameters set to __m128i.
+void
+FilterProcessing::ApplyMorphologyHorizontal_SSE2(uint8_t* aSourceData,
+                                                 int32_t aSourceStride,
+                                                 uint8_t* aDestData,
+                                                 int32_t aDestStride,
+                                                 const IntRect& aDestRect,
+                                                 int32_t aRadius,
+                                                 MorphologyOperator aOp)
+{
+  ApplyMorphologyHorizontal_SIMD<__m128i, __m128i>(aSourceData, aSourceStride,
+                                                   aDestData, aDestStride,
+                                                   aDestRect, aRadius, aOp);
+}
+
+// SSE2 build of ApplyMorphologyVertical: forwards to the shared SIMD
+// template with both vector type parameters set to __m128i.
+void
+FilterProcessing::ApplyMorphologyVertical_SSE2(uint8_t* aSourceData,
+                                               int32_t aSourceStride,
+                                               uint8_t* aDestData,
+                                               int32_t aDestStride,
+                                               const IntRect& aDestRect,
+                                               int32_t aRadius,
+                                               MorphologyOperator aOp)
+{
+  ApplyMorphologyVertical_SIMD<__m128i, __m128i>(aSourceData, aSourceStride,
+                                                 aDestData, aDestStride,
+                                                 aDestRect, aRadius, aOp);
+}
+
+// SSE2 build of ApplyColorMatrix: forwards to the shared SIMD template with
+// all three vector type parameters set to __m128i.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyColorMatrix_SSE2(DataSourceSurface* aInput,
+                                        const Matrix5x4 &aMatrix)
+{
+  return ApplyColorMatrix_SIMD<__m128i, __m128i, __m128i>(
+    aInput, aMatrix);
+}
+
+// SSE2 build of ApplyComposition: forwards to the shared SIMD template with
+// all three vector type parameters set to __m128i.
+void
+FilterProcessing::ApplyComposition_SSE2(DataSourceSurface* aSource,
+                                        DataSourceSurface* aDest,
+                                        CompositeOperator aOperator)
+{
+  // Plain call; nothing is returned from this void function.
+  ApplyComposition_SIMD<__m128i, __m128i, __m128i>(
+    aSource, aDest, aOperator);
+}
+
+// SSE2 build of SeparateColorChannels: forwards to the shared SIMD template
+// instantiated with __m128i.
+void
+FilterProcessing::SeparateColorChannels_SSE2(const IntSize &size,
+                                             uint8_t* sourceData,
+                                             int32_t sourceStride,
+                                             uint8_t* channel0Data,
+                                             uint8_t* channel1Data,
+                                             uint8_t* channel2Data,
+                                             uint8_t* channel3Data,
+                                             int32_t channelStride)
+{
+  SeparateColorChannels_SIMD<__m128i>(size, sourceData, sourceStride,
+                                      channel0Data, channel1Data,
+                                      channel2Data, channel3Data,
+                                      channelStride);
+}
+
+// SSE2 build of CombineColorChannels: forwards to the shared SIMD template
+// instantiated with __m128i.
+void
+FilterProcessing::CombineColorChannels_SSE2(const IntSize &size,
+                                            int32_t resultStride,
+                                            uint8_t* resultData,
+                                            int32_t channelStride,
+                                            uint8_t* channel0Data,
+                                            uint8_t* channel1Data,
+                                            uint8_t* channel2Data,
+                                            uint8_t* channel3Data)
+{
+  CombineColorChannels_SIMD<__m128i>(size, resultStride, resultData,
+                                     channelStride,
+                                     channel0Data, channel1Data,
+                                     channel2Data, channel3Data);
+}
+
+// SSE2 build of DoPremultiplicationCalculation: forwards to the shared SIMD
+// template with all three vector type parameters set to __m128i.
+void
+FilterProcessing::DoPremultiplicationCalculation_SSE2(const IntSize& aSize,
+                                                      uint8_t* aTargetData,
+                                                      int32_t aTargetStride,
+                                                      uint8_t* aSourceData,
+                                                      int32_t aSourceStride)
+{
+  DoPremultiplicationCalculation_SIMD<__m128i, __m128i, __m128i>(
+    aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+}
+
+// SSE2 build of DoUnpremultiplicationCalculation: forwards to the shared SIMD
+// template with both vector type parameters set to __m128i.
+void
+FilterProcessing::DoUnpremultiplicationCalculation_SSE2(const IntSize& aSize,
+                                                        uint8_t* aTargetData,
+                                                        int32_t aTargetStride,
+                                                        uint8_t* aSourceData,
+                                                        int32_t aSourceStride)
+{
+  DoUnpremultiplicationCalculation_SIMD<__m128i, __m128i>(
+    aSize, aTargetData, aTargetStride, aSourceData, aSourceStride);
+}
+
+// SSE2 build of RenderTurbulence: forwards to the shared SIMD template
+// instantiated with <__m128, __m128i, __m128i>.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::RenderTurbulence_SSE2(const IntSize &aSize,
+                                        const Point &aOffset,
+                                        const Size &aBaseFrequency,
+                                        int32_t aSeed,
+                                        int aNumOctaves,
+                                        TurbulenceType aType,
+                                        bool aStitch,
+                                        const Rect &aTileRect)
+{
+  return RenderTurbulence_SIMD<__m128, __m128i, __m128i>(
+    aSize, aOffset, aBaseFrequency, aSeed,
+    aNumOctaves, aType, aStitch, aTileRect);
+}
+
+// SSE2 build of ApplyArithmeticCombine: forwards to the shared SIMD template
+// with all three vector type parameters set to __m128i.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyArithmeticCombine_SSE2(DataSourceSurface* aInput1,
+                                              DataSourceSurface* aInput2,
+                                              Float aK1, Float aK2,
+                                              Float aK3, Float aK4)
+{
+  return ApplyArithmeticCombine_SIMD<__m128i, __m128i, __m128i>(
+    aInput1, aInput2, aK1, aK2, aK3, aK4);
+}
+
+} // namespace gfx
+} // namespace mozilla
--- a/gfx/2d/FilterProcessingScalar.cpp
+++ b/gfx/2d/FilterProcessingScalar.cpp
@ -0,0 +1,321 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#define FILTER_PROCESSING_SCALAR
+
+#include "FilterProcessingSIMD-inl.h"
+
+namespace mozilla {
+namespace gfx {
+
+// Scalar alpha extraction: for every pixel of a size.width x size.height
+// image, copies the byte at B8G8R8A8_COMPONENT_BYTEOFFSET_A of the 4-byte
+// source pixel into the corresponding single byte of alphaData.
+// Strides are the byte distances between consecutive rows.
+void
+FilterProcessing::ExtractAlpha_Scalar(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
+{
+  for (int32_t row = 0; row < size.height; row++) {
+    const uint8_t* sourceRow = sourceData + row * sourceStride;
+    uint8_t* alphaRow = alphaData + row * alphaStride;
+    for (int32_t col = 0; col < size.width; col++) {
+      alphaRow[col] = sourceRow[4 * col + B8G8R8A8_COMPONENT_BYTEOFFSET_A];
+    }
+  }
+}
+
+// Scalar build of ConvertToB8G8R8A8: forwards to the shared SIMD template
+// instantiated with the scalar stand-in type simd::Scalaru8x16_t.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ConvertToB8G8R8A8_Scalar(SourceSurface* aSurface)
+{
+  return ConvertToB8G8R8A8_SIMD<
+    simd::Scalaru8x16_t
+  >(aSurface);
+}
+
+// Scalar blend of two surfaces with the blend mode fixed at compile time.
+// Creates a FORMAT_B8G8R8A8 surface of aInput1's size and, per pixel, writes
+// the blended colour components and the combined alpha. All per-pixel math is
+// done on values scaled by 255 and brought back with FastDivideBy255.
+// Returns nullptr if the result surface cannot be created.
+template<BlendMode aBlendMode>
+static TemporaryRef<DataSourceSurface>
+ApplyBlending_Scalar(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
+{
+  IntSize size = aInput1->GetSize();
+  RefPtr<DataSourceSurface> result =
+    Factory::CreateDataSourceSurface(size, FORMAT_B8G8R8A8);
+  if (!result) {
+    return nullptr;
+  }
+
+  uint8_t* in1Data = aInput1->GetData();
+  uint8_t* in2Data = aInput2->GetData();
+  uint8_t* outData = result->GetData();
+  uint32_t outStride = result->Stride();
+  uint32_t in1Stride = aInput1->Stride();
+  uint32_t in2Stride = aInput2->Stride();
+
+  // The colour components occupy the byte offsets from the smaller to the
+  // larger of the B and R offsets, inclusive.
+  const int32_t firstColor =
+    std::min(B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_R);
+  const int32_t lastColor =
+    std::max(B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_R);
+
+  for (int32_t y = 0; y < size.height; y++) {
+    for (int32_t x = 0; x < size.width; x++) {
+      uint8_t* outPixel = &outData[y * outStride + 4 * x];
+      const uint8_t* in1Pixel = &in1Data[y * in1Stride + 4 * x];
+      const uint8_t* in2Pixel = &in2Data[y * in2Stride + 4 * x];
+
+      const uint32_t qa = in1Pixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A];
+      const uint32_t qb = in2Pixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A];
+
+      for (int32_t c = firstColor; c <= lastColor; c++) {
+        const uint32_t ca = in1Pixel[c];
+        const uint32_t cb = in2Pixel[c];
+        uint32_t blended;
+        switch (aBlendMode) {
+          case BLEND_MODE_MULTIPLY:
+            blended = ((255 - qa) * cb + (255 - qb + cb) * ca);
+            break;
+          case BLEND_MODE_SCREEN:
+            blended = 255 * (cb + ca) - ca * cb;
+            break;
+          case BLEND_MODE_DARKEN:
+            blended = umin((255 - qa) * cb + 255 * ca,
+                           (255 - qb) * ca + 255 * cb);
+            break;
+          case BLEND_MODE_LIGHTEN:
+            blended = umax((255 - qa) * cb + 255 * ca,
+                           (255 - qb) * ca + 255 * cb);
+            break;
+          default:
+            MOZ_CRASH();
+        }
+        // Scale back down and clamp to the 8-bit range.
+        blended = umin(FilterProcessing::FastDivideBy255<unsigned>(blended), 255U);
+        outPixel[c] = static_cast<uint8_t>(blended);
+      }
+
+      // Result alpha: 1 - (1 - qa) * (1 - qb), in units of 1/255.
+      uint32_t outAlpha = 255 * 255 - (255 - qa) * (255 - qb);
+      outPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A] =
+        FilterProcessing::FastDivideBy255<uint8_t>(outAlpha);
+    }
+  }
+
+  return result;
+}
+
+// Maps the runtime blend mode onto the matching compile-time instantiation
+// of gfx::ApplyBlending_Scalar<>. Returns nullptr for any other blend mode.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyBlending_Scalar(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
+                                       BlendMode aBlendMode)
+{
+  if (aBlendMode == BLEND_MODE_MULTIPLY) {
+    return gfx::ApplyBlending_Scalar<BLEND_MODE_MULTIPLY>(aInput1, aInput2);
+  }
+  if (aBlendMode == BLEND_MODE_SCREEN) {
+    return gfx::ApplyBlending_Scalar<BLEND_MODE_SCREEN>(aInput1, aInput2);
+  }
+  if (aBlendMode == BLEND_MODE_DARKEN) {
+    return gfx::ApplyBlending_Scalar<BLEND_MODE_DARKEN>(aInput1, aInput2);
+  }
+  if (aBlendMode == BLEND_MODE_LIGHTEN) {
+    return gfx::ApplyBlending_Scalar<BLEND_MODE_LIGHTEN>(aInput1, aInput2);
+  }
+  // Unsupported blend mode.
+  return nullptr;
+}
+
+// Scalar horizontal morphology pass over 4-byte pixels. For every destination
+// pixel (x, y) in aDestRect, each of the four byte components becomes the
+// minimum (erode) or maximum (dilate) of that component over the source
+// pixels in columns [x - aRadius, x + aRadius] of row y.
+// The window can begin left of aDestRect.x (possibly at a negative offset
+// from aSourceData) and end right of the last destination column; presumably
+// the caller guarantees that this margin is readable -- TODO confirm.
+template<MorphologyOperator Operator>
+static void
+ApplyMorphologyHorizontal_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                 uint8_t* aDestData, int32_t aDestStride,
+                                 const IntRect& aDestRect, int32_t aRadius)
+{
+  static_assert(Operator == MORPHOLOGY_OPERATOR_ERODE ||
+                Operator == MORPHOLOGY_OPERATOR_DILATE,
+                "unexpected morphology operator");
+
+  for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
+    int32_t startX = aDestRect.x - aRadius;
+    int32_t endX = aDestRect.x + aRadius;
+    for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x++, startX++, endX++) {
+      // Apply the signed offset to the pointer and index components with a
+      // signed counter. Adding a size_t component index to a negative int32_t
+      // offset would convert the offset to a huge unsigned value first.
+      const uint8_t* sourcePixel = aSourceData + (y * aSourceStride + 4 * startX);
+      uint8_t u[4];
+      for (int32_t i = 0; i < 4; i++) {
+        u[i] = sourcePixel[i];
+      }
+      sourcePixel += 4;
+      // Fold the remaining pixels of the window into the running min/max.
+      for (int32_t ix = startX + 1; ix <= endX; ix++, sourcePixel += 4) {
+        for (int32_t i = 0; i < 4; i++) {
+          if (Operator == MORPHOLOGY_OPERATOR_ERODE) {
+            u[i] = umin(u[i], sourcePixel[i]);
+          } else {
+            u[i] = umax(u[i], sourcePixel[i]);
+          }
+        }
+      }
+
+      uint8_t* destPixel = aDestData + (y * aDestStride + 4 * x);
+      for (int32_t i = 0; i < 4; i++) {
+        destPixel[i] = u[i];
+      }
+    }
+  }
+}
+
+// Maps the runtime operator onto the matching compile-time instantiation of
+// gfx::ApplyMorphologyHorizontal_Scalar<>. Anything other than
+// MORPHOLOGY_OPERATOR_ERODE is handled as dilate.
+void
+FilterProcessing::ApplyMorphologyHorizontal_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                                   uint8_t* aDestData, int32_t aDestStride,
+                                                   const IntRect& aDestRect, int32_t aRadius,
+                                                   MorphologyOperator aOp)
+{
+  if (aOp != MORPHOLOGY_OPERATOR_ERODE) {
+    gfx::ApplyMorphologyHorizontal_Scalar<MORPHOLOGY_OPERATOR_DILATE>(
+      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
+    return;
+  }
+
+  gfx::ApplyMorphologyHorizontal_Scalar<MORPHOLOGY_OPERATOR_ERODE>(
+    aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
+}
+
+// Scalar vertical morphology pass over 4-byte pixels. For every destination
+// pixel (x, y) in aDestRect, each of the four byte components becomes the
+// minimum (erode) or maximum (dilate) of that component over the source
+// pixels in rows [y - aRadius, y + aRadius] of column x.
+// The window can begin above aDestRect.y (possibly at a negative offset from
+// aSourceData) and end below the last destination row; presumably the caller
+// guarantees that this margin is readable -- TODO confirm.
+template<MorphologyOperator Operator>
+static void ApplyMorphologyVertical_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                           uint8_t* aDestData, int32_t aDestStride,
+                                           const IntRect& aDestRect, int32_t aRadius)
+{
+  static_assert(Operator == MORPHOLOGY_OPERATOR_ERODE ||
+                Operator == MORPHOLOGY_OPERATOR_DILATE,
+                "unexpected morphology operator");
+
+  int32_t startY = aDestRect.y - aRadius;
+  int32_t endY = aDestRect.y + aRadius;
+  for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
+    for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x++) {
+      // Apply the signed offset to the pointer and index components with a
+      // signed counter. Adding a size_t component index to a negative int32_t
+      // offset would convert the offset to a huge unsigned value first.
+      const uint8_t* sourcePixel = aSourceData + (startY * aSourceStride + 4 * x);
+      uint8_t u[4];
+      for (int32_t i = 0; i < 4; i++) {
+        u[i] = sourcePixel[i];
+      }
+      sourcePixel += aSourceStride;
+      // Fold the remaining rows of the window into the running min/max.
+      for (int32_t iy = startY + 1; iy <= endY; iy++, sourcePixel += aSourceStride) {
+        for (int32_t i = 0; i < 4; i++) {
+          if (Operator == MORPHOLOGY_OPERATOR_ERODE) {
+            u[i] = umin(u[i], sourcePixel[i]);
+          } else {
+            u[i] = umax(u[i], sourcePixel[i]);
+          }
+        }
+      }
+
+      uint8_t* destPixel = aDestData + (y * aDestStride + 4 * x);
+      for (int32_t i = 0; i < 4; i++) {
+        destPixel[i] = u[i];
+      }
+    }
+  }
+}
+
+// Maps the runtime operator onto the matching compile-time instantiation of
+// gfx::ApplyMorphologyVertical_Scalar<>. Anything other than
+// MORPHOLOGY_OPERATOR_ERODE is handled as dilate.
+void
+FilterProcessing::ApplyMorphologyVertical_Scalar(uint8_t* aSourceData, int32_t aSourceStride,
+                                                   uint8_t* aDestData, int32_t aDestStride,
+                                                   const IntRect& aDestRect, int32_t aRadius,
+                                                   MorphologyOperator aOp)
+{
+  if (aOp != MORPHOLOGY_OPERATOR_ERODE) {
+    gfx::ApplyMorphologyVertical_Scalar<MORPHOLOGY_OPERATOR_DILATE>(
+      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
+    return;
+  }
+
+  gfx::ApplyMorphologyVertical_Scalar<MORPHOLOGY_OPERATOR_ERODE>(
+    aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
+}
+
+// Scalar build of ApplyColorMatrix: forwards to the shared SIMD template
+// instantiated with the scalar stand-in vector types.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyColorMatrix_Scalar(DataSourceSurface* aInput,
+                                          const Matrix5x4 &aMatrix)
+{
+  return ApplyColorMatrix_SIMD<simd::Scalari32x4_t,
+                               simd::Scalari16x8_t,
+                               simd::Scalaru8x16_t>(aInput, aMatrix);
+}
+
+// Scalar variant of the composition filter: instantiates the shared
+// ApplyComposition_SIMD template with the simd::Scalar* vector types and
+// forwards the arguments unchanged. Nothing is returned.
+void
+FilterProcessing::ApplyComposition_Scalar(DataSourceSurface* aSource, DataSourceSurface* aDest,
+                                          CompositeOperator aOperator)
+{
+  ApplyComposition_SIMD<simd::Scalari32x4_t,
+                        simd::Scalaru16x8_t,
+                        simd::Scalaru8x16_t>(aSource, aDest, aOperator);
+}
+
+// Splits an interleaved 4-bytes-per-pixel image into four single-byte planes:
+// byte i of every source pixel is stored in channel<i>Data. Source rows are
+// sourceStride bytes apart; all four planes use channelStride as row pitch.
+void
+FilterProcessing::SeparateColorChannels_Scalar(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data, int32_t channelStride)
+{
+  for (int32_t row = 0; row < size.height; row++) {
+    // Byte offsets of the start of this row in the source and in each plane.
+    const int32_t sourceRowStart = row * sourceStride;
+    const int32_t planeRowStart = row * channelStride;
+
+    for (int32_t col = 0; col < size.width; col++) {
+      const uint8_t* pixel = &sourceData[sourceRowStart + 4 * col];
+      const int32_t planeIndex = planeRowStart + col;
+
+      channel0Data[planeIndex] = pixel[0];
+      channel1Data[planeIndex] = pixel[1];
+      channel2Data[planeIndex] = pixel[2];
+      channel3Data[planeIndex] = pixel[3];
+    }
+  }
+}
+
+// Interleaves four single-byte planes into one 4-bytes-per-pixel image:
+// channel<i>Data supplies byte i of every result pixel. All four planes use
+// channelStride as row pitch; result rows are resultStride bytes apart.
+void
+FilterProcessing::CombineColorChannels_Scalar(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
+{
+  for (int32_t row = 0; row < size.height; row++) {
+    // Byte offsets of the start of this row in the result and in each plane.
+    const int32_t resultRowStart = row * resultStride;
+    const int32_t planeRowStart = row * channelStride;
+
+    for (int32_t col = 0; col < size.width; col++) {
+      uint8_t* pixel = &resultData[resultRowStart + 4 * col];
+      const int32_t planeIndex = planeRowStart + col;
+
+      pixel[0] = channel0Data[planeIndex];
+      pixel[1] = channel1Data[planeIndex];
+      pixel[2] = channel2Data[planeIndex];
+      pixel[3] = channel3Data[planeIndex];
+    }
+  }
+}
+
+// Premultiplies every pixel of a 4-bytes-per-pixel image by its alpha:
+// each of the R, G and B bytes becomes FastDivideBy255(color * alpha) and the
+// alpha byte is copied through. Channel positions inside a pixel are given by
+// the B8G8R8A8_COMPONENT_BYTEOFFSET_* constants.
+void
+FilterProcessing::DoPremultiplicationCalculation_Scalar(const IntSize& aSize,
+                                     uint8_t* aTargetData, int32_t aTargetStride,
+                                     uint8_t* aSourceData, int32_t aSourceStride)
+{
+  for (int32_t row = 0; row < aSize.height; row++) {
+    for (int32_t col = 0; col < aSize.width; col++) {
+      const uint8_t* sourcePixel = &aSourceData[row * aSourceStride + 4 * col];
+      uint8_t* targetPixel = &aTargetData[row * aTargetStride + 4 * col];
+
+      // Alpha is fetched before any channel is written.
+      const uint8_t alpha = sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A];
+
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_R] =
+        FastDivideBy255<uint8_t>(sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_R] * alpha);
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_G] =
+        FastDivideBy255<uint8_t>(sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_G] * alpha);
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_B] =
+        FastDivideBy255<uint8_t>(sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_B] * alpha);
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A] = alpha;
+    }
+  }
+}
+
+// Undoes alpha premultiplication for every pixel of a 4-bytes-per-pixel image.
+// Each of the R, G and B bytes is scaled by the sAlphaFactors entry for the
+// pixel's alpha using 8.8 fixed-point arithmetic (multiply, add 128, shift
+// right by 8); the alpha byte is copied through.
+// NOTE(review): sAlphaFactors is defined elsewhere; it presumably holds
+// round(255 * 256 / alpha) -- confirm against its definition.
+void
+FilterProcessing::DoUnpremultiplicationCalculation_Scalar(
+                                 const IntSize& aSize,
+                                 uint8_t* aTargetData, int32_t aTargetStride,
+                                 uint8_t* aSourceData, int32_t aSourceStride)
+{
+  for (int32_t row = 0; row < aSize.height; row++) {
+    for (int32_t col = 0; col < aSize.width; col++) {
+      const uint8_t* sourcePixel = &aSourceData[row * aSourceStride + 4 * col];
+      uint8_t* targetPixel = &aTargetData[row * aTargetStride + 4 * col];
+
+      uint8_t alpha = sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A];
+      uint16_t alphaFactor = sAlphaFactors[alpha];
+
+      // For premultiplied input (color <= alpha), color * alphaFactor + 128
+      // stays within uint16_t range. Its largest value, 65520, is reached for
+      // color == alpha == 244: 244 * sAlphaFactors[244] + 128
+      //                        == 244 * 268 + 128 == 65520 (< 65535).
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_R] =
+        (sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_R] * alphaFactor + 128) >> 8;
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_G] =
+        (sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_G] * alphaFactor + 128) >> 8;
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_B] =
+        (sourcePixel[B8G8R8A8_COMPONENT_BYTEOFFSET_B] * alphaFactor + 128) >> 8;
+      targetPixel[B8G8R8A8_COMPONENT_BYTEOFFSET_A] = alpha;
+    }
+  }
+}
+
+// Scalar variant of the turbulence renderer: instantiates the shared
+// RenderTurbulence_SIMD template with the simd::Scalar* vector types and
+// forwards every argument unchanged.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::RenderTurbulence_Scalar(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
+                                          int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
+{
+  return RenderTurbulence_SIMD<simd::Scalarf32x4_t,
+                               simd::Scalari32x4_t,
+                               simd::Scalaru8x16_t>(aSize, aOffset, aBaseFrequency, aSeed,
+                                                    aNumOctaves, aType, aStitch, aTileRect);
+}
+
+// Scalar variant of the arithmetic combine filter: instantiates the shared
+// ApplyArithmeticCombine_SIMD template with the simd::Scalar* vector types
+// and forwards both inputs and the four coefficients unchanged.
+TemporaryRef<DataSourceSurface>
+FilterProcessing::ApplyArithmeticCombine_Scalar(DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1, Float aK2, Float aK3, Float aK4)
+{
+  return ApplyArithmeticCombine_SIMD<simd::Scalari32x4_t,
+                                     simd::Scalari16x8_t,
+                                     simd::Scalaru8x16_t>(aInput1, aInput2,
+                                                          aK1, aK2, aK3, aK4);
+}
+
+} // namespace gfx
+} // namespace mozilla
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@ -52,11 +52,13 @@ ifneq (,$(INTEL_ARCHITECTURE))
 ifdef GNU_CC
 ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
 BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
+FilterProcessingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
 endif

 ifdef SOLARIS_SUNPRO_CXX
 ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
 BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
+FilterProcessingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
 endif
 endif

--- a/gfx/2d/moz.build
+++ b/gfx/2d/moz.build
@ -79,6 +79,7 @@ if CONFIG['INTEL_ARCHITECTURE']:
    if CONFIG['_MSC_VER'] != '1400':
        SOURCES += [
            'BlurSSE2.cpp',
+            'FilterProcessingSSE2.cpp',
            'ImageScalingSSE2.cpp',
        ]

@ -90,6 +91,8 @@ UNIFIED_SOURCES += [
    'DrawTargetDual.cpp',
    'DrawTargetRecording.cpp',
    'Factory.cpp',
+    'FilterProcessing.cpp',
+    'FilterProcessingScalar.cpp',
    'ImageScaling.cpp',
    'Matrix.cpp',
    'Path.cpp',