From 826a893d647b7eab3da9c73a165343e9f59acefe Mon Sep 17 00:00:00 2001
From: Benoit Jacob <bjacob@mozilla.com>
Date: Mon, 7 May 2012 13:05:32 -0400
Subject: [PATCH] Bug 749711 - Lots of WebGL texture conversion fixes and
 improvements - r=jgilbert

* Templatize pack/unpack routines
** Inside anonymous namespace in a naive attempt to not hammer PGO linker
   memory usage.
* Support conversions changing texel size
* Support conversion from integer to float formats
* Support RGBA32F properly
* Avoid compiling useless paths (code size down to 17k from 44k)
---
 content/canvas/src/Makefile.in               |   1 +
 content/canvas/src/WebGLContext.h            |  59 +-
 content/canvas/src/WebGLContextGL.cpp        | 305 ++------
 content/canvas/src/WebGLTexelConversions.cpp | 382 +++++++++
 content/canvas/src/WebGLTexelConversions.h   | 771 ++++++++++++-------
 5 files changed, 991 insertions(+), 527 deletions(-)
 create mode 100644 content/canvas/src/WebGLTexelConversions.cpp

diff --git a/content/canvas/src/Makefile.in b/content/canvas/src/Makefile.in
index 8919c2c4b91..662a8b61b79 100644
--- a/content/canvas/src/Makefile.in
+++ b/content/canvas/src/Makefile.in
@@ -82,6 +82,7 @@ CPPSRCS += \
 	WebGLExtensionStandardDerivatives.cpp \
 	WebGLExtensionTextureFilterAnisotropic.cpp \
 	WebGLExtensionLoseContext.cpp \
+	WebGLTexelConversions.cpp \
 	$(NULL)
 
 DEFINES += -DUSE_ANGLE
diff --git a/content/canvas/src/WebGLContext.h b/content/canvas/src/WebGLContext.h
index 8ccb337bebf..40a027aeea4 100644
--- a/content/canvas/src/WebGLContext.h
+++ b/content/canvas/src/WebGLContext.h
@@ -128,16 +128,51 @@ struct BackbufferClearingStatus {
     enum { NotClearedSinceLastPresented, ClearedToDefaultValues, HasBeenDrawnTo };
 };
 
-struct WebGLTexelFormat {
-    enum { Generic, Auto, RGBA8, RGB8, RGBX8, BGRA8, BGR8, BGRX8, RGBA5551, RGBA4444, RGB565, R8, RA8, A8,
-           RGBA32F, RGB32F, A32F, R32F, RA32F };
+namespace WebGLTexelConversions {
+
+/*
+ * The formats that may participate, either as source or destination formats,
+ * in WebGL texture conversions. This includes:
+ *  - all the formats accepted by WebGL.texImage2D, e.g. RGBA4444
+ *  - additional formats provided by extensions, e.g. RGB32F
+ *  - additional source formats, depending on browser details, used when uploading
+ *    textures from DOM elements. See gfxImageSurface::Format().
+ */
+enum WebGLTexelFormat
+{
+    // Sentinel returned by GetWebGLTexelFormat for an unrecognized format/type
+    // pair, after assertion failure (so this never happens in debug builds).
+    BadFormat,
+    // Pseudo-format meaning "use the other format"; resolved by the caller.
+    // For example, if srcFormat=Auto and dstFormat=RGB8, then the source
+    // is implicitly treated as being RGB8 itself before any conversion runs.
+    Auto,
+    // 1-channel formats
+    R8, // LUMINANCE with UNSIGNED_BYTE
+    A8, // ALPHA with UNSIGNED_BYTE; also a DOM element source format
+    R32F, // LUMINANCE with FLOAT; used for OES_texture_float extension
+    A32F, // ALPHA with FLOAT; used for OES_texture_float extension
+    // 2-channel formats
+    RA8, // LUMINANCE_ALPHA with UNSIGNED_BYTE
+    RA32F, // LUMINANCE_ALPHA with FLOAT; used for OES_texture_float extension
+    // 3-channel formats
+    RGB8,
+    BGRX8, // used for DOM elements. Source format only. 4 bytes per texel (RGB24 surfaces are not tightly packed).
+    RGB565, // UNSIGNED_SHORT_5_6_5; also a DOM element source format
+    RGB32F, // used for OES_texture_float extension
+    // 4-channel formats
+    RGBA8,
+    BGRA8, // used for DOM elements
+    RGBA5551,
+    RGBA4444,
+    RGBA32F // used for OES_texture_float extension
+};
 
-struct WebGLTexelPremultiplicationOp {
-    enum { Generic, None, Premultiply, Unmultiply };
-};
+} // end namespace WebGLTexelConversions
 
-int GetWebGLTexelFormat(GLenum format, GLenum type);
+using WebGLTexelConversions::WebGLTexelFormat;
+
+WebGLTexelFormat GetWebGLTexelFormat(GLenum format, GLenum type);
 
 // Zero is not an integer power of two.
 inline bool is_pot_assuming_nonnegative(WebGLsizei x)
@@ -1205,26 +1240,26 @@ protected:
                          WebGLenum format, WebGLenum type,
                          void *data, PRUint32 byteLength,
                          int jsArrayType,
-                         int srcFormat, bool srcPremultiplied);
+                         WebGLTexelFormat srcFormat, bool srcPremultiplied);
     void TexSubImage2D_base(WebGLenum target, WebGLint level,
                             WebGLint xoffset, WebGLint yoffset,
                             WebGLsizei width, WebGLsizei height, WebGLsizei srcStrideOrZero,
                             WebGLenum format, WebGLenum type,
                             void *pixels, PRUint32 byteLength,
                             int jsArrayType,
-                            int srcFormat, bool srcPremultiplied);
+                            WebGLTexelFormat srcFormat, bool srcPremultiplied);
     void TexParameter_base(WebGLenum target, WebGLenum pname,
                            WebGLint *intParamPtr, WebGLfloat *floatParamPtr);
 
     void ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
                       const PRUint8*src, PRUint8 *dst,
-                      int srcFormat, bool srcPremultiplied,
-                      int dstFormat, bool dstPremultiplied,
+                      WebGLTexelFormat srcFormat, bool srcPremultiplied,
+                      WebGLTexelFormat dstFormat, bool dstPremultiplied,
                       size_t dstTexelSize);
 
     nsresult DOMElementToImageSurface(dom::Element* imageOrCanvas,
                                       gfxImageSurface **imageOut,
-                                      int *format);
+                                      WebGLTexelFormat *format);
 
     void CopyTexSubImage2D_base(WebGLenum target,
                                 WebGLint level,
diff --git a/content/canvas/src/WebGLContextGL.cpp b/content/canvas/src/WebGLContextGL.cpp
index c6f1ce6a2e2..1ef346251ee 100644
--- a/content/canvas/src/WebGLContextGL.cpp
+++ b/content/canvas/src/WebGLContextGL.cpp
@@ -4296,198 +4296,9 @@ WebGLContext::StencilOpSeparate(WebGLenum face, WebGLenum sfail, WebGLenum dpfai
     gl->fStencilOpSeparate(face, sfail, dpfail, dppass);
 }
 
-struct WebGLImageConverter
-{
-    bool flip;
-    size_t width, height, srcStride, dstStride, srcTexelSize, dstTexelSize;
-    const PRUint8 *src;
-    PRUint8 *dst;
-
-    WebGLImageConverter()
-    {
-        memset(this, 0, sizeof(WebGLImageConverter));
-    }
-
-    template<typename SrcType, typename DstType, typename UnpackType,
-         void unpackingFunc(const SrcType*, UnpackType*),
-         void packingFunc(const UnpackType*, DstType*)>
-    void run()
-    {
-        // Note -- even though the functions take UnpackType, the
-        // pointers below are all in terms of PRUint8; otherwise
-        // pointer math starts getting tricky.
-        for (size_t src_row = 0; src_row < height; ++src_row) {
-            size_t dst_row = flip ? (height - 1 - src_row) : src_row;
-            PRUint8 *dst_row_ptr = dst + dst_row * dstStride;
-            const PRUint8 *src_row_ptr = src + src_row * srcStride;
-            const PRUint8 *src_row_end = src_row_ptr + width * srcTexelSize; // != src_row_ptr + byteStride
-            while (src_row_ptr != src_row_end) {
-                UnpackType tmp[4];
-                unpackingFunc(reinterpret_cast<const SrcType*>(src_row_ptr), tmp);
-                packingFunc(tmp, reinterpret_cast<DstType*>(dst_row_ptr));
-                src_row_ptr += srcTexelSize;
-                dst_row_ptr += dstTexelSize;
-            }
-        }
-    }
-};
-
-void
-WebGLContext::ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
-                           const PRUint8*src, PRUint8 *dst,
-                           int srcFormat, bool srcPremultiplied,
-                           int dstFormat, bool dstPremultiplied,
-                           size_t dstTexelSize)
-{
-    if (width <= 0 || height <= 0)
-        return;
-
-    if (srcFormat == dstFormat &&
-        srcPremultiplied == dstPremultiplied)
-    {
-        // fast exit path: we just have to memcpy all the rows.
-        //
-        // The case where absolutely nothing needs to be done is supposed to have
-        // been handled earlier (in TexImage2D_base, etc).
-        //
-        // So the case we're handling here is when even though no format conversion is needed,
-        // we still might have to flip vertically and/or to adjust to a different stride.
-
-        NS_ASSERTION(mPixelStoreFlipY || srcStride != dstStride, "Performance trap -- should handle this case earlier, to avoid memcpy");
-
-        size_t row_size = width * dstTexelSize; // doesn't matter, src and dst formats agree
-        const PRUint8* src_row = src;
-        const PRUint8* src_end = src + height * srcStride;
-
-        PRUint8* dst_row = mPixelStoreFlipY ? dst + (height-1) * dstStride : dst;
-        ptrdiff_t dstStrideSigned(dstStride);
-        ptrdiff_t dst_delta = mPixelStoreFlipY ? -dstStrideSigned : dstStrideSigned;
-
-        while(src_row != src_end) {
-            memcpy(dst_row, src_row, row_size);
-            src_row += srcStride;
-            dst_row += dst_delta;
-        }
-        return;
-    }
-
-    WebGLImageConverter converter;
-    converter.flip = mPixelStoreFlipY;
-    converter.width = width;
-    converter.height = height;
-    converter.srcStride = srcStride;
-    converter.dstStride = dstStride;
-    converter.dstTexelSize = dstTexelSize;
-    converter.src = src;
-    converter.dst = dst;
-
-    int premultiplicationOp = (!srcPremultiplied && dstPremultiplied) ? WebGLTexelPremultiplicationOp::Premultiply
-                            : (srcPremultiplied && !dstPremultiplied) ? WebGLTexelPremultiplicationOp::Unmultiply
-                            : WebGLTexelPremultiplicationOp::None;
-
-#define HANDLE_DSTFORMAT(format, SrcType, DstType, unpackFunc, packFunc) \
-        case WebGLTexelFormat::format: \
-            switch (premultiplicationOp) { \
-                case WebGLTexelPremultiplicationOp::Premultiply: \
-                    converter.run<SrcType, DstType, PRUint8,          \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Premultiply>(); \
-                break; \
-                case WebGLTexelPremultiplicationOp::Unmultiply: \
-                    converter.run<SrcType, DstType, PRUint8, \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Unmultiply>(); \
-                break; \
-                default: \
-                    converter.run<SrcType, DstType, PRUint8, \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc>(); \
-                break; \
-            } \
-            break;
-
-#define HANDLE_SRCFORMAT(format, size, SrcType, unpackFunc) \
-        case WebGLTexelFormat::format: \
-            converter.srcTexelSize = size; \
-            switch (dstFormat) { \
-                HANDLE_DSTFORMAT(RGBA8,    SrcType, PRUint8,  unpackFunc, packRGBA8ToRGBA8) \
-                HANDLE_DSTFORMAT(RGB8,     SrcType, PRUint8,  unpackFunc, packRGBA8ToRGB8) \
-                HANDLE_DSTFORMAT(R8,       SrcType, PRUint8,  unpackFunc, packRGBA8ToR8) \
-                HANDLE_DSTFORMAT(RA8,      SrcType, PRUint8,  unpackFunc, packRGBA8ToRA8) \
-                HANDLE_DSTFORMAT(RGBA5551, SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort5551) \
-                HANDLE_DSTFORMAT(RGBA4444, SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort4444) \
-                HANDLE_DSTFORMAT(RGB565,   SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort565) \
-                /* A8 needs to be special-cased as it doesn't have color channels to premultiply */ \
-                case WebGLTexelFormat::A8: \
-                    converter.run<SrcType, PRUint8, PRUint8,          \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packRGBA8ToA8>(); \
-                    break; \
-                default: \
-                    NS_ASSERTION(false, "Coding error?! Should never reach this point."); \
-                    return; \
-            } \
-            break;
-
-#define HANDLE_FLOAT_DSTFORMAT(format, unpackFunc, packFunc) \
-        case WebGLTexelFormat::format: \
-            switch (premultiplicationOp) { \
-                case WebGLTexelPremultiplicationOp::Premultiply: \
-                    converter.run<float, float, float,                \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Premultiply>(); \
-                break; \
-                case WebGLTexelPremultiplicationOp::Unmultiply: \
-                    NS_ASSERTION(false, "Floating point can't be un-premultiplied -- we have no premultiplied source data!"); \
-                break; \
-                default: \
-                    converter.run<float, float, float,                \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc>(); \
-                break; \
-            } \
-            break;
-
-#define HANDLE_FLOAT_SRCFORMAT(format, size, unpackFunc)                \
-        case WebGLTexelFormat::format:                                  \
-            converter.srcTexelSize = size;                              \
-            switch (dstFormat) {                                        \
-                HANDLE_FLOAT_DSTFORMAT(RGB32F, unpackFunc, packRGBA32FToRGB32F) \
-                HANDLE_FLOAT_DSTFORMAT(A32F,   unpackFunc, packRGBA32FToA32F) \
-                HANDLE_FLOAT_DSTFORMAT(R32F,   unpackFunc, packRGBA32FToR32F) \
-                HANDLE_FLOAT_DSTFORMAT(RA32F,  unpackFunc, packRGBA32FToRA32F) \
-                default: \
-                    NS_ASSERTION(false, "Coding error?! Should never reach this point."); \
-                    return; \
-            } \
-            break;
-        
-    switch (srcFormat) {
-        HANDLE_SRCFORMAT(RGBA8,    4, PRUint8,  unpackRGBA8ToRGBA8)
-        HANDLE_SRCFORMAT(RGBX8,    4, PRUint8,  unpackRGB8ToRGBA8)
-        HANDLE_SRCFORMAT(RGB8,     3, PRUint8,  unpackRGB8ToRGBA8)
-        HANDLE_SRCFORMAT(BGRA8,    4, PRUint8,  unpackBGRA8ToRGBA8)
-        HANDLE_SRCFORMAT(BGRX8,    4, PRUint8,  unpackBGR8ToRGBA8)
-        HANDLE_SRCFORMAT(BGR8,     3, PRUint8,  unpackBGR8ToRGBA8)
-        HANDLE_SRCFORMAT(R8,       1, PRUint8,  unpackR8ToRGBA8)
-        HANDLE_SRCFORMAT(A8,       1, PRUint8,  unpackA8ToRGBA8)
-        HANDLE_SRCFORMAT(RA8,      2, PRUint8,  unpackRA8ToRGBA8)
-        HANDLE_SRCFORMAT(RGBA5551, 2, PRUint16, unpackRGBA5551ToRGBA8)
-        HANDLE_SRCFORMAT(RGBA4444, 2, PRUint16, unpackRGBA4444ToRGBA8)
-        HANDLE_SRCFORMAT(RGB565,   2, PRUint16, unpackRGB565ToRGBA8)
-        HANDLE_FLOAT_SRCFORMAT(RGB32F,  12, unpackRGB32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(RA32F,    8, unpackRA32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(R32F,     4, unpackR32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(A32F,     4, unpackA32FToRGBA32F)
-        default:
-            NS_ASSERTION(false, "Coding error?! Should never reach this point.");
-            return;
-    }
-}
-
 nsresult
 WebGLContext::DOMElementToImageSurface(Element* imageOrCanvas,
-                                       gfxImageSurface **imageOut, int *format)
+                                       gfxImageSurface **imageOut, WebGLTexelFormat *format)
 {
     if (!imageOrCanvas) {
         return NS_ERROR_FAILURE;
@@ -4556,16 +4367,16 @@ WebGLContext::DOMElementToImageSurface(Element* imageOrCanvas,
 
     switch (surf->Format()) {
         case gfxASurface::ImageFormatARGB32:
-            *format = WebGLTexelFormat::BGRA8; // careful, our ARGB means BGRA
+            *format = WebGLTexelConversions::BGRA8; // careful, our ARGB means BGRA
             break;
         case gfxASurface::ImageFormatRGB24:
-            *format = WebGLTexelFormat::BGRX8; // careful, our RGB24 is not tightly packed. Whence BGRX8.
+            *format = WebGLTexelConversions::BGRX8; // careful, our RGB24 is not tightly packed. Whence BGRX8.
             break;
         case gfxASurface::ImageFormatA8:
-            *format = WebGLTexelFormat::A8;
+            *format = WebGLTexelConversions::A8;
             break;
         case gfxASurface::ImageFormatRGB16_565:
-            *format = WebGLTexelFormat::RGB565;
+            *format = WebGLTexelConversions::RGB565;
             break;
         default:
             NS_ASSERTION(false, "Unsupported image format. Unimplemented.");
@@ -5644,7 +5455,7 @@ WebGLContext::TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum intern
                               WebGLenum format, WebGLenum type,
                               void *data, PRUint32 byteLength,
                               int jsArrayType, // a TypedArray format enum, or -1 if not relevant
-                              int srcFormat, bool srcPremultiplied)
+                              WebGLTexelFormat srcFormat, bool srcPremultiplied)
 {
     switch (target) {
         case LOCAL_GL_TEXTURE_2D:
@@ -5699,14 +5510,19 @@ WebGLContext::TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum intern
     if (border != 0)
         return ErrorInvalidValue("TexImage2D: border must be 0");
 
-    PRUint32 texelSize = 0;
-    if (!ValidateTexFormatAndType(format, type, jsArrayType, &texelSize, "texImage2D"))
+    PRUint32 dstTexelSize = 0;
+    if (!ValidateTexFormatAndType(format, type, jsArrayType, &dstTexelSize, "texImage2D"))
         return;
 
-    CheckedUint32 checked_neededByteLength = 
-        GetImageSize(height, width, texelSize, mPixelStoreUnpackAlignment); 
+    WebGLTexelFormat dstFormat = GetWebGLTexelFormat(format, type);
+    WebGLTexelFormat actualSrcFormat = srcFormat == WebGLTexelConversions::Auto ? dstFormat : srcFormat;
 
-    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * texelSize;
+    PRUint32 srcTexelSize = WebGLTexelConversions::TexelBytesForFormat(actualSrcFormat);
+
+    CheckedUint32 checked_neededByteLength = 
+        GetImageSize(height, width, srcTexelSize, mPixelStoreUnpackAlignment);
+
+    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * srcTexelSize;
 
     CheckedUint32 checked_alignedRowSize =
         RoundedToNextMultipleOf(checked_plainRowSize.value(), mPixelStoreUnpackAlignment);
@@ -5734,11 +5550,9 @@ WebGLContext::TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum intern
     GLenum error = LOCAL_GL_NO_ERROR;
 
     if (byteLength) {
-        int dstFormat = GetWebGLTexelFormat(format, type);
-        int actualSrcFormat = srcFormat == WebGLTexelFormat::Auto ? dstFormat : srcFormat;
         size_t srcStride = srcStrideOrZero ? srcStrideOrZero : checked_alignedRowSize.value();
 
-        size_t dstPlainRowSize = texelSize * width;
+        size_t dstPlainRowSize = dstTexelSize * width;
         size_t unpackAlignment = mPixelStoreUnpackAlignment;
         size_t dstStride = ((dstPlainRowSize + unpackAlignment-1) / unpackAlignment) * unpackAlignment;
 
@@ -5753,11 +5567,12 @@ WebGLContext::TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum intern
         }
         else
         {
-            nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[bytesNeeded]);
+            size_t convertedDataSize = height * dstStride;
+            nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[convertedDataSize]);
             ConvertImage(width, height, srcStride, dstStride,
                         (PRUint8*)data, convertedData,
                         actualSrcFormat, srcPremultiplied,
-                        dstFormat, mPixelStorePremultiplyAlpha, texelSize);
+                        dstFormat, mPixelStorePremultiplyAlpha, dstTexelSize);
             error = CheckedTexImage2D(target, level, internalformat,
                                       width, height, border, format, type, convertedData);
         }
@@ -5814,7 +5629,7 @@ WebGLContext::TexImage2D(JSContext* cx, WebGLenum target, WebGLint level,
                            pixels ? pixels->mData : 0,
                            pixels ? pixels->mLength : 0,
                            pixels ? (int)JS_GetTypedArrayType(pixels->mObj, cx) : -1,
-                           WebGLTexelFormat::Auto, false);
+                           WebGLTexelConversions::Auto, false);
 }
 
 NS_IMETHODIMP
@@ -5832,7 +5647,7 @@ WebGLContext::TexImage2D_imageData(WebGLenum target, WebGLint level, WebGLenum i
                     pixels ? JS_GetArrayBufferViewData(pixels, cx) : 0,
                     pixels ? JS_GetArrayBufferViewByteLength(pixels, cx) : 0,
                     -1,
-                    WebGLTexelFormat::RGBA8, false);
+                    WebGLTexelConversions::RGBA8, false);
     return NS_OK;
 }
 
@@ -5853,7 +5668,7 @@ WebGLContext::TexImage2D(JSContext* cx, WebGLenum target, WebGLint level,
     return TexImage2D_base(target, level, internalformat, pixels->GetWidth(),
                            pixels->GetHeight(), 4*pixels->GetWidth(), 0,
                            format, type, arr.mData, arr.mLength, -1,
-                           WebGLTexelFormat::RGBA8, false);
+                           WebGLTexelConversions::RGBA8, false);
 }
 
 
@@ -5877,7 +5692,7 @@ WebGLContext::TexImage2D(JSContext* /* unused */, WebGLenum target,
 
     nsRefPtr<gfxImageSurface> isurf;
 
-    int srcFormat;
+    WebGLTexelFormat srcFormat;
     rv = DOMElementToImageSurface(elt, getter_AddRefs(isurf), &srcFormat);
     if (rv.Failed())
         return;
@@ -5908,7 +5723,7 @@ WebGLContext::TexSubImage2D_base(WebGLenum target, WebGLint level,
                                  WebGLenum format, WebGLenum type,
                                  void *pixels, PRUint32 byteLength,
                                  int jsArrayType,
-                                 int srcFormat, bool srcPremultiplied)
+                                 WebGLTexelFormat srcFormat, bool srcPremultiplied)
 {
     switch (target) {
         case LOCAL_GL_TEXTURE_2D:
@@ -5943,17 +5758,22 @@ WebGLContext::TexSubImage2D_base(WebGLenum target, WebGLint level,
             return ErrorInvalidValue("texSubImage2D: with level > 0, width and height must be powers of two");
     }
 
-    PRUint32 texelSize = 0;
-    if (!ValidateTexFormatAndType(format, type, jsArrayType, &texelSize, "texSubImage2D"))
+    PRUint32 dstTexelSize = 0;
+    if (!ValidateTexFormatAndType(format, type, jsArrayType, &dstTexelSize, "texSubImage2D"))
         return;
 
+    WebGLTexelFormat dstFormat = GetWebGLTexelFormat(format, type);
+    WebGLTexelFormat actualSrcFormat = srcFormat == WebGLTexelConversions::Auto ? dstFormat : srcFormat;
+
+    PRUint32 srcTexelSize = WebGLTexelConversions::TexelBytesForFormat(actualSrcFormat);
+
     if (width == 0 || height == 0)
         return; // ES 2.0 says it has no effect, we better return right now
 
     CheckedUint32 checked_neededByteLength = 
-        GetImageSize(height, width, texelSize, mPixelStoreUnpackAlignment);
+        GetImageSize(height, width, srcTexelSize, mPixelStoreUnpackAlignment);
 
-    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * texelSize;
+    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * srcTexelSize;
 
     CheckedUint32 checked_alignedRowSize = 
         RoundedToNextMultipleOf(checked_plainRowSize.value(), mPixelStoreUnpackAlignment);
@@ -5986,11 +5806,9 @@ WebGLContext::TexSubImage2D_base(WebGLenum target, WebGLint level,
 
     MakeContextCurrent();
 
-    int dstFormat = GetWebGLTexelFormat(format, type);
-    int actualSrcFormat = srcFormat == WebGLTexelFormat::Auto ? dstFormat : srcFormat;
     size_t srcStride = srcStrideOrZero ? srcStrideOrZero : checked_alignedRowSize.value();
 
-    size_t dstPlainRowSize = texelSize * width;
+    size_t dstPlainRowSize = dstTexelSize * width;
     // There are checks above to ensure that this won't overflow.
     size_t dstStride = RoundedToNextMultipleOf(dstPlainRowSize, mPixelStoreUnpackAlignment).value();
 
@@ -6004,11 +5822,12 @@ WebGLContext::TexSubImage2D_base(WebGLenum target, WebGLint level,
     }
     else
     {
-        nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[bytesNeeded]);
+        size_t convertedDataSize = height * dstStride;
+        nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[convertedDataSize]);
         ConvertImage(width, height, srcStride, dstStride,
                     (const PRUint8*)pixels, convertedData,
                     actualSrcFormat, srcPremultiplied,
-                    dstFormat, mPixelStorePremultiplyAlpha, texelSize);
+                    dstFormat, mPixelStorePremultiplyAlpha, dstTexelSize);
 
         gl->fTexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, convertedData);
     }
@@ -6051,7 +5870,7 @@ WebGLContext::TexSubImage2D(JSContext* cx, WebGLenum target, WebGLint level,
                               width, height, 0, format, type,
                               pixels->mData, pixels->mLength,
                               JS_GetTypedArrayType(pixels->mObj, cx),
-                              WebGLTexelFormat::Auto, false);
+                              WebGLTexelConversions::Auto, false);
 }
 
 NS_IMETHODIMP
@@ -6075,7 +5894,7 @@ WebGLContext::TexSubImage2D_imageData(WebGLenum target, WebGLint level,
                        width, height, 4*width, format, type,
                        JS_GetArrayBufferViewData(pixels, cx), JS_GetArrayBufferViewByteLength(pixels, cx),
                        -1,
-                       WebGLTexelFormat::RGBA8, false);
+                       WebGLTexelConversions::RGBA8, false);
     return NS_OK;
 }
 
@@ -6097,7 +5916,7 @@ WebGLContext::TexSubImage2D(JSContext* cx, WebGLenum target, WebGLint level,
                               4*pixels->GetWidth(), format, type,
                               arr.mData, arr.mLength,
                               -1,
-                              WebGLTexelFormat::RGBA8, false);
+                              WebGLTexelConversions::RGBA8, false);
 }
 
 NS_IMETHODIMP
@@ -6122,7 +5941,7 @@ WebGLContext::TexSubImage2D(JSContext* /* unused */, WebGLenum target,
 
     nsRefPtr<gfxImageSurface> isurf;
 
-    int srcFormat;
+    WebGLTexelFormat srcFormat;
     rv = DOMElementToImageSurface(elt, getter_AddRefs(isurf), &srcFormat);
     if (rv.Failed())
         return;
@@ -6232,52 +6051,52 @@ BaseTypeAndSizeFromUniformType(WebGLenum uType, WebGLenum *baseType, WebGLint *u
 }
 
 
-int mozilla::GetWebGLTexelFormat(GLenum format, GLenum type)
+WebGLTexelFormat mozilla::GetWebGLTexelFormat(GLenum format, GLenum type)
 {
     if (type == LOCAL_GL_UNSIGNED_BYTE) {
         switch (format) {
             case LOCAL_GL_RGBA:
-                return WebGLTexelFormat::RGBA8;
+                return WebGLTexelConversions::RGBA8;
             case LOCAL_GL_RGB:
-                return WebGLTexelFormat::RGB8;
+                return WebGLTexelConversions::RGB8;
             case LOCAL_GL_ALPHA:
-                return WebGLTexelFormat::A8;
+                return WebGLTexelConversions::A8;
             case LOCAL_GL_LUMINANCE:
-                return WebGLTexelFormat::R8;
+                return WebGLTexelConversions::R8;
             case LOCAL_GL_LUMINANCE_ALPHA:
-                return WebGLTexelFormat::RA8;
+                return WebGLTexelConversions::RA8;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     } else if (type == LOCAL_GL_FLOAT) {
         // OES_texture_float
         switch (format) {
             case LOCAL_GL_RGBA:
-                return WebGLTexelFormat::RGBA32F;
+                return WebGLTexelConversions::RGBA32F;
             case LOCAL_GL_RGB:
-                return WebGLTexelFormat::RGB32F;
+                return WebGLTexelConversions::RGB32F;
             case LOCAL_GL_ALPHA:
-                return WebGLTexelFormat::A32F;
+                return WebGLTexelConversions::A32F;
             case LOCAL_GL_LUMINANCE:
-                return WebGLTexelFormat::R32F;
+                return WebGLTexelConversions::R32F;
             case LOCAL_GL_LUMINANCE_ALPHA:
-                return WebGLTexelFormat::RA32F;
+                return WebGLTexelConversions::RA32F;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     } else {
         switch (type) {
             case LOCAL_GL_UNSIGNED_SHORT_4_4_4_4:
-                return WebGLTexelFormat::RGBA4444;
+                return WebGLTexelConversions::RGBA4444;
             case LOCAL_GL_UNSIGNED_SHORT_5_5_5_1:
-                return WebGLTexelFormat::RGBA5551;
+                return WebGLTexelConversions::RGBA5551;
             case LOCAL_GL_UNSIGNED_SHORT_5_6_5:
-                return WebGLTexelFormat::RGB565;
+                return WebGLTexelConversions::RGB565;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     }
 }
diff --git a/content/canvas/src/WebGLTexelConversions.cpp b/content/canvas/src/WebGLTexelConversions.cpp
new file mode 100644
index 00000000000..04036cdaaed
--- /dev/null
+++ b/content/canvas/src/WebGLTexelConversions.cpp
@@ -0,0 +1,382 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "WebGLTexelConversions.h"
+
+namespace mozilla {
+
+using namespace WebGLTexelConversions;
+
+namespace {
+
+/** @class WebGLImageConverter
+ *
+ * This class is just a helper to implement WebGLContext::ConvertImage below.
+ *
+ * Design comments:
+ *
+ * WebGLContext::ConvertImage has to handle hundreds of format conversion paths.
+ * It is important to minimize executable code size here. Instead of passing around
+ * a large number of function parameters hundreds of times, we create a
+ * WebGLImageConverter object once, storing these parameters, and then we call
+ * the run() method on it.
+ */
+class WebGLImageConverter
+{
+    const size_t mWidth, mHeight;
+    const void* const mSrcStart;
+    void* const mDstStart;
+    const ptrdiff_t mSrcStride, mDstStride;
+    bool mAlreadyRun;
+    bool mSuccess;
+
+    /*
+     * Returns sizeof(texel)/sizeof(type). The point is that we will iterate over
+     * texels with typed pointers and this value will tell us by how much we need
+     * to increment these pointers to advance to the next texel.
+     */
+    template<int Format>
+    static size_t NumElementsPerTexelForFormat() {
+        switch (Format) {
+            case R8:
+            case A8:
+            case R32F:
+            case A32F:
+            case RGBA5551:
+            case RGBA4444:
+            case RGB565:
+                return 1;
+            case RA8:
+            case RA32F:
+                return 2;
+            case RGB8:
+            case RGB32F:
+                return 3;
+            case RGBA8:
+            case BGRA8:
+            case BGRX8:
+            case RGBA32F:
+                return 4;
+            default:
+                NS_ABORT_IF_FALSE(false, "Unknown texel format. Coding mistake?");
+                return 0;
+        }
+    }
+
+    /*
+     * This is the completely format-specific templatized conversion function,
+     * that will be instantiated hundreds of times for all different combinations.
+     * Many instantiations will never be called; for those we return immediately,
+     * before any real work, so the compiler can avoid generating useless code.
+     * Such an early return leaves mSuccess false and the destination untouched;
+     * mSuccess is only set to true once the whole image has been converted.
+     */
+    template<WebGLTexelFormat SrcFormat,
+             WebGLTexelFormat DstFormat,
+             WebGLTexelPremultiplicationOp PremultiplicationOp>
+    void run()
+    {
+        // check for never-called cases. We early-return to allow the compiler
+        // to avoid generating this code. It would be tempting to abort() instead,
+        // as returning early does leave the destination surface with uninitialized
+        // data, but that would not allow the compiler to avoid generating this code.
+        // So instead, we return early, so Success() will return false, and the caller
+        // must check that and abort in that case. See WebGLContext::ConvertImage.
+
+        if (SrcFormat == DstFormat &&
+            PremultiplicationOp == NoPremultiplicationOp)
+        {
+            // Should have used a fast exit path earlier, rather than entering this function.
+            // we explicitly return here to allow the compiler to avoid generating this code
+            return;
+        }
+
+        // Only textures uploaded from DOM elements or ImageData can allow DstFormat != SrcFormat.
+        // DOM elements can only give BGRA8, BGRX8, A8, RGB565 formats. See DOMElementToImageSurface.
+        // ImageData is always RGBA8. So all other SrcFormat will always satisfy DstFormat==SrcFormat,
+        // so we can avoid compiling the code for all the unreachable paths.
+        const bool CanSrcFormatComeFromDOMElementOrImageData
+            = SrcFormat == BGRA8 ||
+              SrcFormat == BGRX8 ||
+              SrcFormat == A8 ||
+              SrcFormat == RGB565 ||
+              SrcFormat == RGBA8;
+        if (!CanSrcFormatComeFromDOMElementOrImageData &&
+            SrcFormat != DstFormat)
+        {
+            return;
+        }
+
+        // Likewise, only textures uploaded from DOM elements or ImageData can possibly have to be unpremultiplied.
+        if (!CanSrcFormatComeFromDOMElementOrImageData &&
+            PremultiplicationOp == Unpremultiply)
+        {
+            return;
+        }
+
+        // there is no point in premultiplication/unpremultiplication
+        // in the following cases:
+        //  - the source format has no alpha
+        //  - the source format has no color
+        //  - the destination format has no color
+        if (!HasAlpha(SrcFormat) ||
+            !HasColor(SrcFormat) ||
+            !HasColor(DstFormat))
+        {
+
+            if (PremultiplicationOp != NoPremultiplicationOp)
+            {
+                return;
+            }
+        }
+
+        // end of early return cases.
+
+        NS_ABORT_IF_FALSE(!mAlreadyRun, "converter should be run only once!");
+        mAlreadyRun = true;
+
+        // gather some compile-time meta-data about the formats at hand.
+
+        typedef
+            typename DataTypeForFormat<SrcFormat>::Type
+            SrcType;
+        typedef
+            typename DataTypeForFormat<DstFormat>::Type
+            DstType;
+
+        const int IntermediateSrcFormat
+            = IntermediateFormat<SrcFormat>::Value;
+        const int IntermediateDstFormat
+            = IntermediateFormat<DstFormat>::Value;
+        typedef
+            typename DataTypeForFormat<IntermediateSrcFormat>::Type
+            IntermediateSrcType;
+        typedef
+            typename DataTypeForFormat<IntermediateDstFormat>::Type
+            IntermediateDstType;
+
+        const size_t NumElementsPerSrcTexel = NumElementsPerTexelForFormat<SrcFormat>();
+        const size_t NumElementsPerDstTexel = NumElementsPerTexelForFormat<DstFormat>();
+        const size_t MaxElementsPerTexel = 4;
+        NS_ABORT_IF_FALSE(NumElementsPerSrcTexel <= MaxElementsPerTexel, "unhandled format");
+        NS_ABORT_IF_FALSE(NumElementsPerDstTexel <= MaxElementsPerTexel, "unhandled format");
+
+        // we assume that the strides are multiples of the sizeof of respective types.
+        // this assumption allows us to iterate over src and dst images using typed
+        // pointers, e.g. uint8_t* or uint16_t* or float*, instead of untyped pointers.
+        // If it ever becomes wrong, we'll have to revert to always iterating using
+        // uint8_t* pointers. Note that mDstStride is negative when flipping vertically,
+        // so sizeof() must be cast to a signed type before dividing the stride by it.
+        NS_ABORT_IF_FALSE(mSrcStride % static_cast<ptrdiff_t>(sizeof(SrcType)) == 0 &&
+                          mDstStride % static_cast<ptrdiff_t>(sizeof(DstType)) == 0,
+                          "Unsupported: texture stride is not a multiple of sizeof(type)");
+        const ptrdiff_t srcStrideInElements = mSrcStride / static_cast<ptrdiff_t>(sizeof(SrcType));
+        const ptrdiff_t dstStrideInElements = mDstStride / static_cast<ptrdiff_t>(sizeof(DstType));
+
+        const SrcType *srcRowStart = static_cast<const SrcType*>(mSrcStart);
+        DstType *dstRowStart = static_cast<DstType*>(mDstStart);
+
+        // the loop performing the texture format conversion
+        for (size_t i = 0; i < mHeight; ++i) {
+            const SrcType *srcRowEnd = srcRowStart + mWidth * NumElementsPerSrcTexel;
+            const SrcType *srcPtr = srcRowStart;
+            DstType *dstPtr = dstRowStart;
+            while (srcPtr != srcRowEnd) {
+                // convert a single texel. We proceed in 3 steps: unpack the source texel
+                // to the corresponding intermediate format (e.g. unpack RGB565 to RGBA8),
+                // convert the resulting data type to the destination type (e.g. convert
+                // from RGBA8 to RGBA32F), and finally pack the destination texel
+                // (e.g. pack RGBA32F to RGB32F).
+                IntermediateSrcType unpackedSrc[MaxElementsPerTexel];
+                IntermediateDstType unpackedDst[MaxElementsPerTexel];
+
+                // unpack a src texel to corresponding intermediate src format.
+                // for example, unpack RGB565 to RGBA8
+                unpack<SrcFormat>(srcPtr, unpackedSrc);
+                // convert the data type to the destination type, if needed.
+                // for example, convert RGBA8 to RGBA32F
+                convertType(unpackedSrc, unpackedDst);
+                // pack the destination texel.
+                // for example, pack RGBA32F to RGB32F
+                pack<DstFormat, PremultiplicationOp>(unpackedDst, dstPtr);
+
+                srcPtr += NumElementsPerSrcTexel;
+                dstPtr += NumElementsPerDstTexel;
+            }
+            srcRowStart += srcStrideInElements;
+            dstRowStart += dstStrideInElements;
+        }
+
+        mSuccess = true;
+        return;
+    }
+
+    template<WebGLTexelFormat SrcFormat, WebGLTexelFormat DstFormat>
+    void run(WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(PremultiplicationOp) \
+            case PremultiplicationOp: \
+                return run<SrcFormat, DstFormat, PremultiplicationOp>();
+
+        switch (premultiplicationOp) {
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(NoPremultiplicationOp)
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(Premultiply)
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(Unpremultiply)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP
+    }
+
+    template<WebGLTexelFormat SrcFormat>
+    void run(WebGLTexelFormat dstFormat,
+             WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_DSTFORMAT(DstFormat) \
+            case DstFormat: \
+                return run<SrcFormat, DstFormat>(premultiplicationOp);
+
+        switch (dstFormat) {
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(R8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(A8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(R32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(A32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RA8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RA32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB565)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA5551)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA4444)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA32F)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_DSTFORMAT
+    }
+
+public:
+
+    void run(WebGLTexelFormat srcFormat,
+             WebGLTexelFormat dstFormat,
+             WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_SRCFORMAT(SrcFormat) \
+            case SrcFormat: \
+                return run<SrcFormat>(dstFormat, premultiplicationOp);
+
+        switch (srcFormat) {
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(R8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(A8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(R32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(A32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RA32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(BGRX8) // source format only
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB565)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(BGRA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA5551)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA4444)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA32F)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_SRCFORMAT
+    }
+
+    WebGLImageConverter(size_t width, size_t height,
+                        const void* srcStart, void* dstStart,
+                        ptrdiff_t srcStride, ptrdiff_t dstStride)
+        : mWidth(width), mHeight(height),
+          mSrcStart(srcStart), mDstStart(dstStart),
+          mSrcStride(srcStride), mDstStride(dstStride),
+          mAlreadyRun(false), mSuccess(false)
+    {}
+
+    bool Success() const {
+        return mSuccess;
+    }
+};
+
+} // end anonymous namespace
+
+// Converts a width x height image from srcFormat to dstFormat (strides are in bytes),
+// handling (un)premultiplication and the vertical flip requested by mPixelStoreFlipY.
+void
+WebGLContext::ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
+                           const uint8_t* src, uint8_t *dst,
+                           WebGLTexelFormat srcFormat, bool srcPremultiplied,
+                           WebGLTexelFormat dstFormat, bool dstPremultiplied,
+                           size_t dstTexelSize)
+{
+    if (width == 0 || height == 0)
+        return;
+
+    const bool FormatsRequireNoPremultiplicationOp =
+        !HasAlpha(srcFormat) ||
+        !HasColor(srcFormat) ||
+        !HasColor(dstFormat);
+
+    if (srcFormat == dstFormat &&
+        (FormatsRequireNoPremultiplicationOp || srcPremultiplied == dstPremultiplied))
+    {
+        // fast exit path: we just have to memcpy all the rows.
+        // The case where absolutely nothing needs to be done is supposed to have
+        // been handled earlier (in TexImage2D_base, etc).
+        // So the case we're handling here is when even though no format conversion is needed,
+        // we still might have to flip vertically and/or to adjust to a different stride.
+
+        NS_ABORT_IF_FALSE(mPixelStoreFlipY || srcStride != dstStride, "Performance trap -- should handle this case earlier, to avoid memcpy");
+
+        size_t row_size = width * dstTexelSize; // doesn't matter, src and dst formats agree
+        const uint8_t* ptr = src;
+        const uint8_t* src_end = src + height * srcStride;
+
+        uint8_t* dst_row = mPixelStoreFlipY
+                           ? dst + (height-1) * dstStride
+                           : dst;
+        ptrdiff_t dstStrideSigned(dstStride);
+        ptrdiff_t dst_delta = mPixelStoreFlipY ? -dstStrideSigned : dstStrideSigned;
+
+        while(ptr != src_end) {
+            memcpy(dst_row, ptr, row_size);
+            ptr += srcStride;
+            dst_row += dst_delta;
+        }
+        return;
+    }
+
+    // when flipping vertically, start at the last dst row and walk backwards.
+    // negate the signed copy: -dstStride would be evaluated as unsigned.
+    uint8_t* dstStart = mPixelStoreFlipY ? dst + (height - 1) * dstStride : dst;
+    ptrdiff_t signedDstStride = dstStride;
+    if (mPixelStoreFlipY)
+        signedDstStride = -signedDstStride;
+
+    WebGLImageConverter converter(width, height, src, dstStart, srcStride, signedDstStride);
+
+    const WebGLTexelPremultiplicationOp premultiplicationOp
+        = FormatsRequireNoPremultiplicationOp     ? NoPremultiplicationOp
+        : (!srcPremultiplied && dstPremultiplied) ? Premultiply
+        : (srcPremultiplied && !dstPremultiplied) ? Unpremultiply
+                                                  : NoPremultiplicationOp;
+
+    converter.run(srcFormat, dstFormat, premultiplicationOp);
+
+    if (!converter.Success()) {
+        // the dst image may be left uninitialized, so we better not try to
+        // continue even in release builds. This should never happen anyway,
+        // and would be a bug in our code.
+        NS_RUNTIMEABORT("programming mistake in WebGL texture conversions");
+    }
+}
+
+} // end namespace mozilla 
diff --git a/content/canvas/src/WebGLTexelConversions.h b/content/canvas/src/WebGLTexelConversions.h
index 2ef798dfa71..015b8ad2197 100644
--- a/content/canvas/src/WebGLTexelConversions.h
+++ b/content/canvas/src/WebGLTexelConversions.h
@@ -25,17 +25,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// the pixel conversions code here is originally from this file:
-//   http://trac.webkit.org/browser/trunk/WebCore/platform/graphics/GraphicsContext3D.cpp
-
-// Keep as much as possible unchanged to ease sharing code with the WebKit guys.
-// Changes:
-//  * added BGR8 path, we need it in Mozilla to load textures from DOMElements
-//  * enclosing in a namespace WebGLTexelConversions to make it clear it is, in profilers and in symbol table dumps
-//  * added __restrict keywords. Although non-standard, this is very well supported across all compilers
-//    that I know of (GCC/LLVM/MSC/ICC/XLC...)
-//  * optimized scaleFactor computation in Unmultiply functions (1 div instead of 2)
-
 #ifndef WEBGLTEXELCONVERSIONS_H_
 #define WEBGLTEXELCONVERSIONS_H_
 
@@ -44,6 +33,7 @@
 #endif
 
 #include "WebGLContext.h"
+#include "mozilla/StandardInteger.h"
 
 #if defined _MSC_VER
 #define FORCE_INLINE __forceinline
@@ -57,410 +47,647 @@ namespace mozilla {
 
 namespace WebGLTexelConversions {
 
+enum WebGLTexelPremultiplicationOp
+{
+    NoPremultiplicationOp,
+    Premultiply,
+    Unpremultiply
+};
+
+template<int Format>
+struct IsFloatFormat
+{
+    static const bool Value =
+        Format == RGBA32F ||
+        Format == RGB32F ||
+        Format == RA32F ||
+        Format == R32F ||
+        Format == A32F;
+};
+
+template<int Format>
+struct Is16bppFormat
+{
+    static const bool Value =
+        Format == RGBA4444 ||
+        Format == RGBA5551 ||
+        Format == RGB565;
+};
+
+template<int Format,
+         bool IsFloat = IsFloatFormat<Format>::Value,
+         bool Is16bpp = Is16bppFormat<Format>::Value>
+struct DataTypeForFormat
+{
+    typedef uint8_t Type;
+};
+
+template<int Format>
+struct DataTypeForFormat<Format, true, false>
+{
+    typedef float Type;
+};
+
+template<int Format>
+struct DataTypeForFormat<Format, false, true>
+{
+    typedef uint16_t Type;
+};
+
+template<int Format>
+struct IntermediateFormat
+{
+    static const int Value = IsFloatFormat<Format>::Value ? RGBA32F : RGBA8;
+};
+
+inline size_t TexelBytesForFormat(int format) {
+    switch (format) {
+        case WebGLTexelConversions::R8:
+        case WebGLTexelConversions::A8:
+            return 1;
+        case WebGLTexelConversions::RA8:
+        case WebGLTexelConversions::RGBA5551:
+        case WebGLTexelConversions::RGBA4444:
+        case WebGLTexelConversions::RGB565:
+            return 2;
+        case WebGLTexelConversions::RGB8:
+            return 3;
+        case WebGLTexelConversions::RGBA8:
+        case WebGLTexelConversions::BGRA8:
+        case WebGLTexelConversions::BGRX8:
+        case WebGLTexelConversions::R32F:
+        case WebGLTexelConversions::A32F:
+            return 4;
+        case WebGLTexelConversions::RA32F:
+            return 8;
+        case WebGLTexelConversions::RGB32F:
+            return 12;
+        case WebGLTexelConversions::RGBA32F:
+            return 16;
+        default:
+            NS_ABORT_IF_FALSE(false, "Unknown texel format. Coding mistake?");
+            return 0;
+    }
+}
+
+FORCE_INLINE bool HasAlpha(int format) {
+    return format == A8 ||
+           format == A32F ||
+           format == RA8 ||
+           format == RA32F ||
+           format == RGBA8 ||
+           format == BGRA8 ||
+           format == RGBA32F ||
+           format == RGBA4444 ||
+           format == RGBA5551;
+}
+
+FORCE_INLINE bool HasColor(int format) {
+    return format == R8 ||
+           format == R32F ||
+           format == RA8 ||
+           format == RA32F ||
+           format == RGB8 ||
+           format == BGRX8 ||
+           format == RGB565 ||
+           format == RGB32F ||
+           format == RGBA8 ||
+           format == BGRA8 ||
+           format == RGBA32F ||
+           format == RGBA4444 ||
+           format == RGBA5551;
+}
+
+
 /****** BEGIN CODE SHARED WITH WEBKIT ******/
 
+// the pack/unpack functions here are originally from this file:
+//   http://trac.webkit.org/browser/trunk/WebCore/platform/graphics/GraphicsContext3D.cpp
+
 //----------------------------------------------------------------------
 // Pixel unpacking routines.
 
-FORCE_INLINE void unpackRGBA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<int Format>
+FORCE_INLINE void
+unpack(const typename DataTypeForFormat<Format>::Type* __restrict src,
+       typename DataTypeForFormat<IntermediateFormat<Format>::Value>::Type* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = source[3];
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
 }
 
-FORCE_INLINE void unpackRGB8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = 0xFF;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void unpackBGRA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[2];
-    destination[1] = source[1];
-    destination[2] = source[0];
-    destination[3] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackBGR8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<BGRA8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[2];
-    destination[1] = source[1];
-    destination[2] = source[0];
-    destination[3] = 0xFF;
+    dst[0] = src[2];
+    dst[1] = src[1];
+    dst[2] = src[0];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void unpackRGBA5551ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<BGRX8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 11;
+    dst[0] = src[2];
+    dst[1] = src[1];
+    dst[2] = src[0];
+    dst[3] = 0xFF;
+}
+
+template<> FORCE_INLINE void
+unpack<RGBA5551>(const uint16_t* __restrict src, uint8_t* __restrict dst)
+{
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 11) & 0x1F;
     uint8_t g = (packedValue >> 6) & 0x1F;
     uint8_t b = (packedValue >> 1) & 0x1F;
-    destination[0] = (r << 3) | (r & 0x7);
-    destination[1] = (g << 3) | (g & 0x7);
-    destination[2] = (b << 3) | (b & 0x7);
-    destination[3] = (packedValue & 0x1) ? 0xFF : 0x0;
+    dst[0] = (r << 3) | (r & 0x7);
+    dst[1] = (g << 3) | (g & 0x7);
+    dst[2] = (b << 3) | (b & 0x7);
+    dst[3] = (packedValue & 0x1) ? 0xFF : 0;
 }
 
-FORCE_INLINE void unpackRGBA4444ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA4444>(const uint16_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 12;
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 12) & 0x0F;
     uint8_t g = (packedValue >> 8) & 0x0F;
     uint8_t b = (packedValue >> 4) & 0x0F;
     uint8_t a = packedValue & 0x0F;
-    destination[0] = r << 4 | r;
-    destination[1] = g << 4 | g;
-    destination[2] = b << 4 | b;
-    destination[3] = a << 4 | a;
+    dst[0] = (r << 4) | r;
+    dst[1] = (g << 4) | g;
+    dst[2] = (b << 4) | b;
+    dst[3] = (a << 4) | a;
 }
 
-FORCE_INLINE void unpackRGB565ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB565>(const uint16_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 11;
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 11) & 0x1F;
     uint8_t g = (packedValue >> 5) & 0x3F;
     uint8_t b = packedValue & 0x1F;
-    destination[0] = (r << 3) | (r & 0x7);
-    destination[1] = (g << 2) | (g & 0x3);
-    destination[2] = (b << 3) | (b & 0x7);
-    destination[3] = 0xFF;
+    dst[0] = (r << 3) | (r & 0x7);
+    dst[1] = (g << 2) | (g & 0x3);
+    dst[2] = (b << 3) | (b & 0x7);
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackR8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<R8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = 0xFF;
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackRA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RA8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = source[1];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = src[1];
 }
 
-FORCE_INLINE void unpackA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<A8>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = 0x0;
-    destination[1] = 0x0;
-    destination[2] = 0x0;
-    destination[3] = source[0];
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = src[0];
 }
 
-FORCE_INLINE void unpackRGB32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA32F>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = 1;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void unpackR32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB32F>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = 1;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = 1.0f;
 }
 
-FORCE_INLINE void unpackRA32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<R32F>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = source[1];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = 1.0f;
 }
 
-FORCE_INLINE void unpackA32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RA32F>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = 0;
-    destination[1] = 0;
-    destination[2] = 0;
-    destination[3] = source[0];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = src[1];
+}
+
+template<> FORCE_INLINE void
+unpack<A32F>(const float* __restrict src, float* __restrict dst)
+{
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = src[0];
 }
 
 //----------------------------------------------------------------------
 // Pixel packing routines.
 //
 
-FORCE_INLINE void packRGBA8ToA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<int Format, int PremultiplicationOp>
+FORCE_INLINE void
+pack(const typename DataTypeForFormat<IntermediateFormat<Format>::Value>::Type* __restrict src,
+     typename DataTypeForFormat<Format>::Type* __restrict dst)
 {
-    destination[0] = source[3];
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
 }
 
-FORCE_INLINE void packRGBA8ToR8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A8, NoPremultiplicationOp>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
+    dst[0] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToR8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A8, Premultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
+    dst[0] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<A8, Unpremultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<R8, NoPremultiplicationOp>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[0];
+}
+
+template<> FORCE_INLINE void
+pack<R8, Premultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+}
+
+template<> FORCE_INLINE void
+pack<R8, Unpremultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+}
+
+template<> FORCE_INLINE void
+pack<RA8, NoPremultiplicationOp>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<RA8, Premultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = src[3];
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToR8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA8, Unpremultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToRA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB8, NoPremultiplicationOp>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
 }
 
-FORCE_INLINE void packRGBA8ToRA8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB8, Premultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = source[3];
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+}
+
+template<> FORCE_INLINE void
+pack<RGB8, Unpremultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+}
+
+template<> FORCE_INLINE void
+pack<RGBA8, NoPremultiplicationOp>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<RGBA8, Premultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+    dst[3] = src[3];
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRA8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA8, Unpremultiply>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = source[3];
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToRGB8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, NoPremultiplicationOp>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
+    *dst = ( ((src[0] & 0xF0) << 8)
+           | ((src[1] & 0xF0) << 4)
+           | (src[2] & 0xF0)
+           | (src[3] >> 4) );
 }
 
-FORCE_INLINE void packRGBA8ToRGB8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, Premultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF0) << 8)
+           | ((srcG & 0xF0) << 4)
+           | (srcB & 0xF0)
+           | (src[3] >> 4));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRGB8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, Unpremultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF0) << 8)
+           | ((srcG & 0xF0) << 4)
+           | (srcB & 0xF0)
+           | (src[3] >> 4));
 }
 
-// This is only used when the source format is different than kSourceFormatRGBA8.
-FORCE_INLINE void packRGBA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, NoPremultiplicationOp>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = source[3];
+    *dst = ( ((src[0] & 0xF8) << 8)
+           | ((src[1] & 0xF8) << 3)
+           | ((src[2] & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
-FORCE_INLINE void packRGBA8ToRGBA8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, Premultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
-    destination[3] = source[3];
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xF8) << 3)
+           | ((srcB & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRGBA8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, Unpremultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
-    destination[3] = source[3];
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xF8) << 3)
+           | ((srcB & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort4444(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, NoPremultiplicationOp>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    *destination = (((source[0] & 0xF0) << 8)
-                    | ((source[1] & 0xF0) << 4)
-                    | (source[2] & 0xF0)
-                    | (source[3] >> 4));
+    *dst = ( ((src[0] & 0xF8) << 8)
+           | ((src[1] & 0xFC) << 3)
+           | ((src[2] & 0xF8) >> 3));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort4444Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, Premultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF0) << 8)
-                    | ((sourceG & 0xF0) << 4)
-                    | (sourceB & 0xF0)
-                    | (source[3] >> 4));
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xFC) << 3)
+           | ((srcB & 0xF8) >> 3));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort4444Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, Unpremultiply>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF0) << 8)
-                    | ((sourceG & 0xF0) << 4)
-                    | (sourceB & 0xF0)
-                    | (source[3] >> 4));
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xFC) << 3)
+           | ((srcB & 0xF8) >> 3));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort5551(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB32F, NoPremultiplicationOp>(const float* __restrict src, float* __restrict dst)
 {
-    *destination = (((source[0] & 0xF8) << 8)
-                    | ((source[1] & 0xF8) << 3)
-                    | ((source[2] & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort5551Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB32F, Premultiply>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xF8) << 3)
-                    | ((sourceB & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
 }
 
-// FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort5551Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA32F, NoPremultiplicationOp>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xF8) << 3)
-                    | ((sourceB & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort565(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA32F, Premultiply>(const float* __restrict src, float* __restrict dst)
 {
-    *destination = (((source[0] & 0xF8) << 8)
-                    | ((source[1] & 0xFC) << 3)
-                    | ((source[2] & 0xF8) >> 3));
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort565Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A32F, NoPremultiplicationOp>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xFC) << 3)
-                    | ((sourceB & 0xF8) >> 3));
+    dst[0] = src[3];
 }
 
-// FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort565Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A32F, Premultiply>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xFC) << 3)
-                    | ((sourceB & 0xF8) >> 3));
+    dst[0] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToRGB32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<R32F, NoPremultiplicationOp>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
+    dst[0] = src[0];
 }
 
-FORCE_INLINE void packRGBA32FToRGB32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<R32F, Premultiply>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = source[1] * scaleFactor;
-    destination[2] = source[2] * scaleFactor;
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
 }
 
-FORCE_INLINE void packRGBA32FToRGBA32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA32F, NoPremultiplicationOp>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = source[1] * scaleFactor;
-    destination[2] = source[2] * scaleFactor;
-    destination[3] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA32F, Premultiply>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[3];
-}
-
-// identical to above, to avoid special-casing
-FORCE_INLINE void packRGBA32FToA32FPremultiply(const float* __restrict source, float* __restrict destination)
-{
-    destination[0] = source[3];
-}
-
-FORCE_INLINE void packRGBA32FToR32F(const float* __restrict source, float* __restrict destination)
-{
-    destination[0] = source[0];
-}
-
-FORCE_INLINE void packRGBA32FToR32FPremultiply(const float* __restrict source, float* __restrict destination)
-{
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-}
-
-
-FORCE_INLINE void packRGBA32FToRA32F(const float* __restrict source, float* __restrict destination)
-{
-    destination[0] = source[0];
-    destination[1] = source[3];
-}
-
-FORCE_INLINE void packRGBA32FToRA32FPremultiply(const float* __restrict source, float* __restrict destination)
-{
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = scaleFactor;
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = scaleFactor;
 }
 
 /****** END CODE SHARED WITH WEBKIT ******/
 
+template<typename SrcType, typename DstType> FORCE_INLINE void
+convertType(const SrcType* __restrict src, DstType* __restrict dst)
+{
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
+}
+
+template<> FORCE_INLINE void
+convertType<uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+convertType<float, float>(const float* __restrict src, float* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+convertType<uint8_t, float>(const uint8_t* __restrict src, float* __restrict dst)
+{
+    const float scaleFactor = 1.f / 255.0f;
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
+    dst[3] = src[3] * scaleFactor;
+}
+
+#undef FORCE_INLINE
+
 } // end namespace WebGLTexelConversions
 
 } // end namespace mozilla