From 100f2a76b4f84b9b6a775aed71349d0fda595a86 Mon Sep 17 00:00:00 2001
From: "reed@reedloden.com" <none@none>
Date: Fri, 21 Dec 2007 02:26:31 -0800
Subject: [PATCH] Bug 406580 - "Faster copying of RGB pixel data" (Where
 Alpha=0xFF, keep RGB values contiguous in pixel copying)
 [p=swsnyder@insightbb.com (Steve Snyder) r+a1.9=stuart]

---
 gfx/thebes/public/gfxColor.h                  | 27 ++++++++++++++++++
 .../libpr0n/decoders/gif/nsGIFDecoder2.cpp    | 28 +++++++++++++++++--
 .../libpr0n/decoders/jpeg/nsJPEGDecoder.cpp   | 21 +++++++++++++-
 modules/libpr0n/decoders/png/nsPNGDecoder.cpp | 21 +++++++++++++-
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/gfx/thebes/public/gfxColor.h b/gfx/thebes/public/gfxColor.h
index 3db701c580d..34c5ae40269 100644
--- a/gfx/thebes/public/gfxColor.h
+++ b/gfx/thebes/public/gfxColor.h
@@ -44,6 +44,33 @@
 
 #include "gfxTypes.h"
 
+/**
+ * Byte swapping: on 32-bit x86 (MSVC or GCC) attempt to use x86's bswap via
+ *  compiler intrinsics; elsewhere use shift/or macros, which evaluate their
+ *  argument more than once.  No 64-bit swap is defined here.
+ */
+#if defined(_WIN32) && (_MSC_VER >= 1300) && defined(_M_IX86)
+#  include <stdlib.h>
+#  pragma intrinsic(_byteswap_ushort,_byteswap_ulong)
+#  define GFX_BYTESWAP16(x) _byteswap_ushort(x)
+#  define GFX_BYTESWAP32(x) _byteswap_ulong(x)
+#  define _GFX_USE_INTRIN_BYTESWAP_
+#elif defined(__GNUC__) && (__GNUC__ >= 2) && defined(__i386__) && !defined(XP_OS2)
+#  include <byteswap.h>
+#  define GFX_BYTESWAP16(x) bswap_16(x)
+#  define GFX_BYTESWAP32(x) bswap_32(x)
+#  define _GFX_USE_INTRIN_BYTESWAP_
+#else
+#  define GFX_BYTESWAP16(x) ( (((x) & 0xff) << 8) | (((x) >> 8) & 0xff) )
+#  define GFX_BYTESWAP32(x) ( (GFX_BYTESWAP16((x) & 0xffff) << 16) | GFX_BYTESWAP16((x) >> 16) )
+#endif
+
+// Build an opaque ARGB pixel from 3 contiguous R,G,B bytes via one 32-bit
+//   load; equivalent to GFX_PACKED_PIXEL(0xff,r,g,b) on little-endian only.
+// NOTE: reads 4 bytes (1 past the pixel), possibly unaligned; pbptr may be evaluated twice.
+#define GFX_0XFF_PPIXEL_FROM_BPTR(pbptr) \
+   ( (GFX_BYTESWAP32(*((PRUint32 *)(pbptr))) >> 8) | (0xFFu << 24) )
+
 /**
  * Fast approximate division by 255. It has the property that
  * for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
diff --git a/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp b/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
index 53b38f94a07..03b50965673 100644
--- a/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
+++ b/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
@@ -671,9 +671,33 @@ static void ConvertColormap(PRUint32 *aColormap, PRUint32 aColors)
   PRUint32 *to = aColormap + aColors;
 
   // Convert color entries to Cairo format
-  for (PRUint32 c = aColors; c > 0; c--) {
+
+  // set up for loops below
+  if (!aColors) return;
+  PRUint32 c = aColors;
+
+  // copy last pixel in buffer (processed first) as bytes to avoid reading past end of buffer
+  *--to = GFX_PACKED_PIXEL(0xFF, from[-3], from[-2], from[-1]);
+  from -= 3; c--;
+
+  // bulk copy of pixels.
+  while (c >= 4) {
+    PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+    from -= 12;
+    to   -=  4;
+    c    -=  4;
+    p0 = GFX_0XFF_PPIXEL_FROM_BPTR(from+9);
+    p1 = GFX_0XFF_PPIXEL_FROM_BPTR(from+6);
+    p2 = GFX_0XFF_PPIXEL_FROM_BPTR(from+3);
+    p3 = GFX_0XFF_PPIXEL_FROM_BPTR(from+0);
+    to[3] = p0; to[2] = p1;
+    to[1] = p2; to[0] = p3;
+  }
+
+  // copy remaining pixel(s)
+  while (c--) {
     from -= 3;
-    *--to = GFX_PACKED_PIXEL(0xFF, from[0], from[1], from[2]);
+    *--to = GFX_0XFF_PPIXEL_FROM_BPTR(from);
   }
 }
 
diff --git a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
index 150da9b2ae2..b1242b42367 100644
--- a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
+++ b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
@@ -714,7 +714,26 @@ nsJPEGDecoder::OutputScanlines()
         }
       }
 
-      for (PRUint32 i=mInfo.output_width; i>0; --i) {
+      // counter for while() loops below
+      PRUint32 idx = mInfo.output_width;
+
+      // bulk copy of pixels.
+      while (idx > 4) {          // >4 to avoid last 3 bytes in buffer
+        PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+        p0 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+0);
+        p1 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+3);
+        p2 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+6);
+        p3 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+9);
+        imageRow[0] = p0; imageRow[1] = p1;
+        imageRow[2] = p2; imageRow[3] = p3;
+        idx       -=  4;
+        sampleRow += 12;
+        imageRow  +=  4;
+      }
+
+      // copy remaining pixel(s)
+      while (idx--) {
+        // 32-bit read of final pixel will exceed buffer, so read bytes
         *imageRow++ = GFX_PACKED_PIXEL(0xFF, sampleRow[0], sampleRow[1], sampleRow[2]);
         sampleRow += 3;
       }
diff --git a/modules/libpr0n/decoders/png/nsPNGDecoder.cpp b/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
index e86d4353b57..5a36575411f 100644
--- a/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
+++ b/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
@@ -732,7 +732,26 @@ row_callback(png_structp png_ptr, png_bytep new_row,
     case gfxIFormats::RGB:
     case gfxIFormats::BGR:
       {
-        for (PRUint32 x=iwidth; x>0; --x) {
+        // counter for while() loops below
+        PRUint32 idx = iwidth;
+
+        // bulk copy of pixels.
+        while (idx > 4) {          // >4 to avoid last 3 bytes in buffer
+          PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+          p0 = GFX_0XFF_PPIXEL_FROM_BPTR(line+0);
+          p1 = GFX_0XFF_PPIXEL_FROM_BPTR(line+3);
+          p2 = GFX_0XFF_PPIXEL_FROM_BPTR(line+6);
+          p3 = GFX_0XFF_PPIXEL_FROM_BPTR(line+9);
+          cptr32[0] = p0; cptr32[1] = p1;
+          cptr32[2] = p2; cptr32[3] = p3;
+          idx    -=  4;
+          line   += 12;
+          cptr32 +=  4;
+        }
+
+        // copy remaining pixel(s)
+        while (idx--) {
+          // 32-bit read of final pixel will exceed buffer, so read bytes
           *cptr32++ = GFX_PACKED_PIXEL(0xFF, line[0], line[1], line[2]);
           line += 3;
         }