From 100f2a76b4f84b9b6a775aed71349d0fda595a86 Mon Sep 17 00:00:00 2001
From: "reed@reedloden.com"
Date: Fri, 21 Dec 2007 02:26:31 -0800
Subject: [PATCH] Bug 406580 - "Faster copying of RGB pixel data" (Where Alpha=0xFF, keep RGB values contiguous in pixel copying) [p=swsnyder@insightbb.com (Steve Snyder) r+a1.9=stuart]

---
 gfx/thebes/public/gfxColor.h                  | 27 ++++++++++++++++++
 .../libpr0n/decoders/gif/nsGIFDecoder2.cpp    | 28 +++++++++++++++++--
 .../libpr0n/decoders/jpeg/nsJPEGDecoder.cpp   | 21 +++++++++++++-
 modules/libpr0n/decoders/png/nsPNGDecoder.cpp | 21 +++++++++++++-
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/gfx/thebes/public/gfxColor.h b/gfx/thebes/public/gfxColor.h
index 3db701c580d..34c5ae40269 100644
--- a/gfx/thebes/public/gfxColor.h
+++ b/gfx/thebes/public/gfxColor.h
@@ -44,6 +44,33 @@
 
 #include "gfxTypes.h"
 
+/**
+ * Attempt to use x86's bswap instruction for byte-swapping, via compiler
+ * intrinsic functions, in preference to a sequence of shift/or operations.
+ * 64-bit swapping also supported but not used here.
+ */
+#if defined(_WIN32) && (_MSC_VER >= 1300) && defined(_M_IX86)
+#  include <stdlib.h>
+#  pragma intrinsic(_byteswap_ushort,_byteswap_ulong)
+#  define GFX_BYTESWAP16(x) _byteswap_ushort(x)
+#  define GFX_BYTESWAP32(x) _byteswap_ulong(x)
+#  define _GFX_USE_INTRIN_BYTESWAP_
+#elif defined(__GNUC__) && (__GNUC__ >= 2) && defined(__i386__) && !defined(XP_OS2)
+#  include <byteswap.h>
+#  define GFX_BYTESWAP16(x) bswap_16(x)
+#  define GFX_BYTESWAP32(x) bswap_32(x)
+#  define _GFX_USE_INTRIN_BYTESWAP_
+#else
+#  define GFX_BYTESWAP16(x) ( (((x) & 0xff) << 8) | (((x) >> 8) & 0xff) )
+#  define GFX_BYTESWAP32(x) ( (GFX_BYTESWAP16((x) & 0xffff) << 16) | GFX_BYTESWAP16((x) >> 16) )
+#endif
+
+// Build an opaque (A=0xFF) 32-bit ARGB pixel from 3 contiguous R,G,B bytes
+// with one 32-bit load; the load may be unaligned and reads 1 byte past the
+// pixel. Matches GFX_PACKED_PIXEL(0xff,r,g,b) only for little-endian loads.
+#define GFX_0XFF_PPIXEL_FROM_BPTR(pbptr) \
+    ( (GFX_BYTESWAP32(*((PRUint32 *)(pbptr))) >> 8) | (0xFFu << 24) )
+
 /**
  * Fast approximate division by 255. It has the property that
  * for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
diff --git a/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp b/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
index 53b38f94a07..03b50965673 100644
--- a/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
+++ b/modules/libpr0n/decoders/gif/nsGIFDecoder2.cpp
@@ -671,9 +671,33 @@ static void ConvertColormap(PRUint32 *aColormap, PRUint32 aColors)
   PRUint32 *to = aColormap + aColors;
 
   // Convert color entries to Cairo format
-  for (PRUint32 c = aColors; c > 0; c--) {
+
+  // set up for loops below
+  if (!aColors) return;
+  PRUint32 c = aColors;
+
+  // copy 1st pixel as bytes to avoid reading past end of buffer
+  *--to = GFX_PACKED_PIXEL(0xFF, from[-3], from[-2], from[-1]);
+  from -= 3; c--;
+
+  // bulk copy of pixels.
+  while (c >= 4) {
+    PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+    from -= 12;
+    to   -=  4;
+    c    -=  4;
+    p0 = GFX_0XFF_PPIXEL_FROM_BPTR(from+9);
+    p1 = GFX_0XFF_PPIXEL_FROM_BPTR(from+6);
+    p2 = GFX_0XFF_PPIXEL_FROM_BPTR(from+3);
+    p3 = GFX_0XFF_PPIXEL_FROM_BPTR(from+0);
+    to[3] = p0; to[2] = p1;
+    to[1] = p2; to[0] = p3;
+  }
+
+  // copy remaining pixel(s)
+  while (c--) {
     from -= 3;
-    *--to = GFX_PACKED_PIXEL(0xFF, from[0], from[1], from[2]);
+    *--to = GFX_0XFF_PPIXEL_FROM_BPTR(from);
   }
 }
 
diff --git a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
index 150da9b2ae2..b1242b42367 100644
--- a/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
+++ b/modules/libpr0n/decoders/jpeg/nsJPEGDecoder.cpp
@@ -714,7 +714,26 @@ nsJPEGDecoder::OutputScanlines()
         }
       }
 
-      for (PRUint32 i=mInfo.output_width; i>0; --i) {
+      // counter for while() loops below
+      PRUint32 idx = mInfo.output_width;
+
+      // bulk copy of pixels.
+      while (idx > 4) {          // >4 to avoid last 3 bytes in buffer
+        PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+        p0 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+0);
+        p1 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+3);
+        p2 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+6);
+        p3 = GFX_0XFF_PPIXEL_FROM_BPTR(sampleRow+9);
+        imageRow[0] = p0; imageRow[1] = p1;
+        imageRow[2] = p2; imageRow[3] = p3;
+        idx       -=  4;
+        sampleRow += 12;
+        imageRow  +=  4;
+      }
+
+      // copy remaining pixel(s)
+      while (idx--) {
+        // 32-bit read of final pixel will exceed buffer, so read bytes
         *imageRow++ = GFX_PACKED_PIXEL(0xFF, sampleRow[0], sampleRow[1], sampleRow[2]);
         sampleRow += 3;
       }
diff --git a/modules/libpr0n/decoders/png/nsPNGDecoder.cpp b/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
index e86d4353b57..5a36575411f 100644
--- a/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
+++ b/modules/libpr0n/decoders/png/nsPNGDecoder.cpp
@@ -732,7 +732,26 @@ row_callback(png_structp png_ptr, png_bytep new_row,
     case gfxIFormats::RGB:
     case gfxIFormats::BGR:
       {
-        for (PRUint32 x=iwidth; x>0; --x) {
+        // counter for while() loops below
+        PRUint32 idx = iwidth;
+
+        // bulk copy of pixels.
+        while (idx > 4) {          // >4 to avoid last 3 bytes in buffer
+          PRUint32 p0, p1, p2, p3; // to avoid back-to-back register stalls
+          p0 = GFX_0XFF_PPIXEL_FROM_BPTR(line+0);
+          p1 = GFX_0XFF_PPIXEL_FROM_BPTR(line+3);
+          p2 = GFX_0XFF_PPIXEL_FROM_BPTR(line+6);
+          p3 = GFX_0XFF_PPIXEL_FROM_BPTR(line+9);
+          cptr32[0] = p0; cptr32[1] = p1;
+          cptr32[2] = p2; cptr32[3] = p3;
+          idx    -=  4;
+          line   += 12;
+          cptr32 +=  4;
+        }
+
+        // copy remaining pixel(s)
+        while (idx--) {
+          // 32-bit read of final pixel will exceed buffer, so read bytes
           *cptr32++ = GFX_PACKED_PIXEL(0xFF, line[0], line[1], line[2]);
           line += 3;
         }