diff --git a/gfx/ycbcr/yuv_row_linux.cpp b/gfx/ycbcr/yuv_row_linux.cpp --- a/gfx/ycbcr/yuv_row_linux.cpp +++ b/gfx/ycbcr/yuv_row_linux.cpp @@ -250,18 +250,18 @@ MMX_ALIGNED(int16 kCoefficientsRgbY[768] // AMD64 ABI uses register paremters. void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi const uint8* u_buf, // rsi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 asm( - "jmp convertend\n" -"convertloop:" + "jmp 1f\n" +"0:" "movzb (%1),%%r10\n" "add $0x1,%1\n" "movzb (%2),%%r11\n" "add $0x1,%2\n" "movq 2048(%5,%%r10,8),%%xmm0\n" "movzb (%0),%%r10\n" "movq 4096(%5,%%r11,8),%%xmm1\n" "movzb 0x1(%0),%%r11\n" @@ -271,36 +271,36 @@ void FastConvertYUVToRGB32Row(const uint "movq (%5,%%r11,8),%%xmm3\n" "paddsw %%xmm0,%%xmm2\n" "paddsw %%xmm0,%%xmm3\n" "shufps $0x44,%%xmm3,%%xmm2\n" "psraw $0x6,%%xmm2\n" "packuswb %%xmm2,%%xmm2\n" "movq %%xmm2,0x0(%3)\n" "add $0x8,%3\n" -"convertend:" +"1:" "sub $0x2,%4\n" - "jns convertloop\n" + "jns 0b\n" -"convertnext:" +"2:" "add $0x1,%4\n" - "js convertdone\n" + "js 3f\n" "movzb (%1),%%r10\n" "movq 2048(%5,%%r10,8),%%xmm0\n" "movzb (%2),%%r10\n" "movq 4096(%5,%%r10,8),%%xmm1\n" "paddsw %%xmm1,%%xmm0\n" "movzb (%0),%%r10\n" "movq (%5,%%r10,8),%%xmm1\n" "paddsw %%xmm0,%%xmm1\n" "psraw $0x6,%%xmm1\n" "packuswb %%xmm1,%%xmm1\n" "movd %%xmm1,0x0(%3)\n" -"convertdone:" +"3:" : : "r"(y_buf), // %0 "r"(u_buf), // %1 "r"(v_buf), // %2 "r"(rgb_buf), // %3 "r"(width), // %4 "r" (kCoefficientsRgbY) // %5 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" @@ -309,28 +309,35 @@ void FastConvertYUVToRGB32Row(const uint #else void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); + +// It's necessary to specify the correct section for the following code, +// otherwise it will be placed in whatever the current section is as this unit +// is compiled. Because GCC remembers the last section it emitted, we must +// also revert to the previous section state at the end of the asm block. asm( + ".section .text\n" ".global FastConvertYUVToRGB32Row\n" + ".type FastConvertYUVToRGB32Row, @function\n" "FastConvertYUVToRGB32Row:\n" "pusha\n" "mov 0x24(%esp),%edx\n" "mov 0x28(%esp),%edi\n" "mov 0x2c(%esp),%esi\n" "mov 0x30(%esp),%ebp\n" "mov 0x34(%esp),%ecx\n" - "jmp convertend\n" + "jmp 1f\n" -"convertloop:" +"0:" "movzbl (%edi),%eax\n" "add $0x1,%edi\n" "movzbl (%esi),%ebx\n" "add $0x1,%esi\n" "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" "movzbl (%edx),%eax\n" "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" "movzbl 0x1(%edx),%ebx\n" @@ -339,34 +346,35 @@ void FastConvertYUVToRGB32Row(const uint "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" "paddsw %mm0,%mm1\n" "paddsw %mm0,%mm2\n" "psraw $0x6,%mm1\n" "psraw $0x6,%mm2\n" "packuswb %mm2,%mm1\n" "movntq %mm1,0x0(%ebp)\n" "add $0x8,%ebp\n" -"convertend:" +"1:" "sub $0x2,%ecx\n" - "jns convertloop\n" + "jns 0b\n" "and $0x1,%ecx\n" - "je convertdone\n" + "je 2f\n" "movzbl (%edi),%eax\n" "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" "movzbl (%esi),%eax\n" "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" "movzbl (%edx),%eax\n" "movq kCoefficientsRgbY(,%eax,8),%mm1\n" "paddsw %mm0,%mm1\n" "psraw $0x6,%mm1\n" "packuswb %mm1,%mm1\n" "movd %mm1,0x0(%ebp)\n" -"convertdone:" +"2:" "popa\n" "ret\n" + ".previous\n" ); #endif #endif // ARCH_CPU_ARM_FAMILY } // extern "C"