Bug 771403 - Update libjpeg-turbo to version 1.2.1. r=jlebar

This commit is contained in:
Ryan VanderMeulen 2012-07-07 10:21:32 -04:00
parent 6c77f55217
commit 8e1f1321e9
10 changed files with 335 additions and 467 deletions

View File

@ -58,6 +58,10 @@ To upgrade to a new revision of libjpeg-turbo, do the following:
$ hg addremove
== July 4, 2012 (libjpeg-turbo v1.2.1 r853 2012-06-30) ==
* Updated to v1.2.1 stable release.
== June 5, 2012 (libjpeg-turbo v1.2.x branch, r831 2012-05-30) ==
* Updated to latest version on v1.2.x branch (bug 759891).

View File

@ -1,5 +1,5 @@
#define VERSION "1.2.0"
#define BUILD "2012-02-10"
#define VERSION "1.2.1"
#define BUILD "2012-06-30"
#define PACKAGE_NAME "libjpeg-turbo"
/* Need to use Mozilla-specific function inlining. */

View File

@ -1,7 +1,7 @@
;
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@ -271,26 +267,23 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@ -324,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
add rax,rcx
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@ -409,19 +361,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
@ -431,25 +378,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@ -464,47 +408,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test rcx, rcx
jz short .nextrow
movd DWORD [rdi], xmmA
%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .nextrow
mov rax,rcx
xor rcx, byte 0x03
inc rcx
shl rcx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg rcx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@ -1,7 +1,7 @@
;
; jdclrss2.asm - colorspace conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@ -283,26 +279,23 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@ -336,47 +329,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
add eax,ecx
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@ -421,19 +373,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
@ -444,25 +391,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
@ -477,47 +421,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
test ecx, ecx
jz short .nextrow
movd DWORD [edi], xmmA
%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .nextrow
mov eax,ecx
xor ecx, byte 0x03
inc ecx
shl ecx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg ecx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@ -1,7 +1,7 @@
;
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
@ -12,7 +12,7 @@
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ for
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@ -275,26 +271,23 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
@ -328,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
add rax,rcx
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg rcx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@ -413,19 +365,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
@ -438,25 +385,22 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
jmp near .columnloop
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
@ -471,47 +415,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test rcx, rcx
jz short .endcolumn
movd DWORD [rdi], xmmA
%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .endcolumn
mov rax,rcx
xor rcx, byte 0x03
inc rcx
shl rcx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov rcx,rdi
and rcx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
cmp rax, byte SIZEOF_XMMWORD
ja short .adj0
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg rcx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@ -1,7 +1,7 @@
;
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@ -288,26 +284,23 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
@ -341,47 +334,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
movd xmmB,ecx
psrlq xmmH,4
pcmpeqb xmmE,xmmE
psrlq xmmH,xmmB
psrlq xmmE,xmmB
punpcklbw xmmE,xmmH
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
add eax,ecx
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmG,xmmA
movdqa xmmC,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmD,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmF,ecx
psllq xmmA,xmmF
psllq xmmE,xmmF
jmp short .adj0
.adj1: neg ecx
movd xmmF,ecx
psrlq xmmA,xmmF
psrlq xmmE,xmmF
psllq xmmG,xmmD
psllq xmmC,xmmD
por xmmA,xmmG
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
@ -426,19 +378,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
@ -452,80 +399,36 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
alignx 16,7
.column_st32:
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
add edi, byte SIZEOF_XMMWORD ; outptr
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/2
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, 64
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd DWORD [edi], xmmA
%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .endcolumn
mov eax,ecx
xor ecx, byte 0x03
inc ecx
shl ecx, 4
movd xmmF,ecx
psrlq xmmE,xmmF
punpcklbw xmmE,xmmE
; ----------------
mov ecx,edi
and ecx, byte SIZEOF_XMMWORD-1
jz short .adj0
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
cmp eax, byte SIZEOF_XMMWORD
ja short .adj0
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
movdqa xmmB,xmmA
movdqa xmmG,xmmE
pslldq xmmA, SIZEOF_XMMWORD/2
pslldq xmmE, SIZEOF_XMMWORD/2
movd xmmC,ecx
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
jb short .adj1
movd xmmH,ecx
psllq xmmA,xmmH
psllq xmmE,xmmH
jmp short .adj0
.adj1: neg ecx
movd xmmH,ecx
psrlq xmmA,xmmH
psrlq xmmE,xmmH
psllq xmmB,xmmC
psllq xmmG,xmmC
por xmmA,xmmB
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------

View File

@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,

View File

@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_ARM_NEON)
return 1;
return 0;
}
@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
if (simd_support & JSIMD_ARM_NEON)
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data, output_data_ptr);
}
GLOBAL(int)

View File

@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
/*****************************************************************************/
/*
* GLOBAL(void)
* jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
* JDIMENSION downsampled_width,
* JSAMPARRAY input_data,
* JSAMPARRAY * output_data_ptr);
*
* Note: the use of unaligned writes is the main remaining bottleneck in
* this code, which can be potentially solved to get up to tens
* of percents performance improvement on Cortex-A8/Cortex-A9.
*/
/*
* Upsample 16 source pixels to 32 destination pixels. The new 16 source
* pixels are loaded to q0. The previous 16 source pixels are in q1. The
* shifted-by-one source pixels are constructed in q2 by using q0 and q1.
* Register d28 is used for multiplication by 3. Register q15 is used
* for adding +1 bias.
*/
.macro upsample16 OUTPTR, INPTR
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vmov q1, q0 /* backup source pixels to q1 */
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16'
* macro, the roles of q0 and q1 registers are reversed for even and odd
* groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
* Also this unrolling allows to reorder loads and stores to compensate
* multiplication latency and reduce stalls.
*/
.macro upsample32 OUTPTR, INPTR
/* even 16 pixels group */
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
/* odd 16 pixels group */
vld1.8 {q1}, [\INPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vmovl.u8 q8, d2
vext.8 q2, q0, q1, #15
vmovl.u8 q9, d3
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d2, d28
vmlal.u8 q11, d3, d28
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
* Upsample a row of WIDTH pixels from INPTR to OUTPTR.
*/
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* special case for the first and last pixels */
sub \WIDTH, \WIDTH, #1
add \OUTPTR, \OUTPTR, #1
ldrb \TMP1, [\INPTR, \WIDTH]
strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
ldrb \TMP1, [\INPTR], #1
strb \TMP1, [\OUTPTR, #-1]
vmov.8 d3[7], \TMP1
subs \WIDTH, \WIDTH, #32
blt 5f
0: /* process 32 pixels per iteration */
upsample32 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #32
bge 0b
5:
adds \WIDTH, \WIDTH, #16
blt 1f
0: /* process 16 pixels if needed */
upsample16 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #16
1:
adds \WIDTH, \WIDTH, #16
beq 9f
/* load the remaining 1-15 pixels */
add \INPTR, \INPTR, \WIDTH
tst \WIDTH, #1
beq 2f
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #2
beq 2f
vext.8 d0, d0, d0, #6
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #4
beq 2f
vrev64.32 d0, d0
sub \INPTR, \INPTR, #1
vld1.8 {d0[3]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[2]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #8
beq 2f
vmov d1, d0
sub \INPTR, \INPTR, #8
vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vrshrn.u16 d10, q8, #2
vrshrn.u16 d12, q9, #2
vshrn.u16 d11, q10, #2
vshrn.u16 d13, q11, #2
vzip.8 d10, d11
vzip.8 d12, d13
/* store the remaining pixels */
tst \WIDTH, #8
beq 2f
vst1.8 {d10, d11}, [\OUTPTR]!
vmov q5, q6
2:
tst \WIDTH, #4
beq 2f
vst1.8 {d10}, [\OUTPTR]!
vmov d10, d11
2:
tst \WIDTH, #2
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
vst1.8 {d10[2]}, [\OUTPTR]!
vst1.8 {d10[3]}, [\OUTPTR]!
vext.8 d10, d10, d10, #4
2:
tst \WIDTH, #1
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
asm_function jsimd_h2v1_fancy_upsample_neon
MAX_V_SAMP_FACTOR .req r0
DOWNSAMPLED_WIDTH .req r1
INPUT_DATA .req r2
OUTPUT_DATA_PTR .req r3
OUTPUT_DATA .req OUTPUT_DATA_PTR
OUTPTR .req r4
INPTR .req r5
WIDTH .req ip
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
ble 99f
/* initialize constants */
vmov.u8 d28, #3
vmov.u16 q15, #1
11:
ldr INPTR, [INPUT_DATA], #4
ldr OUTPTR, [OUTPUT_DATA], #4
mov WIDTH, DOWNSAMPLED_WIDTH
upsample_row OUTPTR, INPTR, WIDTH, TMP
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
bgt 11b
99:
vpop {d8-d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR
.unreq DOWNSAMPLED_WIDTH
.unreq INPUT_DATA
.unreq OUTPUT_DATA_PTR
.unreq OUTPUT_DATA
.unreq OUTPTR
.unreq INPTR
.unreq WIDTH
.unreq TMP
.endfunc
.purgem upsample16
.purgem upsample32
.purgem upsample_row

View File

@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
%define STRICT_MEMORY_ACCESS 1
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC