mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 771403 - Update libjpeg-turbo to version 1.2.1. r=jlebar
This commit is contained in:
parent
6c77f55217
commit
8e1f1321e9
@ -58,6 +58,10 @@ To upgrade to a new revision of libjpeg-turbo, do the following:
|
||||
|
||||
$ hg addremove
|
||||
|
||||
== July 4, 2012 (libjpeg-turbo v1.2.1 r853 2012-06-30) ==
|
||||
|
||||
* Updated to v1.2.1 stable release.
|
||||
|
||||
== June 5, 2012 (libjpeg-turbo v1.2.x branch, r831 2012-05-30) ==
|
||||
|
||||
* Updated to latest version on v1.2.x branch (bug 759891).
|
||||
|
@ -1,5 +1,5 @@
|
||||
#define VERSION "1.2.0"
|
||||
#define BUILD "2012-02-10"
|
||||
#define VERSION "1.2.1"
|
||||
#define BUILD "2012-06-30"
|
||||
#define PACKAGE_NAME "libjpeg-turbo"
|
||||
|
||||
/* Need to use Mozilla-specific function inlining. */
|
||||
|
@ -1,7 +1,7 @@
|
||||
;
|
||||
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@ -271,26 +267,23 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_MMWORD
|
||||
@ -324,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
mov BYTE [rdi], al
|
||||
%else
|
||||
mov rax,rcx
|
||||
xor rcx, byte 0x0F
|
||||
shl rcx, 2
|
||||
movd xmmB,ecx
|
||||
psrlq xmmH,4
|
||||
pcmpeqb xmmE,xmmE
|
||||
psrlq xmmH,xmmB
|
||||
psrlq xmmE,xmmB
|
||||
punpcklbw xmmE,xmmH
|
||||
; ----------------
|
||||
mov rcx,rdi
|
||||
and rcx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
add rax,rcx
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
|
||||
movdqa xmmG,xmmA
|
||||
movdqa xmmC,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmD,ecx
|
||||
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmF,ecx
|
||||
psllq xmmA,xmmF
|
||||
psllq xmmE,xmmF
|
||||
jmp short .adj0
|
||||
.adj1: neg ecx
|
||||
movd xmmF,ecx
|
||||
psrlq xmmA,xmmF
|
||||
psrlq xmmE,xmmF
|
||||
psllq xmmG,xmmD
|
||||
psllq xmmC,xmmD
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
@ -409,19 +361,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@ -431,25 +378,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_XMMWORD/8
|
||||
@ -464,47 +408,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
movd DWORD [rdi], xmmA
|
||||
%else
|
||||
cmp rcx, byte SIZEOF_XMMWORD/16
|
||||
jb near .nextrow
|
||||
mov rax,rcx
|
||||
xor rcx, byte 0x03
|
||||
inc rcx
|
||||
shl rcx, 4
|
||||
movd xmmF,ecx
|
||||
psrlq xmmE,xmmF
|
||||
punpcklbw xmmE,xmmE
|
||||
; ----------------
|
||||
mov rcx,rdi
|
||||
and rcx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmB,xmmA
|
||||
movdqa xmmG,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmC,ecx
|
||||
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmH,ecx
|
||||
psllq xmmA,xmmH
|
||||
psllq xmmE,xmmH
|
||||
jmp short .adj0
|
||||
.adj1: neg rcx
|
||||
movd xmmH,ecx
|
||||
psrlq xmmA,xmmH
|
||||
psrlq xmmE,xmmH
|
||||
psllq xmmB,xmmC
|
||||
psllq xmmG,xmmC
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
;
|
||||
; jdclrss2.asm - colorspace conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@ -283,26 +279,23 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
alignx 16,7
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
@ -336,47 +329,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
mov BYTE [edi], al
|
||||
%else
|
||||
mov eax,ecx
|
||||
xor ecx, byte 0x0F
|
||||
shl ecx, 2
|
||||
movd xmmB,ecx
|
||||
psrlq xmmH,4
|
||||
pcmpeqb xmmE,xmmE
|
||||
psrlq xmmH,xmmB
|
||||
psrlq xmmE,xmmB
|
||||
punpcklbw xmmE,xmmH
|
||||
; ----------------
|
||||
mov ecx,edi
|
||||
and ecx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
add eax,ecx
|
||||
cmp eax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmG,xmmA
|
||||
movdqa xmmC,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmD,ecx
|
||||
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmF,ecx
|
||||
psllq xmmA,xmmF
|
||||
psllq xmmE,xmmF
|
||||
jmp short .adj0
|
||||
.adj1: neg ecx
|
||||
movd xmmF,ecx
|
||||
psrlq xmmA,xmmF
|
||||
psrlq xmmE,xmmF
|
||||
psllq xmmG,xmmD
|
||||
psllq xmmC,xmmD
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
@ -421,19 +373,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
@ -444,25 +391,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
alignx 16,7
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_XMMWORD/8
|
||||
@ -477,47 +421,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
test ecx, ecx
|
||||
jz short .nextrow
|
||||
movd DWORD [edi], xmmA
|
||||
%else
|
||||
cmp ecx, byte SIZEOF_XMMWORD/16
|
||||
jb short .nextrow
|
||||
mov eax,ecx
|
||||
xor ecx, byte 0x03
|
||||
inc ecx
|
||||
shl ecx, 4
|
||||
movd xmmF,ecx
|
||||
psrlq xmmE,xmmF
|
||||
punpcklbw xmmE,xmmE
|
||||
; ----------------
|
||||
mov ecx,edi
|
||||
and ecx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
|
||||
cmp eax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmB,xmmA
|
||||
movdqa xmmG,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmC,ecx
|
||||
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmH,ecx
|
||||
psllq xmmA,xmmH
|
||||
psllq xmmE,xmmH
|
||||
jmp short .adj0
|
||||
.adj1: neg ecx
|
||||
movd xmmH,ecx
|
||||
psrlq xmmA,xmmH
|
||||
psrlq xmmE,xmmH
|
||||
psllq xmmB,xmmC
|
||||
psllq xmmG,xmmC
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
;
|
||||
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
@ -12,7 +12,7 @@
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ for
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@ -275,26 +271,23 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub rcx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_MMWORD
|
||||
@ -328,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
test rcx, rcx
|
||||
jz short .endcolumn
|
||||
mov BYTE [rdi], al
|
||||
%else
|
||||
mov rax,rcx
|
||||
xor rcx, byte 0x0F
|
||||
shl rcx, 2
|
||||
movd xmmB,ecx
|
||||
psrlq xmmH,4
|
||||
pcmpeqb xmmE,xmmE
|
||||
psrlq xmmH,xmmB
|
||||
psrlq xmmE,xmmB
|
||||
punpcklbw xmmE,xmmH
|
||||
; ----------------
|
||||
mov rcx,rdi
|
||||
and rcx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
add rax,rcx
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmG,xmmA
|
||||
movdqa xmmC,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmD,ecx
|
||||
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmF,ecx
|
||||
psllq xmmA,xmmF
|
||||
psllq xmmE,xmmF
|
||||
jmp short .adj0
|
||||
.adj1: neg rcx
|
||||
movd xmmF,ecx
|
||||
psrlq xmmA,xmmF
|
||||
psrlq xmmE,xmmF
|
||||
psllq xmmG,xmmD
|
||||
psllq xmmC,xmmD
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
@ -413,19 +365,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@ -438,25 +385,22 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp rcx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub rcx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_XMMWORD/8
|
||||
@ -471,47 +415,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
test rcx, rcx
|
||||
jz short .endcolumn
|
||||
movd DWORD [rdi], xmmA
|
||||
%else
|
||||
cmp rcx, byte SIZEOF_XMMWORD/16
|
||||
jb near .endcolumn
|
||||
mov rax,rcx
|
||||
xor rcx, byte 0x03
|
||||
inc rcx
|
||||
shl rcx, 4
|
||||
movd xmmF,ecx
|
||||
psrlq xmmE,xmmF
|
||||
punpcklbw xmmE,xmmE
|
||||
; ----------------
|
||||
mov rcx,rdi
|
||||
and rcx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmB,xmmA
|
||||
movdqa xmmG,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmC,ecx
|
||||
sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmH,ecx
|
||||
psllq xmmA,xmmH
|
||||
psllq xmmE,xmmH
|
||||
jmp short .adj0
|
||||
.adj1: neg rcx
|
||||
movd xmmH,ecx
|
||||
psrlq xmmA,xmmH
|
||||
psrlq xmmE,xmmH
|
||||
psllq xmmB,xmmC
|
||||
psllq xmmG,xmmC
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
;
|
||||
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@ -288,26 +284,23 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
alignx 16,7
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
||||
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp ecx, byte 2*SIZEOF_XMMWORD
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmF
|
||||
sub ecx, byte 2*SIZEOF_XMMWORD
|
||||
jmp short .column_st15
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_MMWORD
|
||||
@ -341,47 +334,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
mov BYTE [edi], al
|
||||
%else
|
||||
mov eax,ecx
|
||||
xor ecx, byte 0x0F
|
||||
shl ecx, 2
|
||||
movd xmmB,ecx
|
||||
psrlq xmmH,4
|
||||
pcmpeqb xmmE,xmmE
|
||||
psrlq xmmH,xmmB
|
||||
psrlq xmmE,xmmB
|
||||
punpcklbw xmmE,xmmH
|
||||
; ----------------
|
||||
mov ecx,edi
|
||||
and ecx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
add eax,ecx
|
||||
cmp eax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmG,xmmA
|
||||
movdqa xmmC,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmD,ecx
|
||||
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmF,ecx
|
||||
psllq xmmA,xmmF
|
||||
psllq xmmE,xmmF
|
||||
jmp short .adj0
|
||||
.adj1: neg ecx
|
||||
movd xmmF,ecx
|
||||
psrlq xmmA,xmmF
|
||||
psrlq xmmE,xmmF
|
||||
psllq xmmG,xmmD
|
||||
psllq xmmC,xmmD
|
||||
por xmmA,xmmG
|
||||
por xmmE,xmmC
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
@ -426,19 +378,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
|
||||
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
|
||||
.out0:
|
||||
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
||||
sub ecx, byte SIZEOF_XMMWORD
|
||||
jz near .endcolumn
|
||||
|
||||
@ -452,80 +399,36 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
alignx 16,7
|
||||
|
||||
.column_st32:
|
||||
pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
||||
cmp ecx, byte SIZEOF_XMMWORD/2
|
||||
jb short .column_st16
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
|
||||
add edi, byte 2*SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmC
|
||||
movdqa xmmD,xmmH
|
||||
sub ecx, byte SIZEOF_XMMWORD/2
|
||||
.column_st16:
|
||||
cmp ecx, byte SIZEOF_XMMWORD/4
|
||||
jb short .column_st15
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD ; outptr
|
||||
movdqa xmmA,xmmD
|
||||
sub ecx, byte SIZEOF_XMMWORD/4
|
||||
.column_st15:
|
||||
%ifdef STRICT_MEMORY_ACCESS
|
||||
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp ecx, byte SIZEOF_XMMWORD/8
|
||||
jb short .column_st7
|
||||
movq MMWORD [edi], xmmA
|
||||
add edi, byte SIZEOF_XMMWORD/2
|
||||
add edi, byte SIZEOF_XMMWORD/8*4
|
||||
sub ecx, byte SIZEOF_XMMWORD/8
|
||||
psrldq xmmA, 64
|
||||
psrldq xmmA, SIZEOF_XMMWORD/8*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of xmmA to the output when it has enough
|
||||
; space.
|
||||
test ecx, ecx
|
||||
jz short .endcolumn
|
||||
movd DWORD [edi], xmmA
|
||||
%else
|
||||
cmp ecx, byte SIZEOF_XMMWORD/16
|
||||
jb short .endcolumn
|
||||
mov eax,ecx
|
||||
xor ecx, byte 0x03
|
||||
inc ecx
|
||||
shl ecx, 4
|
||||
movd xmmF,ecx
|
||||
psrlq xmmE,xmmF
|
||||
punpcklbw xmmE,xmmE
|
||||
; ----------------
|
||||
mov ecx,edi
|
||||
and ecx, byte SIZEOF_XMMWORD-1
|
||||
jz short .adj0
|
||||
lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
|
||||
cmp eax, byte SIZEOF_XMMWORD
|
||||
ja short .adj0
|
||||
and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
||||
shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
||||
movdqa xmmB,xmmA
|
||||
movdqa xmmG,xmmE
|
||||
pslldq xmmA, SIZEOF_XMMWORD/2
|
||||
pslldq xmmE, SIZEOF_XMMWORD/2
|
||||
movd xmmC,ecx
|
||||
sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
||||
jb short .adj1
|
||||
movd xmmH,ecx
|
||||
psllq xmmA,xmmH
|
||||
psllq xmmE,xmmH
|
||||
jmp short .adj0
|
||||
.adj1: neg ecx
|
||||
movd xmmH,ecx
|
||||
psrlq xmmA,xmmH
|
||||
psrlq xmmE,xmmH
|
||||
psllq xmmB,xmmC
|
||||
psllq xmmG,xmmC
|
||||
por xmmA,xmmB
|
||||
por xmmE,xmmG
|
||||
.adj0: ; ----------------
|
||||
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
||||
%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
|
@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
|
||||
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
|
||||
|
||||
/* NEON h2v1 fancy upsampling routine (its body is written in ARM assembly). */
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
|
||||
JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
|
||||
|
||||
/* SIMD Sample Conversion */
|
||||
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
|
||||
JDIMENSION start_col,
|
||||
|
@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_ARM_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
if (simd_support & JSIMD_ARM_NEON)
|
||||
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
|
@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
|
||||
.unreq SHIFT
|
||||
.unreq LOOP_COUNT
|
||||
.endfunc
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* GLOBAL(void)
|
||||
* jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
|
||||
* JDIMENSION downsampled_width,
|
||||
* JSAMPARRAY input_data,
|
||||
* JSAMPARRAY * output_data_ptr);
|
||||
*
|
||||
* Note: the use of unaligned writes is the main remaining bottleneck in
|
||||
* this code; eliminating it could potentially improve performance by up
|
||||
* to tens of percent on Cortex-A8/Cortex-A9.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Upsample 16 source pixels to 32 destination pixels. The new 16 source
|
||||
* pixels are loaded to q0. The previous 16 source pixels are in q1. The
|
||||
* shifted-by-one source pixels are constructed in q2 by using q0 and q1.
|
||||
* Register d28 is used for multiplication by 3. Register q15 is used
|
||||
* for adding +1 bias.
|
||||
*/
|
||||
.macro upsample16 OUTPTR, INPTR
/* Expects q1 = previous 16 source bytes, d28 = 3 in every byte lane and q15 = 1 in every 16-bit lane. */
|
||||
vld1.8 {q0}, [\INPTR]! /* q0 = next 16 source bytes, INPTR advances past them */
|
||||
vmovl.u8 q8, d0 /* q8 = low 8 bytes of q0 widened to 16 bits */
|
||||
vext.8 q2, q1, q0, #15 /* q2 = last byte of q1 followed by the first 15 bytes of q0 */
|
||||
vmovl.u8 q9, d1 /* q9 = high 8 bytes of q0 widened to 16 bits */
|
||||
vaddw.u8 q10, q15, d4 /* q10 = bias (q15) + low half of q2 */
|
||||
vaddw.u8 q11, q15, d5 /* q11 = bias (q15) + high half of q2 */
|
||||
vmlal.u8 q8, d4, d28 /* q8 += d28 * low half of q2 */
|
||||
vmlal.u8 q9, d5, d28 /* q9 += d28 * high half of q2 */
|
||||
vmlal.u8 q10, d0, d28 /* q10 += d28 * low half of q0 */
|
||||
vmlal.u8 q11, d1, d28 /* q11 += d28 * high half of q0 */
|
||||
vmov q1, q0 /* backup source pixels to q1 */
|
||||
vrshrn.u16 d6, q8, #2 /* rounding narrow: d6 = (q8 + 2) >> 2 */
|
||||
vrshrn.u16 d7, q9, #2 /* rounding narrow: d7 = (q9 + 2) >> 2 */
|
||||
vshrn.u16 d8, q10, #2 /* truncating narrow: d8 = q10 >> 2 (the bias was already added) */
|
||||
vshrn.u16 d9, q11, #2 /* truncating narrow: d9 = q11 >> 2 (the bias was already added) */
|
||||
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store 32 bytes, interleaving d6:d7 with d8:d9, OUTPTR advances */
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
|
||||
* macro, the roles of q0 and q1 registers are reversed for even and odd
|
||||
* groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
|
||||
* This unrolling also allows loads and stores to be reordered to compensate
|
||||
* for multiplication latency and reduce stalls.
|
||||
*/
|
||||
.macro upsample32 OUTPTR, INPTR
/* Expects q1 = previous 16 source bytes, d28 = 3 in every byte lane and q15 = 1 in every 16-bit lane. */
|
||||
/* even 16 pixels group */
|
||||
vld1.8 {q0}, [\INPTR]! /* q0 = next 16 source bytes, INPTR advances past them */
|
||||
vmovl.u8 q8, d0 /* q8 = low 8 bytes of q0 widened to 16 bits */
|
||||
vext.8 q2, q1, q0, #15 /* q2 = last byte of q1 followed by the first 15 bytes of q0 */
|
||||
vmovl.u8 q9, d1 /* q9 = high 8 bytes of q0 widened to 16 bits */
|
||||
vaddw.u8 q10, q15, d4 /* q10 = bias (q15) + low half of q2 */
|
||||
vaddw.u8 q11, q15, d5 /* q11 = bias (q15) + high half of q2 */
|
||||
vmlal.u8 q8, d4, d28 /* q8 += d28 * low half of q2 */
|
||||
vmlal.u8 q9, d5, d28 /* q9 += d28 * high half of q2 */
|
||||
vmlal.u8 q10, d0, d28 /* q10 += d28 * low half of q0 */
|
||||
vmlal.u8 q11, d1, d28 /* q11 += d28 * high half of q0 */
|
||||
/* odd 16 pixels group */
|
||||
vld1.8 {q1}, [\INPTR]! /* q1 = following 16 source bytes, issued before the even group is narrowed */
|
||||
vrshrn.u16 d6, q8, #2 /* even group, rounding narrow: d6 = (q8 + 2) >> 2 */
|
||||
vrshrn.u16 d7, q9, #2 /* even group, rounding narrow: d7 = (q9 + 2) >> 2 */
|
||||
vshrn.u16 d8, q10, #2 /* even group, truncating narrow: d8 = q10 >> 2 */
|
||||
vshrn.u16 d9, q11, #2 /* even group, truncating narrow: d9 = q11 >> 2 */
|
||||
vmovl.u8 q8, d2 /* q8 = low 8 bytes of q1 widened to 16 bits */
|
||||
vext.8 q2, q0, q1, #15 /* q2 = last byte of q0 followed by the first 15 bytes of q1 */
|
||||
vmovl.u8 q9, d3 /* q9 = high 8 bytes of q1 widened to 16 bits */
|
||||
vaddw.u8 q10, q15, d4 /* q10 = bias (q15) + low half of q2 */
|
||||
vaddw.u8 q11, q15, d5 /* q11 = bias (q15) + high half of q2 */
|
||||
vmlal.u8 q8, d4, d28 /* q8 += d28 * low half of q2 */
|
||||
vmlal.u8 q9, d5, d28 /* q9 += d28 * high half of q2 */
|
||||
vmlal.u8 q10, d2, d28 /* q10 += d28 * low half of q1 */
|
||||
vmlal.u8 q11, d3, d28 /* q11 += d28 * high half of q1 */
|
||||
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the even group: 32 bytes, d6:d7 interleaved with d8:d9 */
|
||||
vrshrn.u16 d6, q8, #2 /* odd group, rounding narrow: d6 = (q8 + 2) >> 2 */
|
||||
vrshrn.u16 d7, q9, #2 /* odd group, rounding narrow: d7 = (q9 + 2) >> 2 */
|
||||
vshrn.u16 d8, q10, #2 /* odd group, truncating narrow: d8 = q10 >> 2 */
|
||||
vshrn.u16 d9, q11, #2 /* odd group, truncating narrow: d9 = q11 >> 2 */
|
||||
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the odd group, q1 is left holding the most recent 16 source bytes */
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Upsample a row of WIDTH pixels from INPTR to OUTPTR.
|
||||
*/
|
||||
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* Clobbers OUTPTR, INPTR, WIDTH, TMP1, the flags and q0-q2, q3-q6, q8-q11. Expects d28 = 3 per byte lane and q15 = 1 per 16-bit lane. */
|
||||
/* special case for the first and last pixels */
|
||||
sub \WIDTH, \WIDTH, #1 /* WIDTH = index of the last source pixel */
|
||||
add \OUTPTR, \OUTPTR, #1 /* OUTPTR now points at the second output byte */
|
||||
ldrb \TMP1, [\INPTR, \WIDTH] /* TMP1 = last source pixel */
|
||||
strb \TMP1, [\OUTPTR, \WIDTH, asl #1] /* copy it unchanged to OUTPTR + 2 * WIDTH, the last output byte */
|
||||
ldrb \TMP1, [\INPTR], #1 /* TMP1 = first source pixel, INPTR advances by one */
|
||||
strb \TMP1, [\OUTPTR, #-1] /* copy it unchanged to the first output byte */
|
||||
vmov.8 d3[7], \TMP1 /* top byte of q1 = first pixel, it is the byte the next vext shifts in */
|
||||
|
||||
|
||||
subs \WIDTH, \WIDTH, #32 /* are at least 32 pixels left? */
|
||||
blt 5f
|
||||
0: /* process 32 pixels per iteration */
|
||||
upsample32 \OUTPTR, \INPTR
|
||||
subs \WIDTH, \WIDTH, #32
|
||||
bge 0b
|
||||
5:
|
||||
adds \WIDTH, \WIDTH, #16 /* WIDTH = pixels left minus 16 */
|
||||
blt 1f
|
||||
0: /* process 16 pixels if needed */
|
||||
upsample16 \OUTPTR, \INPTR
|
||||
subs \WIDTH, \WIDTH, #16
|
||||
1:
|
||||
adds \WIDTH, \WIDTH, #16 /* WIDTH = number of pixels still left (0-15) */
|
||||
beq 9f
|
||||
|
||||
|
||||
/* load the remaining 1-15 pixels */
|
||||
add \INPTR, \INPTR, \WIDTH /* point just past the remaining pixels, they are gathered back to front */
|
||||
tst \WIDTH, #1
|
||||
beq 2f
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[0]}, [\INPTR] /* bit 0 set: one pixel into lane 0 */
|
||||
2:
|
||||
tst \WIDTH, #2
|
||||
beq 2f
|
||||
vext.8 d0, d0, d0, #6 /* rotate d0 so that old lane 0 moves to lane 2, freeing lanes 0-1 */
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[1]}, [\INPTR]
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[0]}, [\INPTR]
|
||||
2:
|
||||
tst \WIDTH, #4
|
||||
beq 2f
|
||||
vrev64.32 d0, d0 /* swap the 32-bit halves so that old lanes 0-3 move to lanes 4-7 */
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[3]}, [\INPTR]
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[2]}, [\INPTR]
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[1]}, [\INPTR]
|
||||
sub \INPTR, \INPTR, #1
|
||||
vld1.8 {d0[0]}, [\INPTR]
|
||||
2:
|
||||
tst \WIDTH, #8
|
||||
beq 2f
|
||||
vmov d1, d0 /* pixels gathered so far become the high half of q0 */
|
||||
sub \INPTR, \INPTR, #8
|
||||
vld1.8 {d0}, [\INPTR] /* the 8 pixels before them fill the low half */
|
||||
2: /* upsample the remaining pixels */
|
||||
vmovl.u8 q8, d0 /* same arithmetic as upsample16, with the results going to d10-d13 */
|
||||
vext.8 q2, q1, q0, #15
|
||||
vmovl.u8 q9, d1
|
||||
vaddw.u8 q10, q15, d4
|
||||
vaddw.u8 q11, q15, d5
|
||||
vmlal.u8 q8, d4, d28
|
||||
vmlal.u8 q9, d5, d28
|
||||
vmlal.u8 q10, d0, d28
|
||||
vmlal.u8 q11, d1, d28
|
||||
vrshrn.u16 d10, q8, #2
|
||||
vrshrn.u16 d12, q9, #2
|
||||
vshrn.u16 d11, q10, #2
|
||||
vshrn.u16 d13, q11, #2
|
||||
vzip.8 d10, d11 /* interleave in registers: d10:d11 = first 16 output bytes */
|
||||
vzip.8 d12, d13 /* d12:d13 = next 16 output bytes */
|
||||
/* store the remaining pixels */
|
||||
tst \WIDTH, #8
|
||||
beq 2f
|
||||
vst1.8 {d10, d11}, [\OUTPTR]! /* 8 pixels give 16 output bytes */
|
||||
vmov q5, q6 /* bring the next 16 output bytes down into d10:d11 */
|
||||
2:
|
||||
tst \WIDTH, #4
|
||||
beq 2f
|
||||
vst1.8 {d10}, [\OUTPTR]! /* 4 pixels give 8 output bytes */
|
||||
vmov d10, d11
|
||||
2:
|
||||
tst \WIDTH, #2
|
||||
beq 2f
|
||||
vst1.8 {d10[0]}, [\OUTPTR]! /* 2 pixels give 4 output bytes, stored one byte at a time */
|
||||
vst1.8 {d10[1]}, [\OUTPTR]!
|
||||
vst1.8 {d10[2]}, [\OUTPTR]!
|
||||
vst1.8 {d10[3]}, [\OUTPTR]!
|
||||
vext.8 d10, d10, d10, #4 /* rotate so that lanes 4-7 become lanes 0-3 */
|
||||
2:
|
||||
tst \WIDTH, #1
|
||||
beq 2f
|
||||
vst1.8 {d10[0]}, [\OUTPTR]! /* 1 pixel gives 2 output bytes */
|
||||
vst1.8 {d10[1]}, [\OUTPTR]!
|
||||
2:
|
||||
9:
|
||||
.endm
|
||||
|
||||
asm_function jsimd_h2v1_fancy_upsample_neon
/* Runs upsample_row once per row, for max_v_samp_factor rows (nothing is processed when that count is not positive). */
|
||||
|
||||
MAX_V_SAMP_FACTOR .req r0 /* 1st argument, used as the row countdown */
|
||||
DOWNSAMPLED_WIDTH .req r1 /* 2nd argument, source pixels per row */
|
||||
INPUT_DATA .req r2 /* 3rd argument, walks the array of input row pointers */
|
||||
OUTPUT_DATA_PTR .req r3 /* 4th argument, pointer to the output row array pointer */
|
||||
OUTPUT_DATA .req OUTPUT_DATA_PTR /* same register (r3), reused after the dereference below */
|
||||
|
||||
OUTPTR .req r4
|
||||
INPTR .req r5
|
||||
WIDTH .req ip
|
||||
TMP .req lr
|
||||
|
||||
push {r4, r5, r6, lr} /* r4, r5 and lr are used below, r6 is not referenced here (presumably pushed to keep the stack 8-byte aligned, TODO confirm) */
|
||||
vpush {d8-d15} /* the macros used below write d8-d13 */
|
||||
|
||||
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] /* r3 = *output_data_ptr, the array of output row pointers */
|
||||
cmp MAX_V_SAMP_FACTOR, #0
|
||||
ble 99f /* no rows to process */
|
||||
|
||||
/* initialize constants */
|
||||
vmov.u8 d28, #3 /* multiplier 3 in every byte lane */
|
||||
vmov.u16 q15, #1 /* bias 1 in every 16-bit lane */
|
||||
11:
|
||||
ldr INPTR, [INPUT_DATA], #4 /* next input row pointer, step to the following 4-byte entry */
|
||||
ldr OUTPTR, [OUTPUT_DATA], #4 /* next output row pointer, step to the following 4-byte entry */
|
||||
mov WIDTH, DOWNSAMPLED_WIDTH /* upsample_row modifies WIDTH, so reload it for every row */
|
||||
upsample_row OUTPTR, INPTR, WIDTH, TMP
|
||||
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
|
||||
bgt 11b
|
||||
|
||||
99:
|
||||
vpop {d8-d15}
|
||||
pop {r4, r5, r6, pc} /* restore registers and return by loading the saved lr into pc */
|
||||
|
||||
.unreq MAX_V_SAMP_FACTOR
|
||||
.unreq DOWNSAMPLED_WIDTH
|
||||
.unreq INPUT_DATA
|
||||
.unreq OUTPUT_DATA_PTR
|
||||
.unreq OUTPUT_DATA
|
||||
|
||||
.unreq OUTPTR
|
||||
.unreq INPTR
|
||||
.unreq WIDTH
|
||||
.unreq TMP
|
||||
|
||||
.endfunc
|
||||
|
||||
.purgem upsample16
|
||||
.purgem upsample32
|
||||
.purgem upsample_row
|
||||
|
@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
|
||||
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
|
||||
%endif
|
||||
|
||||
%define STRICT_MEMORY_ACCESS 1
|
||||
|
||||
; To make the code position-independent, append -DPIC to the commandline
|
||||
;
|
||||
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
|
||||
|
Loading…
Reference in New Issue
Block a user