gecko/media/libvpx/vp8/common/x86/postproc_sse2.asm

;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license 
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may 
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Thresholded 5-tap smoothing, done per row in two passes of 8 pixels each:
;   down pass  : reads src rows r-2..r+2, writes the result to the dst row
;   across pass: reads dst columns x-2..x+2 of that row, filters it in place
;
; For a centre pixel p0 and its four neighbours the filtered value is
;       (4*p0 + p-2 + p-1 + p+1 + p+2 + 4) >> 3
; but p0 is kept unchanged if |p0 - neighbour| > flimit for ANY neighbour.
; Only the low 16 bits of flimit are used (broadcast to eight words).
;
; Register roles inside the loops:
;   rsi = current src row          rdi = current dst row
;   rax = src pitch (negated temporarily inside the down pass)
;   rcx = rows remaining           rdx = column offset within the row
;   xmm0 = zero                    xmm2 = flimit in each of 8 words
;   mm0  = across-pass output waiting to be stored (see below)
;
; Memory touched beyond rows x cols:
;   - src is read two rows above and two rows below every processed row
;   - dst is read from column x-2 to x+9 for each 8-pixel group starting at x
;   - the 8 bytes in front of each dst row are read and written back as read
;
; Both loops are bottom-tested, so at least one row and one 8-pixel group are
; always processed; rows and cols are expected to be positive.
;
; NOTE: mm0 is used and no emms is executed in this function.
global sym(vp8_post_proc_down_and_across_xmm)
sym(vp8_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [rd42 GLOBAL]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [rd42 GLOBAL]
%endif


        ; broadcast the low word of flimit into all eight words of xmm2
        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
        pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words

nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
nextcol:
        ; ---- down pass: 8 pixels of row r, neighbours r-2..r+2 ----
        movq        xmm3,       QWORD PTR [rsi]         ; r0 p0..p7 (bytes)
        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 as words
        movdqa      xmm1,       xmm3                    ; xmm1 = r0 (kept as the unfiltered source)
        psllw       xmm3,       2                       ; xmm3 = 4 * r0 (running sum)

        movq        xmm5,       QWORD PTR [rsi + rax]   ; r+1 p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r+1 as words
        paddusw     xmm3,       xmm5                    ; sum += r+1

        ; thresholding
        movdqa      xmm7,       xmm1                    ; xmm7 = r0
        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r+1)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r+1 - r0)
        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r+1)
        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask: abs > flimit

        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; r+2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r+2 as words
        paddusw     xmm3,       xmm5                    ; sum += r+2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r+2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r+2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r+2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        neg         rax                                 ; rax = -pitch to reach the rows above
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; r-2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 as words
        paddusw     xmm3,       xmm5                    ; sum += r-2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds

        movq        xmm4,       QWORD PTR [rsi+rax]     ; r-1 p0..p7
        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 as words
        paddusw     xmm3,       xmm4                    ; sum += r-1

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        paddusw     xmm3,       RD42                    ; sum += rounding value (4)
        psraw       xmm3,       3                       ; sum /= 8

        pand        xmm1,       xmm7                    ; keep source where any diff > flimit
        pandn       xmm7,       xmm3                    ; take blurred result everywhere else
        paddusw     xmm1,       xmm7                    ; combination (the two are disjoint)

        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1             ; store 8 pixels to dst

        neg         rax                   ; pitch is positive again
        add         rsi,        8
        add         rdi,        8

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols

        jl          nextcol

        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx       ; rewind both pointers to the row start
        sub         rdi,        rdx

        xor         rdx,        rdx
        ; prime the delayed store with the 8 bytes in front of the row so the
        ; first store below writes them back unchanged
        movq        mm0,        QWORD PTR [rdi-8];

acrossnextcol:
        ; ---- across pass: gather dst bytes x-2..x+9 into xmm4 ----
        movq        xmm7,       QWORD PTR [rdi +rdx -2] ; bytes x-2..x+5
        movd        xmm4,       DWORD PTR [rdi +rdx +6] ; bytes x+6..x+9

        pslldq      xmm4,       8
        por         xmm4,       xmm7                    ; xmm4 = p-2..p9

        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 as words
        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7 (unfiltered source)
        psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7 (running sum)


        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
        paddusw     xmm3,       xmm5              ; sum += p1..p8

        ; thresholding
        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0 - p1)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1 - p0)
        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0 - p1)
        pcmpgtw     xmm7,       xmm2              ; xmm7 = mask: abs > flimit

        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
        paddusw     xmm3,       xmm5              ; sum += p2..p9

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0 - p2)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2 - p0)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0 - p2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds


        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 as words
        paddusw     xmm3,       xmm5              ; sum += p-2..p5

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0 - p-2)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2 - p0)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0 - p-2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 as words
        paddusw     xmm3,       xmm4              ; sum += p-1..p6

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0 - p-1)
        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1 - p0)
        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0 - p-1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        paddusw     xmm3,       RD42              ; sum += rounding value (4)
        psraw       xmm3,       3                 ; sum /= 8

        pand        xmm1,       xmm7              ; keep source where any diff > flimit
        pandn       xmm7,       xmm3              ; take blurred result everywhere else
        paddusw     xmm1,       xmm7              ; combination (the two are disjoint)

        packuswb    xmm1,       xmm0              ; pack to bytes
        ; The store is delayed by one group: the next group still needs the
        ; unfiltered p-2/p-1 of this one, so only the previous result is written.
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
        movdq2q     mm0,        xmm1              ; hold this group's result

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          acrossnextcol;

        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0

        ; done with this row
        add         rsi,rax               ; next source line
        ; Reload the pitches sign-extended, matching the initial load above.
        ; A plain 32-bit mov would zero-extend into rax on 64-bit targets and
        ; turn a negative pitch into a huge positive offset.
        movsxd      rax, DWORD PTR arg(3) ;dst_pixels_per_line (destination pitch)
        add         rdi,rax               ; next destination line
        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)

        dec         rcx                   ; decrement count
        jnz         nextrow               ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add rsp,16
    pop rsp
%endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42


;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Vertical variance-gated smoothing, processed in strips of 8 columns.
;
; For each strip a running sum (xmm5, 8 words) and running sum of squares
; (xmm6/xmm7, 2 x 4 dwords) are kept over a 15-row window.  For the centre
; row r of the window, each pixel is replaced by
;       (pixel + sum + vp8_rv[r & 127 ...]) >> 4
; when  15*sumsq - sum*sum < flimit, and left unchanged otherwise.
;
; Results are not written straight back: they go into a 16-entry ring of
; 8-byte slots on the stack (d[16][8]) and the slot produced 8 iterations
; earlier is written to the row 8 above the current centre row, so the
; window always reads unfiltered pixels.  The row loop therefore runs
; rows+8 times.
;
; Memory touched beyond rows x cols: rows dst-8*pitch .. dst-1*pitch are read
; and written (the first 8 writes come from ring slots not yet filled in), and
; rows below the picture are read by the sliding window.
;
; The argument shadow slots arg(0), arg(2) and arg(3) are modified in place.
; NOTE: mm0 is used and no emms is executed in this function.
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm)
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit in each of 4 dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [sym(vp8_rv) GLOBAL]  ; r8 = &vp8_rv, kept for the whole function
%endif

    ;rows +=8;  (extra iterations flush the 8-row delayed write-back)
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ; xmm0 = 0, used to widen values

            movsxd      rax,        dword ptr arg(1) ;pitch       ;
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
            neg         rax                                     ; rax = +pitch


            pxor        xmm5,       xmm5        ; sum   (8 words)
            pxor        xmm6,       xmm6        ; sumsq (columns 0..3, dwords)

            pxor        xmm7,       xmm7        ; sumsq (columns 4..7, dwords)
            mov         rdi,        rsi

            mov         rcx,        15          ; rows s[-8*pitch] .. s[6*pitch]

loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ; 8 pixels as words

            paddw       xmm5,       xmm1        ; sum += pixel
            pmullw      xmm1,       xmm1        ; pixel^2 (fits in 16 bits)

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ; low 4 squares as dwords

            punpckhwd   xmm2,       xmm0        ; high 4 squares as dwords
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ; next row

            dec         rcx
            jne         loop_initvar
            ; here rsi = &s[-8*pitch] (row leaving the window next)
            ;      rdi = &s[ 7*pitch] (row entering the window next)
            xor         rdx,        rdx         ; rdx = row counter
loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]  leaving row
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]  entering row

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2        ; sum += entering
            psubw       xmm5,       xmm1        ; sum -= leaving

            pmullw      xmm2,       xmm2        ; entering^2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2        ; sumsq += entering^2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1        ; leaving^2
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1        ; sumsq -= leaving^2

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6        ; xmm3 = 15 * sumsq (columns 0..3)
            movdqa      xmm1,       xmm5

            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1        ; low  16 bits of sum*sum

            pmulhw      xmm4,       xmm4        ; high 16 bits of sum*sum
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4        ; sum*sum as dwords, columns 0..3
            punpckhwd   xmm2,       xmm4        ; sum*sum as dwords, columns 4..7

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7        ; xmm4 = 15 * sumsq (columns 4..7)

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4     ; 15*sumsq - sum*sum - flimit
            psubd       xmm4,       flimit4

            psrad       xmm3,       31          ; all ones where the value is negative
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0        ; xmm3 = 8-byte mask: 0xff = filter this pixel

            movq        xmm1,       QWORD PTR [rsi+rax*8] ; centre row of the window

            movq        xmm2,       xmm1        ; keep the unfiltered bytes
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5        ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127         ; vp8_rv index wraps at 128
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [sym(vp8_rv) GLOBAL]
            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4        ; + rounding words from vp8_rv
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3        ; filtered value where mask is set

            pandn       xmm3,       xmm2        ; original value elsewhere
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]  (ring slot for this row)

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]  (result from 8 rows ago)

            movq        [rsi],      mm0         ; rsi is the row 8 above the centre row
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          loop_row

        ; s += 8.  Done at full register width: arg(0) holds a pointer, and a
        ; dword add would lose the carry out of bit 31 on 64-bit targets.
        ; rax is free here; loop_col reloads it from arg(1).
        mov         rax,        8
        add         arg(0),     rax
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal, in-place counterpart of the variance-gated smoothing: for every
; row a running sum and sum of squares over the 15 pixels s[x-7..x+7] are
; maintained, four columns per iteration.  A pixel is replaced by
;       (pixel + sum + 8) >> 4
; when  15*sumsq - sum*sum < flimit, and left unchanged otherwise.
;
; Write-back is delayed by 8 columns through mm1 -> mm0 so that the window
; always reads unfiltered pixels; the column loop runs until cols+8 to flush
; the last two groups.  mm0/mm1 start as zero, so the first two stores write
; zero bytes to s[-8..-1].
;
; Memory touched beyond cols: s[-8..-1] is read and then overwritten, and
; bytes past the end of the row are read by the window.
;
; The argument shadow slots arg(0) and arg(2) are modified in place.
; NOTE: mm0/mm1 are used and no emms is executed in this function.
global sym(vp8_mbpost_proc_across_ip_xmm)
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 (flimit in each of 4 dwords) at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s
        mov         rdi,    -8
ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = s[i]*s[i]; upper half of eax is already 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx     ; xmm7 lane 0 = sumsq, other lanes 0

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx     ; xmm6 lane 0 = sum, other lanes 0

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx     ; rcx = column offset c

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; 8 extra columns to flush the delayed stores
            pxor        mm0,    mm0     ; delayed-store pipeline starts out as zeros
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0    ; xmm0 = 0, used to widen values
nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5   (pixels leaving the window)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10  (pixels entering the window)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5  (entering - leaving)
            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5  (entering + leaving)
            pmaddwd     xmm1,   xmm2                    ; (entering+leaving)*(entering-leaving) = entering^2 - leaving^2

            paddd       xmm6,   xmm2                    ; lane 0: sum   for column c
            paddd       xmm7,   xmm1                    ; lane 0: sumsq for column c

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all four lanes

            ; Turn the per-column deltas into prefix sums so lane k holds the
            ; window totals for column c+k.
            psrldq      xmm1,       4                   ; sumsq deltas for columns c+1, c+2, c+3, then 0
            psrldq      xmm2,       4                   ; sum   deltas for columns c+1, c+2, c+3, then 0

            pshufd      xmm3,   xmm1,   3               ; lanes 0..3 = 0, d1, d1, d1  (sumsq delta of c+1)
            pshufd      xmm4,   xmm2,   3               ; lanes 0..3 = 0, d1, d1, d1  (sum   delta of c+1)

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; lanes 0..3 = 0, 0, d2, d2  (sumsq delta of c+2)
            pshufd      xmm4,   xmm2,   01011111b       ; lanes 0..3 = 0, 0, d2, d2  (sum   delta of c+2)

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; lanes 0..3 = 0, 0, 0, d3  (sumsq delta of c+3)
            pshufd      xmm4,   xmm2,   10111111b       ; lanes 0..3 = 0, 0, 0, d3  (sum   delta of c+3)

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum per lane

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; 15 * sumsq
            psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where the value is negative

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0                    ; low 4 bytes = mask: 0xff = filter this pixel

            movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four centre pixels c..c+3
            movq        xmm2,   xmm1                    ; keep the unfiltered bytes

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6                    ; pixel + sum
            paddd       xmm1,   [four8s GLOBAL]         ; + 8 for rounding

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered value where mask is set

            pandn       xmm5,   xmm2                    ; original value elsewhere
            por         xmm5,   xmm1                    ; xmm5 low 4 bytes = output for c..c+3

            ; two-stage delay: the value stored here was computed two
            ; iterations (8 columns) ago
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; carry lane 3 (column c+3) into lane 0, clear the rest

            psrldq      xmm6,   12                      ; same for the sum
            add         rcx,    4

            cmp         rcx,    rdx
            jl          nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to a plane, one row at a time.  For every row a fresh offset
; (rand() & 0xff) into the noise buffer is chosen; the row is then processed
; in 16-byte groups: clamp the pixels with the three clamp vectors, add the
; noise bytes (wrapping add), store back in place.
;
; Only blackclamp (arg 2) is loaded; whiteclamp and bothclamp are addressed
; as blackclamp+16 and blackclamp+32, and all three are used as aligned
; 16-byte memory operands.
;
; The argument shadow slots arg(0) and arg(6) are modified in place.
extern sym(rand)
global sym(vp8_plane_add_noise_wmt)
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(rand) WRT_PLT
    and         rax, 0xff               ; random start offset 0..255

    ; Everything below is (re)loaded after the call on every row because
    ; rand() may clobber these registers.
    mov         rdi, arg(1)             ; noise
    add         rdi, rax                ; rdi = noise + offset
    mov         rsi, arg(0)             ; rsi = start of the current row
    mov         rdx, arg(2)             ; rdx = blackclamp (white at +16, both at +32)
    movsxd      rcx, dword arg(5)       ; rcx = Width
    xor         rax, rax                ; rax = byte offset within the row

.next_16:
    movdqu      xmm2, [rdi+rax]         ; 16 noise bytes
    movdqu      xmm1, [rsi+rax]         ; 16 source pixels

    ; squeeze the pixels into a range where adding the noise cannot wrap
    psubusb     xmm1, [rdx]             ; - blackclamp (saturating)
    paddusb     xmm1, [rdx+32]          ; + bothclamp  (saturating)
    psubusb     xmm1, [rdx+16]          ; - whiteclamp (saturating)

    paddb       xmm1, xmm2              ; add the noise
    movdqu      [rsi+rax], xmm1         ; write the pixels back

    add         rax, 16                 ; next 16 pixels of this row
    cmp         rax, rcx
    jl          .next_16

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .next_row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Both tables are 16 bytes long, so the single align above leaves each of
; them 16-byte aligned for use as an SSE2 memory operand.

; rd42: the value 4 in each of eight words; the rounding term added before
; the ">> 3" in vp8_post_proc_down_and_across_xmm.
rd42:
    times 8 dw 0x04
; four8s: the value 8 in each of four dwords; the rounding term added before
; the ">> 4" in vp8_mbpost_proc_across_ip_xmm.
four8s:
    times 4 dd 8
Bug 566247 - Add libvpx, VP8 decoding library for WebM. r=ted 2010-06-08 16:31:28 -07:00			`;`
			`; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.`
			`;`
			`; Use of this source code is governed by a BSD-style license`
			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
			`; in the file PATENTS. All contributing project authors may`
			`; be found in the AUTHORS file in the root of the source tree.`
			`;`


			`%include "vpx_ports/x86_abi_support.asm"`

			`;void vp8_post_proc_down_and_across_xmm`
			`;(`
			`; unsigned char *src_ptr,`
			`; unsigned char *dst_ptr,`
			`; int src_pixels_per_line,`
			`; int dst_pixels_per_line,`
			`; int rows,`
			`; int cols,`
			`; int flimit`
			`;)`
			`global sym(vp8_post_proc_down_and_across_xmm)`
			`sym(vp8_post_proc_down_and_across_xmm):`
			`push rbp`
			`mov rbp, rsp`
			`SHADOW_ARGS_TO_STACK 7`
			`GET_GOT rbx`
			`push rsi`
			`push rdi`
			`; end prolog`

			`%if ABI_IS_32BIT=1 && CONFIG_PIC=1`
			`ALIGN_STACK 16, rax`
			`; move the global rd onto the stack, since we don't have enough registers`
			`; to do PIC addressing`
			`movdqa xmm0, [rd42 GLOBAL]`
			`sub rsp, 16`
			`movdqa [rsp], xmm0`
			`%define RD42 [rsp]`
			`%else`
			`%define RD42 [rd42 GLOBAL]`
			`%endif`


			`movd xmm2, dword ptr arg(6) ;flimit`
			`punpcklwd xmm2, xmm2`
			`punpckldq xmm2, xmm2`
			`punpcklqdq xmm2, xmm2`

			`mov rsi, arg(0) ;src_ptr`
			`mov rdi, arg(1) ;dst_ptr`

			`movsxd rcx, DWORD PTR arg(4) ;rows`
			`movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?`
			`pxor xmm0, xmm0 ; mm0 = 00000000`

			`nextrow:`

			`xor rdx, rdx ; clear out rdx for use as loop counter`
			`nextcol:`
			`movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7`
			`punpcklbw xmm3, xmm0 ; mm3 = p0..p3`
			`movdqa xmm1, xmm3 ; mm1 = p0..p3`
			`psllw xmm3, 2 ;`

			`movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7`
			`punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3`
			`paddusw xmm3, xmm5 ; mm3 += mm6`

			`; thresholding`
			`movdqa xmm7, xmm1 ; mm7 = r0 p0..p3`
			`psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3`
			`psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3`
			`paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)`
			`pcmpgtw xmm7, xmm2`

			`movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7`
			`punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3`
			`paddusw xmm3, xmm5 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = r0 p0..p3`
			`psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3`
			`psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3`
			`paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`


			`neg rax`
			`movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7`
			`punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3`
			`paddusw xmm3, xmm5 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = r0 p0..p3`
			`psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3`
			`psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3`
			`paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`

			`movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7`
			`punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3`
			`paddusw xmm3, xmm4 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = r0 p0..p3`
			`psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3`
			`psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3`
			`paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`


			`paddusw xmm3, RD42 ; mm3 += round value`
			`psraw xmm3, 3 ; mm3 /= 8`

			`pand xmm1, xmm7 ; mm1 select vals > thresh from source`
			`pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result`
			`paddusw xmm1, xmm7 ; combination`

			`packuswb xmm1, xmm0 ; pack to bytes`
			`movq QWORD PTR [rdi], xmm1 ;`

			`neg rax ; pitch is positive`
			`add rsi, 8`
			`add rdi, 8`

			`add rdx, 8`
			`cmp edx, dword arg(5) ;cols`

			`jl nextcol`

			`; done with the all cols, start the across filtering in place`
			`sub rsi, rdx`
			`sub rdi, rdx`

			`xor rdx, rdx`
			`movq mm0, QWORD PTR [rdi-8];`

			`acrossnextcol:`
			`movq xmm7, QWORD PTR [rdi +rdx -2]`
			`movd xmm4, DWORD PTR [rdi +rdx +6]`

			`pslldq xmm4, 8`
			`por xmm4, xmm7`

			`movdqa xmm3, xmm4`
			`psrldq xmm3, 2`
			`punpcklbw xmm3, xmm0 ; mm3 = p0..p3`
			`movdqa xmm1, xmm3 ; mm1 = p0..p3`
			`psllw xmm3, 2`


			`movdqa xmm5, xmm4`
			`psrldq xmm5, 3`
			`punpcklbw xmm5, xmm0 ; mm5 = p1..p4`
			`paddusw xmm3, xmm5 ; mm3 += mm6`

			`; thresholding`
			`movdqa xmm7, xmm1 ; mm7 = p0..p3`
			`psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4`
			`psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3`
			`paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)`
			`pcmpgtw xmm7, xmm2`

			`movdqa xmm5, xmm4`
			`psrldq xmm5, 4`
			`punpcklbw xmm5, xmm0 ; mm5 = p2..p5`
			`paddusw xmm3, xmm5 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = p0..p3`
			`psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4`
			`psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3`
			`paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`


			`movdqa xmm5, xmm4 ; mm5 = p-2..p5`
			`punpcklbw xmm5, xmm0 ; mm5 = p-2..p1`
			`paddusw xmm3, xmm5 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = p0..p3`
			`psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4`
			`psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3`
			`paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`

			`psrldq xmm4, 1 ; mm4 = p-1..p5`
			`punpcklbw xmm4, xmm0 ; mm4 = p-1..p2`
			`paddusw xmm3, xmm4 ; mm3 += mm5`

			`; thresholding`
			`movdqa xmm6, xmm1 ; mm6 = p0..p3`
			`psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4`
			`psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3`
			`paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)`
			`pcmpgtw xmm6, xmm2`
			`por xmm7, xmm6 ; accumulate thresholds`

			`paddusw xmm3, RD42 ; mm3 += round value`
			`psraw xmm3, 3 ; mm3 /= 8`

			`pand xmm1, xmm7 ; mm1 select vals > thresh from source`
			`pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result`
			`paddusw xmm1, xmm7 ; combination`

			`packuswb xmm1, xmm0 ; pack to bytes`
			`movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes`
			`movdq2q mm0, xmm1`

			`add rdx, 8`
			`cmp edx, dword arg(5) ;cols`
			`jl acrossnextcol;`

			`; last 8 pixels`
			`movq QWORD PTR [rdi+rdx-8], mm0`

			`; done with this rwo`
			`add rsi,rax ; next line`
			`mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?`
			`add rdi,rax ; next destination`
			`mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?`

			`dec rcx ; decrement count`
			`jnz nextrow ; next row`

			`%if ABI_IS_32BIT=1 && CONFIG_PIC=1`
			`add rsp,16`
			`pop rsp`
			`%endif`
			`; begin epilog`
			`pop rdi`
			`pop rsi`
			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`
			`%undef RD42`


			`;void vp8_mbpost_proc_down_xmm(unsigned char *dst,`
			`; int pitch, int rows, int cols,int flimit)`
			extern sym(vp8_rv)
			global sym(vp8_mbpost_proc_down_xmm)
			sym(vp8_mbpost_proc_down_xmm):
			;-----------------------------------------------------------------------
			; Vertical ("down") post-processing filter, applied in place to one
			; 8-pixel-wide column strip at a time.
			;
			; Arguments (read through arg(n) after SHADOW_ARGS_TO_STACK):
			;   arg(0) = s       pointer to the first pixel of the current strip
			;   arg(1) = pitch   byte distance between rows
			;   arg(2) = rows    (incremented by 8 on entry so the delay line drains)
			;   arg(3) = cols    (consumed 8 at a time)
			;   arg(4) = flimit
			;
			; Per strip, in pseudo-C (all quantities are per column):
			;   sum, sumsq = totals over the 15 rows s[-8*pitch] .. s[6*pitch]
			;   for (r = 0; r < rows + 8; r++) {
			;       sum   += s[7*pitch]   - s[-8*pitch];
			;       sumsq += s[7*pitch]^2 - s[-8*pitch]^2;
			;       d[r & 15] = (15*sumsq - sum*sum < flimit)
			;                 ? (s[0] + sum + vp8_rv[(r & 127) + column]) >> 4
			;                 : s[0];
			;       if (r >= 8) s[-8*pitch] = d[(r - 8) & 15];
			;       s += pitch;
			;   }
			;
			; Register roles inside the row loop:
			;   rsi = s - 8*pitch (oldest row, also the write-back target)
			;   rdi = s + 7*pitch (newest row)       rax = pitch      rdx = r
			;   xmm0 = 0   xmm5 = sum (8 words)   xmm6/xmm7 = sumsq (2 x 4 dwords)
			;
			; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
			; callee-saved under the Microsoft x64 ABI - confirm whether that target
			; needs a save/restore here.
			;-----------------------------------------------------------------------
			    push        rbp
			    mov         rbp, rsp
			    SHADOW_ARGS_TO_STACK 5
			    GET_GOT     rbx
			    push        rsi
			    push        rdi
			    ; end prolog

			    ALIGN_STACK 16, rax
			    sub         rsp, 128+16

			    ; [rsp     .. rsp+127] : unsigned char d[16][8], delay line of output rows
			    ; [rsp+128 .. rsp+143] : flimit replicated into four dwords
			    mov         eax, dword ptr arg(4)           ; flimit
			    mov         [rsp+128], eax
			    mov         [rsp+128+4], eax
			    mov         [rsp+128+8], eax
			    mov         [rsp+128+12], eax
			%define flimit4 [rsp+128]

			%if ABI_IS_32BIT=0
			    lea         r8, [sym(vp8_rv) GLOBAL]        ; r8 = &vp8_rv for the whole call
			%endif

			    ; rows += 8: the extra iterations flush the last 8 rows out of d[]
			    add         dword arg(2), 8

			    ; for (c = 0; c < cols; c += 8)
			.loop_col:
			    mov         rsi, arg(0)                     ; rsi = s
			    pxor        xmm0, xmm0                      ; zero, used for unpacking

			    movsxd      rax, dword ptr arg(1)           ; rax = pitch
			    neg         rax                             ; rax = -pitch
			    lea         rsi, [rsi + rax*8]              ; rsi = s - 8*pitch
			    neg         rax                             ; rax = +pitch again

			    pxor        xmm5, xmm5                      ; sum   = 0
			    pxor        xmm6, xmm6                      ; sumsq = 0 (columns 0..3)
			    pxor        xmm7, xmm7                      ; sumsq = 0 (columns 4..7)
			    mov         rdi, rsi

			    mov         rcx, 15                         ; 15 rows: -8 .. +6

			.loop_initvar:
			    movq        xmm1, QWORD PTR [rdi]
			    punpcklbw   xmm1, xmm0                      ; 8 bytes -> 8 words

			    paddw       xmm5, xmm1                      ; sum += p
			    pmullw      xmm1, xmm1                      ; p*p (fits in 16 bits unsigned)

			    movdqa      xmm2, xmm1
			    punpcklwd   xmm1, xmm0                      ; zero-extend squares to dwords
			    punpckhwd   xmm2, xmm0

			    paddd       xmm6, xmm1                      ; sumsq += p*p
			    paddd       xmm7, xmm2
			    lea         rdi, [rdi+rax]                  ; next row

			    dec         rcx
			    jne         .loop_initvar
			    ; here rdi = s + 7*pitch, rsi = s - 8*pitch

			    xor         rdx, rdx                        ; r = 0
			.loop_row:
			    movq        xmm1, QWORD PTR [rsi]           ; oldest row  [s-pitch*8]
			    movq        xmm2, QWORD PTR [rdi]           ; newest row  [s+pitch*7]

			    punpcklbw   xmm1, xmm0
			    punpcklbw   xmm2, xmm0

			    paddw       xmm5, xmm2                      ; sum += newest - oldest
			    psubw       xmm5, xmm1

			    pmullw      xmm2, xmm2                      ; sumsq += newest^2
			    movdqa      xmm4, xmm2

			    punpcklwd   xmm2, xmm0
			    punpckhwd   xmm4, xmm0

			    paddd       xmm6, xmm2
			    paddd       xmm7, xmm4

			    pmullw      xmm1, xmm1                      ; sumsq -= oldest^2
			    movdqa      xmm2, xmm1

			    punpcklwd   xmm1, xmm0
			    psubd       xmm6, xmm1

			    punpckhwd   xmm2, xmm0
			    psubd       xmm7, xmm2

			    ; xmm3:xmm4 = 15*sumsq - sum*sum - flimit, one dword per column
			    movdqa      xmm3, xmm6
			    pslld       xmm3, 4

			    psubd       xmm3, xmm6                      ; 16*x - x = 15*sumsq (low 4)
			    movdqa      xmm1, xmm5

			    movdqa      xmm4, xmm5
			    pmullw      xmm1, xmm1                      ; low  16 bits of sum*sum

			    pmulhw      xmm4, xmm4                      ; high 16 bits of sum*sum
			    movdqa      xmm2, xmm1

			    punpcklwd   xmm1, xmm4                      ; interleave into 32-bit products
			    punpckhwd   xmm2, xmm4

			    movdqa      xmm4, xmm7
			    pslld       xmm4, 4

			    psubd       xmm4, xmm7                      ; 15*sumsq (high 4)

			    psubd       xmm3, xmm1
			    psubd       xmm4, xmm2

			    psubd       xmm3, flimit4
			    psubd       xmm4, flimit4

			    psrad       xmm3, 31                        ; all-ones where the result is negative
			    psrad       xmm4, 31

			    packssdw    xmm3, xmm4                      ; narrow the mask to one byte per column
			    packsswb    xmm3, xmm0

			    movq        xmm1, QWORD PTR [rsi+rax*8]     ; current row s[0]

			    movq        xmm2, xmm1                      ; keep the unfiltered bytes
			    punpcklbw   xmm1, xmm0

			    paddw       xmm1, xmm5                      ; s[0] + sum
			    mov         rcx, rdx

			    and         rcx, 127                        ; rounding table index = r & 127
			%if ABI_IS_32BIT=1 && CONFIG_PIC=1
			    push        rax
			    lea         rax, [sym(vp8_rv) GLOBAL]
			    movdqu      xmm4, [rax + rcx*2]             ; vp8_rv[rcx], 8 words
			    pop         rax
			%elif ABI_IS_32BIT=0
			    movdqu      xmm4, [r8 + rcx*2]              ; vp8_rv[rcx], 8 words
			%else
			    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
			%endif

			    paddw       xmm1, xmm4                      ; + per-row rounding values
			    ;paddw      xmm1, eight8s
			    psraw       xmm1, 4

			    packuswb    xmm1, xmm0
			    pand        xmm1, xmm3                      ; filtered value where mask is set

			    pandn       xmm3, xmm2                      ; original value elsewhere
			    por         xmm1, xmm3

			    and         rcx, 15
			    movq        QWORD PTR [rsp + rcx*8], xmm1   ; d[r & 15] = result row

			    ; Rows are written back 8 iterations late so the sliding window still
			    ; sees unfiltered pixels. For r < 8 the slot d[(r-8) & 15] has not
			    ; been written yet, so nothing is stored (previously this copied
			    ; uninitialised stack bytes into the 8 rows above s).
			    cmp         rdx, 8
			    jl          .skip_writeback

			    mov         rcx, rdx
			    sub         rcx, 8

			    and         rcx, 15
			    movq        xmm1, QWORD PTR [rsp + rcx*8]   ; d[(r - 8) & 15]

			    movq        QWORD PTR [rsi], xmm1           ; XMM copy: no MMX state touched
			.skip_writeback:
			    lea         rsi, [rsi+rax]

			    lea         rdi, [rdi+rax]
			    add         rdx, 1

			    cmp         edx, dword arg(2)               ; rows (already +8)
			    jl          .loop_row

			    ; s += 8, done at full pointer width so a carry out of the low
			    ; 32 bits is not lost on 64-bit targets
			    mov         rsi, arg(0)
			    add         rsi, 8
			    mov         arg(0), rsi

			    sub         dword arg(3), 8                 ; cols -= 8
			    cmp         dword arg(3), 0
			    jg          .loop_col

			    add         rsp, 128+16
			    pop         rsp                             ; undo ALIGN_STACK

			    ; begin epilog
			    pop         rdi
			    pop         rsi
			    RESTORE_GOT
			    UNSHADOW_ARGS
			    pop         rbp
			    ret
			%undef flimit4


			;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
			;                                   int pitch, int rows, int cols, int flimit)
			;
			; Horizontal, in-place counterpart of the "down" filter. For every row:
			;   sum, sumsq = totals over s[-8] .. s[6]
			;   for (c = 0; c < cols + 8; c += 4)        // four columns per pass
			;       for each of the 4 columns x = c .. c+3:
			;           sum   += s[x+7]   - s[x-8];
			;           sumsq += s[x+7]^2 - s[x-8]^2;
			;           out[x] = (15*sumsq - sum*sum < flimit) ? (s[x] + sum + 8) >> 4
			;                                                  : s[x];
			;       results are stored 8 pixels late (two-stage delay in mm0/mm1)
			;
			; The row is read from s[-8] onward and, through the [rsi+rcx+7] load,
			; past s[cols-1]; the caller has to provide that border. The first two
			; passes store the zero-initialised mm0/mm1 to s[-8..-1].
			;
			; NOTE(review): mm0/mm1 are used and no emms is executed before returning;
			; presumably the caller clears the MMX state - confirm.
			global sym(vp8_mbpost_proc_across_ip_xmm)
			sym(vp8_mbpost_proc_across_ip_xmm):
			    push        rbp
			    mov         rbp, rsp
			    SHADOW_ARGS_TO_STACK 5
			    GET_GOT     rbx
			    push        rsi
			    push        rdi
			    ; end prolog

			    ALIGN_STACK 16, rax
			    sub         rsp, 16

			    ; 16-byte aligned scratch at [rsp]: flimit replicated four times
			    mov         eax, dword ptr arg(4)           ; flimit
			    mov         [rsp+12], eax
			    mov         [rsp+8], eax
			    mov         [rsp+4], eax
			    mov         [rsp], eax
			%define flimit4 [rsp]


			    ; for (r = 0; r < rows; r++)
			.row_loop:

			    xor         rcx, rcx                        ; sum   = 0
			    xor         rdx, rdx                        ; sumsq = 0
			    mov         rdi, -8                         ; i = -8
			    mov         rsi, arg(0)                     ; s
			.var_loop:
			    ; for (i = -8; i <= 6; i++) { sumsq += s[i]*s[i]; sum += s[i]; }
			    movzx       eax, byte [rsi+rdi]
			    add         ecx, eax
			    mul         al                              ; ax = s[i]*s[i]; upper eax stays 0
			    add         edx, eax
			    add         rdi, 1
			    cmp         rdi, 6
			    jle         .var_loop


			    ; seed the vector accumulators (lane 0 carries the running value)
			    movd        xmm6, ecx                       ; sum
			    movd        xmm7, edx                       ; sumsq

			    pxor        xmm0, xmm0                      ; zero for unpacking
			    pxor        mm0, mm0                        ; empty two-stage output delay
			    pxor        mm1, mm1

			    mov         rsi, arg(0)                     ; s
			    xor         rcx, rcx                        ; c = 0
			    movsxd      rdx, dword arg(3)               ; cols
			    add         rdx, 8                          ; run 8 extra to drain the delay

			.next_col4:

			    movd        xmm2, DWORD PTR [rsi+rcx+7]     ; entering pixels: +7 +8 +9 +10
			    movd        xmm1, DWORD PTR [rsi+rcx-8]     ; leaving pixels:  -8 -7 -6 -5

			    punpcklbw   xmm2, xmm0                      ; bytes -> words
			    punpcklbw   xmm1, xmm0

			    punpcklwd   xmm2, xmm0                      ; words -> dwords
			    punpcklwd   xmm1, xmm0

			    psubd       xmm2, xmm1                      ; d  = entering - leaving
			    paddd       xmm1, xmm1                      ; 2 * leaving

			    paddd       xmm1, xmm2                      ; entering + leaving
			    pmaddwd     xmm1, xmm2                      ; dq = entering^2 - leaving^2

			    paddd       xmm7, xmm1                      ; lane 0: sumsq + dq0
			    paddd       xmm6, xmm2                      ; lane 0: sum   + d0

			    pshufd      xmm7, xmm7, 0                   ; broadcast lane 0 to all lanes
			    pshufd      xmm6, xmm6, 0

			    psrldq      xmm2, 4                         ; d1  d2  d3  0
			    psrldq      xmm1, 4                         ; dq1 dq2 dq3 0

			    ; Turn the broadcast into a prefix sum: lane k receives the deltas of
			    ; columns 1..k. Each pshufd places one delta in the lanes that need it
			    ; and the zero lane everywhere else.
			    pshufd      xmm4, xmm2, 3                   ; 0   d1  d1  d1
			    paddd       xmm6, xmm4
			    pshufd      xmm4, xmm2, 01011111b           ; 0   0   d2  d2
			    paddd       xmm6, xmm4
			    pshufd      xmm4, xmm2, 10111111b           ; 0   0   0   d3
			    paddd       xmm6, xmm4

			    pshufd      xmm3, xmm1, 3                   ; 0   dq1 dq1 dq1
			    paddd       xmm7, xmm3
			    pshufd      xmm3, xmm1, 01011111b           ; 0   0   dq2 dq2
			    paddd       xmm7, xmm3
			    pshufd      xmm3, xmm1, 10111111b           ; 0   0   0   dq3
			    paddd       xmm7, xmm3

			    ; xmm5 = 15*sumsq - sum*sum - flimit, reduced to a byte mask
			    movdqa      xmm5, xmm7
			    pslld       xmm5, 4
			    psubd       xmm5, xmm7                      ; 16*x - x = 15*sumsq

			    movdqa      xmm3, xmm6
			    pmaddwd     xmm3, xmm3                      ; sum*sum

			    psubd       xmm5, xmm3
			    psubd       xmm5, flimit4
			    psrad       xmm5, 31                        ; all-ones where negative

			    packssdw    xmm5, xmm0
			    packsswb    xmm5, xmm0

			    movd        xmm1, DWORD PTR [rsi+rcx]       ; current four pixels
			    movq        xmm2, xmm1                      ; unfiltered copy

			    punpcklbw   xmm1, xmm0
			    punpcklwd   xmm1, xmm0

			    paddd       xmm1, xmm6                      ; s[x] + sum
			    paddd       xmm1, [four8s GLOBAL]           ; + 8 (rounding)

			    psrad       xmm1, 4
			    packssdw    xmm1, xmm0

			    packuswb    xmm1, xmm0
			    pand        xmm1, xmm5                      ; filtered where mask is set

			    pandn       xmm5, xmm2                      ; original elsewhere
			    por         xmm5, xmm1

			    ; two-stage delay: what was computed two passes ago lands at s[c-8]
			    movd        [rsi+rcx-8], mm0
			    movq        mm0, mm1

			    movdq2q     mm1, xmm5
			    psrldq      xmm6, 12                        ; carry last lane into lane 0

			    psrldq      xmm7, 12
			    add         rcx, 4

			    cmp         rcx, rdx
			    jl          .next_col4

			    ; s += pitch
			    movsxd      rax, dword arg(1)
			    add         arg(0), rax

			    sub         dword arg(2), 1                 ; rows -= 1
			    cmp         dword arg(2), 0
			    jg          .row_loop

			    add         rsp, 16
			    pop         rsp                             ; undo ALIGN_STACK

			    ; begin epilog
			    pop         rdi
			    pop         rsi
			    RESTORE_GOT
			    UNSHADOW_ARGS
			    pop         rbp
			    ret
			%undef flimit4


			;void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned char *noise,
			;                             unsigned char blackclamp[16],
			;                             unsigned char whiteclamp[16],
			;                             unsigned char bothclamp[16],
			;                             unsigned int Width, unsigned int Height, int Pitch)
			;
			; For each of Height rows: pick a random offset (rand() & 0xff) into the
			; noise buffer, then process the row 16 bytes at a time while the byte
			; offset is below Width: clamp the pixels with the three clamp vectors
			; and add the noise bytes (wrapping byte add).
			;
			; Only arg(2) is loaded for the clamps; whiteclamp and bothclamp are
			; addressed as [blackclamp+16] and [blackclamp+32], so the three vectors
			; must be contiguous in that order. They are used as direct SSE memory
			; operands and therefore must be 16-byte aligned.
			extern sym(rand)
			global sym(vp8_plane_add_noise_wmt)
			sym(vp8_plane_add_noise_wmt):
			    push        rbp
			    mov         rbp, rsp
			    SHADOW_ARGS_TO_STACK 8
			    GET_GOT     rbx
			    push        rsi
			    push        rdi
			    ; end prolog

			.row_loop:
			    call        sym(rand) WRT_PLT
			    and         rax, 0xff                       ; random start offset, 0..255

			    ; Everything below is (re)loaded after the call because rand() may
			    ; have overwritten the registers.
			    mov         rdi, arg(1)                     ; noise
			    add         rdi, rax                        ; rdi = noise + offset

			    mov         rdx, arg(2)                     ; blackclamp (white/both follow it)
			    mov         rsi, arg(0)                     ; Pos: start of the current row
			    movsxd      rcx, dword arg(5)               ; Width
			    xor         rax, rax                        ; byte offset within the row

			.next_set:
			    movdqu      xmm1, [rsi+rax]                 ; 16 source pixels

			    ; clamp both ends so that adding the noise cannot leave the range
			    psubusb     xmm1, [rdx]                     ; blackclamp
			    paddusb     xmm1, [rdx+32]                  ; bothclamp
			    psubusb     xmm1, [rdx+16]                  ; whiteclamp

			    movdqu      xmm2, [rdi+rax]                 ; noise for these 16 pixels
			    paddb       xmm1, xmm2
			    movdqu      [rsi+rax], xmm1                 ; write back in place

			    add         rax, 16                         ; next 16 pixels

			    cmp         rax, rcx
			    jl          .next_set

			    movsxd      rax, dword arg(7)               ; Pitch
			    add         arg(0), rax                     ; Start += Pitch
			    sub         dword arg(6), 1                 ; Height -= 1
			    jg          .row_loop

			    ; begin epilog
			    pop         rdi
			    pop         rsi
			    RESTORE_GOT
			    UNSHADOW_ARGS
			    pop         rbp
			    ret


			SECTION_RODATA
			align 16
			; Both tables are 16 bytes long and start on a 16-byte boundary, so they
			; can be used with aligned loads or as direct SSE memory operands.
			rd42:
			    times 8 dw 4                                ; eight words, each 4
			four8s:
			    times 4 dd 8                                ; four dwords, each 8: rounding
			                                                ; term added before the >> 4 in
			                                                ; vp8_mbpost_proc_across_ip_xmm