mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Backed out changeset 041ed2e08168 (bug 926838)
This commit is contained in:
parent
8061d7003f
commit
09d2b1fa6a
@ -170,9 +170,6 @@
|
||||
.global \name
|
||||
.func \name
|
||||
.section .text.\name,"ax",%progbits
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.object_arch armv4
|
||||
.align 2
|
||||
\name :
|
||||
.fnstart
|
||||
|
@ -93,13 +93,13 @@
|
||||
radix2lsGrpLoop\name :
|
||||
@ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
|
||||
@ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
|
||||
VLD2 {dWr,dWi},[pTwiddle, :64]!
|
||||
VLD2 {dWr,dWi},[pTwiddle :64]!
|
||||
|
||||
@ dXr0 = [pSrc[0].Re, pSrc[2].Re]
|
||||
@ dXi0 = [pSrc[0].Im, pSrc[2].Im]
|
||||
@ dXr1 = [pSrc[1].Re, pSrc[3].Re]
|
||||
@ dXi1 = [pSrc[1].Im, pSrc[3].Im]
|
||||
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
|
||||
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
|
||||
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
@ -118,27 +118,27 @@
|
||||
|
||||
|
||||
@// Update pSubFFTSize and pSubFFTNum regs
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
@// subFFTSize = 1 for the first stage; the register is updated to 4 here
|
||||
MOV subFFTSize,#4
|
||||
|
||||
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
|
||||
LSR grpSize,subFFTNum,#2
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
MOV subFFTNum,grpSize
|
||||
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV setStep,pointStep,LSL #1
|
||||
MOV setStep,grpSize,LSL #4
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
@// setStep = 3*pointStep
|
||||
ADD setStep,setStep,pointStep
|
||||
@// setStep = - 3*pointStep+16
|
||||
RSB setStep,setStep,#16
|
||||
|
||||
@// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep
|
||||
@// step1 = 2*pointStep
|
||||
MOV step1,pointStep,LSL #1
|
||||
|
||||
@ -163,9 +163,9 @@ radix4fsGrpZeroSetLoop\name :
|
||||
|
||||
VSUB qY2,qX0,qX2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
|
||||
VADD qY1,qX1,qX3
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
|
||||
VSUB qY3,qX1,qX3
|
||||
|
||||
|
||||
@ -173,56 +173,56 @@ radix4fsGrpZeroSetLoop\name :
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
|
||||
@// data[3] & update pSrc for the next set, but not if it's the
|
||||
@// last iteration so that we don't read past the end of the
|
||||
@// input array.
|
||||
BEQ radix4SkipLastUpdateInv\name
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep
|
||||
radix4SkipLastUpdateInv\name:
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr2,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],setStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
|
||||
@// data[3] & update pSrc for the next set, but not if it's the
|
||||
@// last iteration so that we don't read past the end of the
|
||||
@// input array.
|
||||
BEQ radix4SkipLastUpdateFwd\name
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep
|
||||
radix4SkipLastUpdateFwd\name:
|
||||
VADD dZr2,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],setStep
|
||||
|
||||
.endif
|
||||
|
||||
|
@ -139,32 +139,32 @@
|
||||
|
||||
@// Update grpCount and grpSize right away
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
|
||||
VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
|
||||
MOV step16,#16
|
||||
LSL grpCount,subFFTSize,#2
|
||||
|
||||
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
|
||||
MOV subFFTNum,#1 @//after the last stage
|
||||
|
||||
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
MOV stepTwiddle,#0
|
||||
|
||||
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
|
||||
VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
|
||||
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
|
||||
|
||||
@// update subFFTSize for the next stage
|
||||
MOV subFFTSize,grpCount
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
|
||||
MOV dstStep,outPointStep,LSL #1
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
|
||||
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
|
||||
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
|
||||
MOV step24,#24
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
|
||||
|
||||
|
||||
@// Process two groups at a time
|
||||
@ -204,13 +204,13 @@ radix4lsGrpLoop\name :
|
||||
|
||||
.endif
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
|
||||
VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL dZr2,dW2r,dXr2
|
||||
VMLA dZr2,dW2i,dXi2 @// real part
|
||||
VMUL dZi2,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
VMLS dZi2,dW2i,dXr2 @// imag part
|
||||
|
||||
.else
|
||||
@ -218,13 +218,13 @@ radix4lsGrpLoop\name :
|
||||
VMUL dZr2,dW2r,dXr2
|
||||
VMLS dZr2,dW2i,dXi2 @// real part
|
||||
VMUL dZi2,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
VMLA dZi2,dW2i,dXr2 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
|
||||
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
|
||||
VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
|
||||
|
||||
@// move qX0 so as to load for the next iteration
|
||||
VMOV qZ0,qX0
|
||||
@ -233,7 +233,7 @@ radix4lsGrpLoop\name :
|
||||
VMUL dZr3,dW3r,dXr3
|
||||
VMLA dZr3,dW3i,dXi3 @// real part
|
||||
VMUL dZi3,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VLD1 dW3r,[pTwiddle :64],step24
|
||||
VMLS dZi3,dW3i,dXr3 @// imag part
|
||||
|
||||
.else
|
||||
@ -241,22 +241,22 @@ radix4lsGrpLoop\name :
|
||||
VMUL dZr3,dW3r,dXr3
|
||||
VMLS dZr3,dW3i,dXi3 @// real part
|
||||
VMUL dZi3,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VLD1 dW3r,[pTwiddle :64],step24
|
||||
VMLA dZi3,dW3i,dXr3 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
|
||||
|
||||
@// Don't do the load on the last iteration so we don't read past the end
|
||||
@// of pSrc.
|
||||
addeq pSrc, pSrc, #64
|
||||
beq radix4lsSkipRead\name
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
|
||||
radix4lsSkipRead\name:
|
||||
|
||||
@// finish first stage of 4 point FFT
|
||||
@ -274,18 +274,18 @@ radix4lsSkipRead\name:
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
@// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
@ -293,18 +293,18 @@ radix4lsSkipRead\name:
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
@// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
|
@ -250,7 +250,7 @@ radix4SkipRead\name:
|
||||
VSUB qY2,qX0,qZ2
|
||||
|
||||
@// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128]!
|
||||
VLD2 {dXr0,dXi0},[pSrc :128]!
|
||||
VADD qY1,qZ1,qZ3
|
||||
VSUB qY3,qZ1,qZ3
|
||||
|
||||
@ -262,33 +262,33 @@ radix4SkipRead\name:
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
@ -298,13 +298,13 @@ radix4SkipRead\name:
|
||||
BGT radix4SetLoop\name
|
||||
|
||||
|
||||
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
@// subtract 4 since grpCount is multiplied by 4
|
||||
SUBS grpCount,grpCount,#4
|
||||
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
@// increment pSrc for the next grp
|
||||
ADD pSrc,pSrc,srcStep
|
||||
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
|
||||
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
|
||||
BGT radix4GrpLoop\name
|
||||
|
||||
|
||||
|
@ -173,6 +173,10 @@
|
||||
#define dT0 D14.F32
|
||||
#define dT1 D15.F32
|
||||
|
||||
@// Define constants
|
||||
@ sqrt(1/2)
|
||||
ONEBYSQRT2: .float 0.7071067811865476e0
|
||||
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
||||
@ -181,7 +185,7 @@
|
||||
@// Update pSubFFTSize and pSubFFTNum regs
|
||||
@// subFFTSize = 1 for the first stage; the register is updated to 8 here
|
||||
MOV subFFTSize,#8
|
||||
ADR t0,ONEBYSQRT2\name
|
||||
LDR t0,=ONEBYSQRT2
|
||||
|
||||
@// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
|
||||
LSR grpSize,subFFTNum,#3
|
||||
@ -197,23 +201,23 @@
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
MOV step1,grpSize,LSL #4
|
||||
|
||||
MOV step2,pointStep,LSL #3
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
SUB step2,step2,pointStep @// step2 = 7*pointStep
|
||||
@// setStep = - 7*pointStep+16
|
||||
RSB setStep,step2,#16
|
||||
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
@// data[7] & update pSrc for the next set
|
||||
@// setStep = -7*pointStep + 16
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep
|
||||
@// grp = 0 a special case since all the twiddle factors are 1
|
||||
@// Loop on the sets
|
||||
|
||||
@ -241,7 +245,7 @@ radix8fsGrpZeroSetLoop\name :
|
||||
|
||||
VADD qY0,qV0,qV4
|
||||
VSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@ -249,15 +253,15 @@ radix8fsGrpZeroSetLoop\name :
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
VADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -265,15 +269,15 @@ radix8fsGrpZeroSetLoop\name :
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -287,17 +291,17 @@ radix8fsGrpZeroSetLoop\name :
|
||||
|
||||
VSUB dVr1,dUr1,dUi5
|
||||
@// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep
|
||||
VADD dVi1,dUi1,dUr5
|
||||
VADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
VSUB dVi3,dUi1,dUr5
|
||||
|
||||
VSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VADD dVi5,dUi3,dUr7
|
||||
VADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VSUB dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
@ -307,14 +311,14 @@ radix8fsGrpZeroSetLoop\name :
|
||||
@// calculate a*v5
|
||||
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VMUL dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VMUL dT1,dVr7,dT0[0]
|
||||
@ -331,33 +335,33 @@ radix8fsGrpZeroSetLoop\name :
|
||||
@// On the last iteration, this will read past the end of pSrc,
|
||||
@// so skip this read.
|
||||
BEQ radix8SkipLastUpdateInv\name
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
radix8SkipLastUpdateInv\name:
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
|
||||
VADD dYr7,dVr3,dVr7
|
||||
VADD dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst, :128] @// store y7
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst :128] @// store y7
|
||||
ADD pDst, pDst, #16
|
||||
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VMUL dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VMUL dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
@ -373,20 +377,20 @@ radix8SkipLastUpdateInv\name:
|
||||
@// On the last iteration, this will read past the end of pSrc,
|
||||
@// so skip this read.
|
||||
BEQ radix8SkipLastUpdateFwd\name
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
radix8SkipLastUpdateFwd\name:
|
||||
|
||||
VSUB qY5,qV1,qV5
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VADD qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
|
||||
|
||||
.ENDIF
|
||||
|
||||
@ -411,12 +415,12 @@ radix8SkipLastUpdateFwd\name:
|
||||
M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
|
||||
FFTSTAGE "FALSE","FALSE",FWD
|
||||
M_END
|
||||
ONEBYSQRT2FWD: .float 0.7071067811865476e0
|
||||
|
||||
|
||||
M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
|
||||
FFTSTAGE "FALSE","TRUE",INV
|
||||
M_END
|
||||
ONEBYSQRT2INV: .float 0.7071067811865476e0
|
||||
|
||||
|
||||
|
||||
.end
|
||||
|
@ -121,7 +121,7 @@
|
||||
@// Update pSubFFTSize and pSubFFTNum regs
|
||||
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
|
||||
LSR grpSize,subFFTNum,#2
|
||||
MOV subFFTNum,grpSize
|
||||
@ -130,19 +130,19 @@
|
||||
@// pT0+1 increments pT0 by 4 bytes
|
||||
@// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
|
||||
@// Note: outPointStep = pointStep for firststage
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV setStep,pointStep,LSL #1
|
||||
MOV setStep,grpSize,LSL #3
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
MOV step1,setStep
|
||||
ADD setStep,setStep,pointStep @// setStep = 3*pointStep
|
||||
RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
|
||||
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3]
|
||||
MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage
|
||||
|
||||
|
||||
@ -166,36 +166,36 @@ grpZeroSetLoop\name:
|
||||
VHSUB qY2,qX0,qX2 @// u1
|
||||
SUBS setCount,setCount,#4 @// decrement the set loop counter
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
|
||||
VHADD qY1,qX1,qX3 @// u2
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],step3
|
||||
VHSUB qY3,qX1,qX3 @// u3
|
||||
|
||||
|
||||
|
||||
@// finish second stage of 4 point FFT
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VHADD qZ0,qY0,qY1 @// y0
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VHSUB dZr3,dYr2,dYi3 @// y3
|
||||
VHADD dZi3,dYi2,dYr3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
|
||||
VHSUB qZ1,qY0,qY1 @// y2
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VHADD dZr2,dYr2,dYi3 @// y1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VHSUB dZi2,dYi2,dYr3
|
||||
|
||||
VHADD qY0,qX0,qX2 @// u0 (next loop)
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],setStep
|
||||
|
||||
|
||||
.ELSE
|
||||
@ -203,15 +203,15 @@ grpZeroSetLoop\name:
|
||||
VHADD dZr2,dYr2,dYi3 @// y1
|
||||
VHSUB dZi2,dYi2,dYr3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB qZ1,qY0,qY1 @// y2
|
||||
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VHSUB dZr3,dYr2,dYi3 @// y3
|
||||
VHADD dZi3,dYi2,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VHADD qY0,qX0,qX2 @// u0 (next loop)
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],setStep
|
||||
|
||||
.ENDIF
|
||||
|
||||
@ -223,36 +223,36 @@ grpZeroSetLoop\name:
|
||||
VSUB qY2,qX0,qX2 @// u1
|
||||
SUBS setCount,setCount,#4 @// decrement the set loop counter
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
|
||||
VADD qY1,qX1,qX3 @// u2
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],step3
|
||||
VSUB qY3,qX1,qX3 @// u3
|
||||
|
||||
|
||||
|
||||
@// finish second stage of 4 point FFT
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1 @// y0
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VSUB dZr3,dYr2,dYi3 @// y3
|
||||
VADD dZi3,dYi2,dYr3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
|
||||
VSUB qZ1,qY0,qY1 @// y2
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr2,dYr2,dYi3 @// y1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 (next loop)
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],setStep
|
||||
|
||||
|
||||
.ELSE
|
||||
@ -260,15 +260,15 @@ grpZeroSetLoop\name:
|
||||
VADD dZr2,dYr2,dYi3 @// y1
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB qZ1,qY0,qY1 @// y2
|
||||
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VSUB dZr3,dYr2,dYi3 @// y3
|
||||
VADD dZi3,dYi2,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VADD qY0,qX0,qX2 @// u0 (next loop)
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],setStep
|
||||
|
||||
.ENDIF
|
||||
|
||||
|
@ -163,7 +163,7 @@
|
||||
@// Define stack arguments
|
||||
|
||||
MOV pw2,pTwiddle
|
||||
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
|
||||
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
|
||||
|
||||
MOV pw3,pTwiddle
|
||||
MOV pw1,pTwiddle
|
||||
@ -171,26 +171,26 @@
|
||||
@// pOut0+outPointStep == increment of 4*outPointStep bytes
|
||||
MOV outPointStep,subFFTSize,LSL #2
|
||||
|
||||
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
|
||||
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
|
||||
MOV subFFTNum,#1 @//after the last stage
|
||||
LSL grpCount,subFFTSize,#2
|
||||
|
||||
|
||||
@// Update grpCount and grpSize right away
|
||||
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
|
||||
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
|
||||
|
||||
@// update subFFTSize for the next stage
|
||||
MOV subFFTSize,grpCount
|
||||
MOV dstStep,outPointStep,LSL #1
|
||||
|
||||
VLD2 {dW1r,dW1i}, [pw1, :128]!
|
||||
VLD2 {dW1r,dW1i}, [pw1 :128]!
|
||||
|
||||
|
||||
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
|
||||
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
|
||||
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
@// Process 4 groups at a time
|
||||
|
||||
@ -225,7 +225,7 @@ grpLoop\name:
|
||||
@// Load the first twiddle for 4 groups : w^1
|
||||
@// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
|
||||
|
||||
VLD2 {dW1r,dW1i}, [pw1, :128]!
|
||||
VLD2 {dW1r,dW1i}, [pw1 :128]!
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMULL qT2,dXr2,dW2r
|
||||
@ -262,7 +262,7 @@ grpLoop\name:
|
||||
|
||||
@// Load the second twiddle for 4 groups : w^2
|
||||
@// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
|
||||
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
|
||||
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
|
||||
|
||||
|
||||
VRSHRN dZr2,qT2,#15
|
||||
@ -271,12 +271,12 @@ grpLoop\name:
|
||||
@// Load the third twiddle for 4 groups : w^3
|
||||
@// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
|
||||
|
||||
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
|
||||
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
|
||||
|
||||
VRSHRN dZr3,qT0,#15
|
||||
VRSHRN dZi3,qT1,#15
|
||||
|
||||
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
|
||||
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
|
||||
|
||||
.ifeqs "\scaled", "TRUE"
|
||||
|
||||
@ -285,7 +285,7 @@ grpLoop\name:
|
||||
VHADD qY0,qX0,qZ2
|
||||
VHSUB qY2,qX0,qZ2
|
||||
VHADD qY1,qZ1,qZ3
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
VHSUB qY3,qZ1,qZ3
|
||||
|
||||
@ -293,20 +293,20 @@ grpLoop\name:
|
||||
|
||||
VHSUB qZ0,qY2,qY1
|
||||
VHADD qZ2,qY2,qY1
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
|
||||
VHSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -314,11 +314,11 @@ grpLoop\name:
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
|
||||
VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
.ENDIF
|
||||
|
||||
@ -329,7 +329,7 @@ grpLoop\name:
|
||||
VADD qY0,qX0,qZ2
|
||||
VSUB qY2,qX0,qZ2
|
||||
VADD qY1,qZ1,qZ3
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
VSUB qY3,qZ1,qZ3
|
||||
|
||||
@ -337,20 +337,20 @@ grpLoop\name:
|
||||
|
||||
VSUB qZ0,qY2,qY1
|
||||
VADD qZ2,qY2,qY1
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
|
||||
VADD dZi1,dYi0,dYr3
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -358,11 +358,11 @@ grpLoop\name:
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
.ENDIF
|
||||
|
||||
|
@ -150,12 +150,12 @@
|
||||
|
||||
LSL pointStep,subFFTNum,#2 @// 2*grpSize
|
||||
|
||||
VLD1 dW1,[pTwiddle, :64] @//[wi | wr]
|
||||
VLD1 dW1,[pTwiddle :64] @//[wi | wr]
|
||||
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
|
||||
VLD1 dW2,[pTwiddle, :64] @//[wi | wr]
|
||||
VLD1 dW2,[pTwiddle :64] @//[wi | wr]
|
||||
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
|
||||
SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
|
||||
VLD1 dW3,[pTwiddle, :64]
|
||||
VLD1 dW3,[pTwiddle :64]
|
||||
@//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
|
||||
RSB setStep,setStep,#0 @// setStep = - 3*pointStep
|
||||
|
||||
@ -167,13 +167,13 @@
|
||||
|
||||
grpLoop\name:
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
ADD stepTwiddle,stepTwiddle,pointStep
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
MOV twStep,stepTwiddle,LSL #2
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & reset pSrc
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc
|
||||
|
||||
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
|
||||
|
||||
@ -202,7 +202,7 @@ setLoop\name:
|
||||
|
||||
.ENDIF
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMULL qT2,dXr2,dW2[0]
|
||||
@ -222,7 +222,7 @@ setLoop\name:
|
||||
VRSHRN dZi1,qT1,#15
|
||||
|
||||
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMULL qT0,dXr3,dW3[0]
|
||||
@ -244,7 +244,7 @@ setLoop\name:
|
||||
|
||||
VRSHRN dZr3,qT0,#15
|
||||
VRSHRN dZi3,qT1,#15
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
|
||||
|
||||
.ifeqs "\scaled", "TRUE"
|
||||
@ -253,7 +253,7 @@ setLoop\name:
|
||||
VHADD qY0,qX0,qZ2
|
||||
VHSUB qY2,qX0,qZ2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128]! @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]
|
||||
VHADD qY1,qZ1,qZ3
|
||||
VHSUB qY3,qZ1,qZ3
|
||||
|
||||
@ -265,16 +265,16 @@ setLoop\name:
|
||||
VHSUB qZ0,qY2,qY1
|
||||
|
||||
VHADD dZr2,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi2,dYi0,dYr3
|
||||
|
||||
VHADD qZ1,qY2,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
|
||||
VHSUB dZr3,dYr0,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VHADD dZi3,dYi0,dYr3
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.ELSE
|
||||
@ -282,16 +282,16 @@ setLoop\name:
|
||||
VHSUB qZ0,qY2,qY1
|
||||
|
||||
VHSUB dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHADD dZi3,dYi0,dYr3
|
||||
|
||||
VHADD qZ1,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VHADD dZr2,dYr0,dYi3
|
||||
VHSUB dZi2,dYi0,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],dstStep
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -316,16 +316,16 @@ setLoop\name:
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VADD dZr2,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi0,dYr3
|
||||
|
||||
VADD qZ1,qY2,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr3,dYr0,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi0,dYr3
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.ELSE
|
||||
@ -333,16 +333,16 @@ setLoop\name:
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VSUB dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ1,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr2,dYr0,dYi3
|
||||
VSUB dZi2,dYi0,dYr3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],dstStep
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -354,11 +354,11 @@ setLoop\name:
|
||||
ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
|
||||
BGT setLoop\name
|
||||
|
||||
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
|
||||
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
|
||||
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
|
||||
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
|
||||
|
||||
|
||||
|
||||
|
@ -218,22 +218,22 @@
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
MOV step1,grpSize,LSL #3
|
||||
|
||||
MOV step2,pointStep,LSL #3
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
SUB step2,step2,pointStep @// step2 = 7*pointStep
|
||||
RSB setStep,step2,#16 @// setStep = - 7*pointStep+16
|
||||
|
||||
|
||||
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7] & update pSrc for the next set
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
|
||||
@// setStep = -7*pointStep + 16
|
||||
@// grp = 0 a special case since all the twiddle factors are 1
|
||||
@// Loop on the sets : 4 sets at a time
|
||||
@ -263,7 +263,7 @@ grpZeroSetLoop\name:
|
||||
|
||||
VHADD qY0,qV0,qV4
|
||||
VHSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@ -271,15 +271,15 @@ grpZeroSetLoop\name:
|
||||
VHADD dYi2,dVi2,dVr6
|
||||
|
||||
VHADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
|
||||
VHSUB dYi6,dVi2,dVr6
|
||||
|
||||
VHSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
|
||||
VHSUB qU3,qX1,qX5
|
||||
VHSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -287,15 +287,15 @@ grpZeroSetLoop\name:
|
||||
VHSUB dYi6,dVi2,dVr6
|
||||
|
||||
VHSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
|
||||
VHADD dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VHSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
VHSUB qU3,qX1,qX5
|
||||
VHSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -308,17 +308,17 @@ grpZeroSetLoop\name:
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VHSUB dVr1,dUr1,dUi5
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
|
||||
VHADD dVi1,dUi1,dUr5
|
||||
VHADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
VHSUB dVi3,dUi1,dUr5
|
||||
|
||||
VHSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VHADD dVi5,dUi3,dUr7
|
||||
VHADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VHSUB dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
@ -327,14 +327,14 @@ grpZeroSetLoop\name:
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
@ -348,35 +348,35 @@ grpZeroSetLoop\name:
|
||||
VSUB dVi7,dVi7,dT1
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
|
||||
VHSUB dYr3,dVr3,dVr7
|
||||
VHSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
|
||||
VHADD dYr7,dVr3,dVr7
|
||||
VHADD dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
|
||||
#if 0
|
||||
VST2 {dYr7,dYi7},[pDst, :128],#16 @// store y7
|
||||
VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
|
||||
#else
|
||||
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
|
||||
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
|
||||
#endif
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
@ -388,22 +388,22 @@ grpZeroSetLoop\name:
|
||||
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
VHSUB qY5,qV1,qV5
|
||||
|
||||
VHSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
|
||||
VHSUB dYi3,dVi3,dVi7
|
||||
VHADD qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
|
||||
#if 0
|
||||
VST2 {dYr1,dYi1},[pDst, :128],#16 @// store y7
|
||||
VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
|
||||
#else
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
|
||||
#endif
|
||||
|
||||
.ENDIF
|
||||
@ -429,7 +429,7 @@ grpZeroSetLoop\name:
|
||||
|
||||
VADD qY0,qV0,qV4
|
||||
VSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@ -437,15 +437,15 @@ grpZeroSetLoop\name:
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
VADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -453,15 +453,15 @@ grpZeroSetLoop\name:
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -474,17 +474,17 @@ grpZeroSetLoop\name:
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VSUB dVr1,dUr1,dUi5
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
|
||||
VADD dVi1,dUi1,dUr5
|
||||
VADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
VSUB dVi3,dUi1,dUr5
|
||||
|
||||
VSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VADD dVi5,dUi3,dUr7
|
||||
VADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VSUB dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
@ -493,14 +493,14 @@ grpZeroSetLoop\name:
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
@ -514,35 +514,35 @@ grpZeroSetLoop\name:
|
||||
VSUB dVi7,dVi7,dT1
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
|
||||
VADD dYr7,dVr3,dVr7
|
||||
VADD dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
|
||||
#if 0
|
||||
VST2 {dYr7,dYi7},[pDst, :128],#16 @// store y7
|
||||
VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
|
||||
#else
|
||||
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
|
||||
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
|
||||
#endif
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
@ -554,22 +554,22 @@ grpZeroSetLoop\name:
|
||||
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
VSUB qY5,qV1,qV5
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VADD qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
|
||||
#if 0
|
||||
VST2 {dYr1,dYi1},[pDst, :128],#16 @// store y7
|
||||
VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
|
||||
#else
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
|
||||
#endif
|
||||
|
||||
.ENDIF
|
||||
|
@ -100,9 +100,9 @@
|
||||
@// Loop on 2 grps at a time for the last stage
|
||||
|
||||
grpLoop\name :
|
||||
VLD2 {dWr,dWi},[pTwiddle, :64]!
|
||||
VLD2 {dWr,dWi},[pTwiddle :64]!
|
||||
|
||||
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
|
||||
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
|
||||
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
@ -126,23 +126,23 @@
|
||||
|
||||
|
||||
@// Update pSubFFTSize and pSubFFTNum regs
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage
|
||||
|
||||
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
|
||||
LSR grpSize,subFFTNum,#2
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
MOV subFFTNum,grpSize
|
||||
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV setStep,pointStep,LSL #1
|
||||
MOV setStep,grpSize,LSL #4
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
ADD setStep,setStep,pointStep @// setStep = 3*pointStep
|
||||
RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
|
||||
|
||||
.ifeqs "\scaled", "TRUE"
|
||||
@ -169,9 +169,9 @@ grpZeroSetLoop\name :
|
||||
|
||||
VHSUB qY2,qX0,qX2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
|
||||
VHADD qY1,qX1,qX3
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
|
||||
VHSUB qY3,qX1,qX3
|
||||
|
||||
|
||||
@ -179,46 +179,46 @@ grpZeroSetLoop\name :
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VHADD qZ0,qY0,qY1
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
VHSUB dZr3,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHADD dZi3,dYi2,dYr3
|
||||
|
||||
VHSUB qZ1,qY0,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VHADD dZr2,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VHSUB dZi2,dYi2,dYr3
|
||||
|
||||
VHADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],setStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VHADD qZ0,qY0,qY1
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
VHADD dZr2,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi2,dYi2,dYr3
|
||||
|
||||
VHSUB qZ1,qY0,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
|
||||
VHSUB dZr3,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VHADD dZi3,dYi2,dYr3
|
||||
|
||||
VHADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],setStep
|
||||
|
||||
.endif
|
||||
|
||||
@ -231,9 +231,9 @@ grpZeroSetLoop\name :
|
||||
|
||||
VSUB qY2,qX0,qX2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
|
||||
VADD qY1,qX1,qX3
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
|
||||
VSUB qY3,qX1,qX3
|
||||
|
||||
|
||||
@ -241,46 +241,46 @@ grpZeroSetLoop\name :
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr2,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],setStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
|
||||
VADD dZr2,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],setStep
|
||||
|
||||
.endif
|
||||
|
||||
|
@ -147,30 +147,30 @@
|
||||
|
||||
@// Update grpCount and grpSize right away
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
|
||||
VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
|
||||
MOV step16,#16
|
||||
LSL grpCount,subFFTSize,#2
|
||||
|
||||
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
|
||||
MOV subFFTNum,#1 @//after the last stage
|
||||
|
||||
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
MOV stepTwiddle,#0
|
||||
|
||||
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
|
||||
VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
|
||||
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
|
||||
|
||||
@// update subFFTSize for the next stage
|
||||
MOV subFFTSize,grpCount
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
|
||||
MOV dstStep,outPointStep,LSL #1
|
||||
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
|
||||
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
|
||||
MOV step24,#24
|
||||
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
|
||||
@// Process two groups at a time
|
||||
@ -209,13 +209,13 @@ grpLoop\name :
|
||||
|
||||
.endif
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
|
||||
VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMULL qT2,dW2r,dXr2
|
||||
VMLAL qT2,dW2i,dXi2 @// real part
|
||||
VMULL qT3,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
VMLSL qT3,dW2i,dXr2 @// imag part
|
||||
|
||||
.else
|
||||
@ -223,25 +223,25 @@ grpLoop\name :
|
||||
VMULL qT2,dW2r,dXr2
|
||||
VMLSL qT2,dW2i,dXi2 @// real part
|
||||
VMULL qT3,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
|
||||
VMLAL qT3,dW2i,dXr2 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
|
||||
VRSHRN dZr1,qT0,#31
|
||||
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
|
||||
VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
|
||||
VRSHRN dZi1,qT1,#31
|
||||
|
||||
VMOV qZ0,qX0 @// move qX0 so as to load for the next iteration
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMULL qT4,dW3r,dXr3
|
||||
VMLAL qT4,dW3i,dXi3 @// real part
|
||||
VMULL qT5,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VLD1 dW3r,[pTwiddle :64],step24
|
||||
VMLSL qT5,dW3i,dXr3 @// imag part
|
||||
|
||||
.else
|
||||
@ -249,18 +249,18 @@ grpLoop\name :
|
||||
VMULL qT4,dW3r,dXr3
|
||||
VMLSL qT4,dW3i,dXi3 @// real part
|
||||
VMULL qT5,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VLD1 dW3r,[pTwiddle :64],step24
|
||||
VMLAL qT5,dW3i,dXr3 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VRSHRN dZr2,qT2,#31
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
|
||||
VRSHRN dZi2,qT3,#31
|
||||
|
||||
VRSHRN dZr3,qT4,#31
|
||||
VRSHRN dZi3,qT5,#31
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
|
||||
|
||||
|
||||
.ifeqs "\scaled", "TRUE"
|
||||
@ -280,17 +280,17 @@ grpLoop\name :
|
||||
VHSUB qZ0,qY2,qY1
|
||||
|
||||
VHADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
|
||||
VHADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VHSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
|
||||
.else
|
||||
@ -298,17 +298,17 @@ grpLoop\name :
|
||||
VHSUB qZ0,qY2,qY1
|
||||
|
||||
VHSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
|
||||
VHADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VHADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
|
||||
.endif
|
||||
@ -332,17 +332,17 @@ grpLoop\name :
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
|
||||
.else
|
||||
@ -350,17 +350,17 @@ grpLoop\name :
|
||||
VSUB qZ0,qY2,qY1
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
|
||||
|
||||
|
||||
.endif
|
||||
|
@ -268,33 +268,33 @@ setLoop\name :
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VHADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
|
||||
VHADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VHSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VHSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VHADD dZi1,dYi0,dYr3
|
||||
|
||||
VHADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VHADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VHSUB dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
@ -306,7 +306,7 @@ setLoop\name :
|
||||
VADD qY0,qX0,qZ2
|
||||
VSUB qY2,qX0,qZ2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128]! @// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] for next iteration
|
||||
VADD qY1,qZ1,qZ3
|
||||
VSUB qY3,qZ1,qZ3
|
||||
|
||||
@ -318,33 +318,33 @@ setLoop\name :
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VST2 {dZr0,dZi0},[pDst :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VST2 {dZr1,dZi1},[pDst :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VST2 {dZr2,dZi2},[pDst :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2 {dZr3,dZi3},[pDst :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
@ -355,11 +355,11 @@ setLoop\name :
|
||||
BGT setLoop\name
|
||||
|
||||
|
||||
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
|
||||
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
|
||||
ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
|
||||
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
|
||||
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
|
||||
BGT grpLoop\name
|
||||
|
||||
|
||||
|
@ -213,20 +213,20 @@
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
|
||||
MOV step1,grpSize,LSL #4
|
||||
|
||||
MOV step2,pointStep,LSL #3
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
SUB step2,step2,pointStep @// step2 = 7*pointStep
|
||||
RSB setStep,step2,#16 @// setStep = - 7*pointStep+16
|
||||
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7] & update pSrc for the next set
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
|
||||
@// setStep = -7*pointStep + 16
|
||||
@// grp = 0 a special case since all the twiddle factors are 1
|
||||
@// Loop on the sets
|
||||
@ -256,7 +256,7 @@ grpZeroSetLoop\name :
|
||||
|
||||
VHADD qY0,qV0,qV4
|
||||
VHSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@ -264,15 +264,15 @@ grpZeroSetLoop\name :
|
||||
VHADD dYi2,dVi2,dVr6
|
||||
|
||||
VHADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
|
||||
VHSUB dYi6,dVi2,dVr6
|
||||
|
||||
VHSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
|
||||
VHSUB qU3,qX1,qX5
|
||||
VHSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -280,15 +280,15 @@ grpZeroSetLoop\name :
|
||||
VHSUB dYi6,dVi2,dVr6
|
||||
|
||||
VHSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
|
||||
VHADD dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VHSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
VHSUB qU3,qX1,qX5
|
||||
VHSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -301,17 +301,17 @@ grpZeroSetLoop\name :
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VHSUB dVr1,dUr1,dUi5
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
|
||||
VHADD dVi1,dUi1,dUr5
|
||||
VHADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
VHSUB dVi3,dUi1,dUr5
|
||||
|
||||
VHSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VHADD dVi5,dUi3,dUr7
|
||||
VHADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VHSUB dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
@ -320,14 +320,14 @@ grpZeroSetLoop\name :
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
@ -341,32 +341,32 @@ grpZeroSetLoop\name :
|
||||
VSUB dVi7,dVi7,dT1
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
|
||||
VHSUB dYr3,dVr3,dVr7
|
||||
VHSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
|
||||
VHADD dYr7,dVr3,dVr7
|
||||
VHADD dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
|
||||
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
@ -378,19 +378,19 @@ grpZeroSetLoop\name :
|
||||
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
VHSUB qY5,qV1,qV5
|
||||
|
||||
VHSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
|
||||
VHSUB dYi3,dVi3,dVi7
|
||||
VHADD qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
|
||||
|
||||
.ENDIF
|
||||
|
||||
@ -415,7 +415,7 @@ grpZeroSetLoop\name :
|
||||
|
||||
VADD qY0,qV0,qV4
|
||||
VSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@ -423,15 +423,15 @@ grpZeroSetLoop\name :
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
VADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
@ -439,15 +439,15 @@ grpZeroSetLoop\name :
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
|
||||
VSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
|
||||
VADD dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
@ -460,17 +460,17 @@ grpZeroSetLoop\name :
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VSUB dVr1,dUr1,dUi5
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
|
||||
VADD dVi1,dUi1,dUr5
|
||||
VADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
|
||||
VSUB dVi3,dUi1,dUr5
|
||||
|
||||
VSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
|
||||
VADD dVi5,dUi3,dUr7
|
||||
VADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
|
||||
VSUB dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
@ -479,14 +479,14 @@ grpZeroSetLoop\name :
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
@ -500,32 +500,32 @@ grpZeroSetLoop\name :
|
||||
VSUB dVi7,dVi7,dT1
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
|
||||
VADD dYr7,dVr3,dVr7
|
||||
VADD dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
|
||||
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VQRDMULH dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
|
||||
VQRDMULH dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
@ -537,19 +537,19 @@ grpZeroSetLoop\name :
|
||||
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
|
||||
|
||||
VSUB qY5,qV1,qV5
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VADD qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
|
||||
|
||||
.ENDIF
|
||||
|
||||
|
@ -128,6 +128,8 @@
|
||||
|
||||
#define half d0.f32
|
||||
|
||||
HALF: .float 0.5
|
||||
|
||||
@// Allocate stack memory required by the function
|
||||
|
||||
@// Write function header
|
||||
@ -298,7 +300,7 @@ finalComplexToRealFixup:
|
||||
@// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
|
||||
|
||||
|
||||
ADR t0, HALF
|
||||
LDR t0, =HALF
|
||||
VLD1 half[0], [t0]
|
||||
|
||||
evenOddButterflyLoop:
|
||||
@ -400,5 +402,5 @@ End:
|
||||
|
||||
@// Write function tail
|
||||
M_END
|
||||
HALF: .float 0.5
|
||||
|
||||
.end
|
||||
|
@ -198,10 +198,10 @@ FFTEnd: @// Does only the scaling
|
||||
|
||||
@// N = subFFTSize ; dataptr = pDst ; scale = diff
|
||||
scaleFFTData:
|
||||
VLD1 {qX0},[pSrc, :128] @// pSrc contains pDst pointer
|
||||
VLD1 {qX0},[pSrc :128] @// pSrc contains pDst pointer
|
||||
SUBS subFFTSize,subFFTSize,#2
|
||||
VMUL qX0, qX0, dScale[0]
|
||||
VST1 {qX0},[pSrc, :128]!
|
||||
VST1 {qX0},[pSrc :128]!
|
||||
|
||||
BGT scaleFFTData
|
||||
End:
|
||||
|
Loading…
Reference in New Issue
Block a user