Backed out changeset 041ed2e08168 (bug 926838)

This commit is contained in:
Carsten "Tomcat" Book 2015-05-05 15:51:33 +02:00
parent 8061d7003f
commit 09d2b1fa6a
17 changed files with 406 additions and 403 deletions

View File

@ -170,9 +170,6 @@
.global \name
.func \name
.section .text.\name,"ax",%progbits
.arch armv7-a
.fpu neon
.object_arch armv4
.align 2
\name :
.fnstart

View File

@ -93,13 +93,13 @@
radix2lsGrpLoop\name :
@ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
@ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
VLD2 {dWr,dWi},[pTwiddle, :64]!
VLD2 {dWr,dWi},[pTwiddle :64]!
@ dXr0 = [pSrc[0].Re, pSrc[2].Re]
@ dXi0 = [pSrc[0].Im, pSrc[2].Im]
@ dXr1 = [pSrc[1].Re, pSrc[3].Re]
@ dXi1 = [pSrc[1].Im, pSrc[3].Im]
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
.ifeqs "\inverse", "TRUE"

View File

@ -118,27 +118,27 @@
@// Update pSubFFTSize and pSubFFTNum regs
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
@// subFFTSize = 1 for the first stage
MOV subFFTSize,#4
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#2
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
MOV subFFTNum,grpSize
@// Calculate the step of input data for the next set
@//MOV setStep,pointStep,LSL #1
MOV setStep,grpSize,LSL #4
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
@// setStep = 3*pointStep
ADD setStep,setStep,pointStep
@// setStep = - 3*pointStep+16
RSB setStep,setStep,#16
@// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2 {dXr3,dXi3},[pSrc :128],setStep
@// step1 = 2*pointStep
MOV step1,pointStep,LSL #1
@ -163,9 +163,9 @@ radix4fsGrpZeroSetLoop\name :
VSUB qY2,qX0,qX2
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
VADD qY1,qX1,qX3
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
VSUB qY3,qX1,qX3
@ -173,56 +173,56 @@ radix4fsGrpZeroSetLoop\name :
.ifeqs "\inverse", "TRUE"
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1
@// data[3] & update pSrc for the next set, but not if it's the
@// last iteration so that we don't read past the end of the
@// input array.
BEQ radix4SkipLastUpdateInv\name
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2 {dXr3,dXi3},[pSrc :128],setStep
radix4SkipLastUpdateInv\name:
VSUB dZr3,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi3,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VADD dZr2,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr2,dZi2},[pDst, :128],setStep
VST2 {dZr2,dZi2},[pDst :128],setStep
.else
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1
@// data[3] & update pSrc for the next set, but not if it's the
@// last iteration so that we don't read past the end of the
@// input array.
BEQ radix4SkipLastUpdateFwd\name
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2 {dXr3,dXi3},[pSrc :128],setStep
radix4SkipLastUpdateFwd\name:
VADD dZr2,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZr3,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZi3,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr3,dZi3},[pDst, :128],setStep
VST2 {dZr3,dZi3},[pDst :128],setStep
.endif

View File

@ -139,32 +139,32 @@
@// Update grpCount and grpSize right away
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
MOV step16,#16
LSL grpCount,subFFTSize,#2
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
MOV subFFTNum,#1 @//after the last stage
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
MOV stepTwiddle,#0
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
@// update subFFTSize for the next stage
MOV subFFTSize,grpCount
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
MOV dstStep,outPointStep,LSL #1
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
MOV step24,#24
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
@// Process two groups at a time
@ -204,13 +204,13 @@ radix4lsGrpLoop\name :
.endif
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
.ifeqs "\inverse", "TRUE"
VMUL dZr2,dW2r,dXr2
VMLA dZr2,dW2i,dXi2 @// real part
VMUL dZi2,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
VMLS dZi2,dW2i,dXr2 @// imag part
.else
@ -218,13 +218,13 @@ radix4lsGrpLoop\name :
VMUL dZr2,dW2r,dXr2
VMLS dZr2,dW2i,dXi2 @// real part
VMUL dZi2,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
VMLA dZi2,dW2i,dXr2 @// imag part
.endif
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
@// move qX0 so as to load for the next iteration
VMOV qZ0,qX0
@ -233,7 +233,7 @@ radix4lsGrpLoop\name :
VMUL dZr3,dW3r,dXr3
VMLA dZr3,dW3i,dXi3 @// real part
VMUL dZi3,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VLD1 dW3r,[pTwiddle :64],step24
VMLS dZi3,dW3i,dXr3 @// imag part
.else
@ -241,22 +241,22 @@ radix4lsGrpLoop\name :
VMUL dZr3,dW3r,dXr3
VMLS dZr3,dW3i,dXi3 @// real part
VMUL dZi3,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VLD1 dW3r,[pTwiddle :64],step24
VMLA dZi3,dW3i,dXr3 @// imag part
.endif
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
@// Don't do the load on the last iteration so we don't read past the end
@// of pSrc.
addeq pSrc, pSrc, #64
beq radix4lsSkipRead\name
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
radix4lsSkipRead\name:
@// finish first stage of 4 point FFT
@ -274,18 +274,18 @@ radix4lsSkipRead\name:
VSUB qZ0,qY2,qY1
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
@// dstStep = -3*outPointStep + 16
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],dstStep
.else
@ -293,18 +293,18 @@ radix4lsSkipRead\name:
VSUB qZ0,qY2,qY1
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
@// dstStep = -3*outPointStep + 16
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.endif

View File

@ -250,7 +250,7 @@ radix4SkipRead\name:
VSUB qY2,qX0,qZ2
@// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc, :128]!
VLD2 {dXr0,dXi0},[pSrc :128]!
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
@ -262,33 +262,33 @@ radix4SkipRead\name:
.ifeqs "\inverse", "TRUE"
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],dstStep
.else
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.endif
@ -298,13 +298,13 @@ radix4SkipRead\name:
BGT radix4SetLoop\name
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
@// subtract 4 since grpCount multiplied by 4
SUBS grpCount,grpCount,#4
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
@// increment pSrc for the next grp
ADD pSrc,pSrc,srcStep
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
BGT radix4GrpLoop\name

View File

@ -173,6 +173,10 @@
#define dT0 D14.F32
#define dT1 D15.F32
@// Define constants
@ sqrt(1/2)
ONEBYSQRT2: .float 0.7071067811865476e0
.MACRO FFTSTAGE scaled, inverse, name
@ -181,7 +185,7 @@
@// Update pSubFFTSize and pSubFFTNum regs
@// subFFTSize = 1 for the first stage
MOV subFFTSize,#8
ADR t0,ONEBYSQRT2\name
LDR t0,=ONEBYSQRT2
@// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#3
@ -197,23 +201,23 @@
@// Calculate the step of input data for the next set
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
MOV step1,grpSize,LSL #4
MOV step2,pointStep,LSL #3
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
SUB step2,step2,pointStep @// step2 = 7*pointStep
@// setStep = - 7*pointStep+16
RSB setStep,step2,#16
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// data[7] & update pSrc for the next set
@// setStep = -7*pointStep + 16
VLD2 {dXr7,dXi7},[pSrc, :128],setStep
VLD2 {dXr7,dXi7},[pSrc :128],setStep
@// grp = 0 a special case since all the twiddle factors are 1
@// Loop on the sets
@ -241,7 +245,7 @@ radix8fsGrpZeroSetLoop\name :
VADD qY0,qV0,qV4
VSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
@ -249,15 +253,15 @@ radix8fsGrpZeroSetLoop\name :
VADD dYi2,dVi2,dVr6
VADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
VSUB dYi6,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
.ELSE
@ -265,15 +269,15 @@ radix8fsGrpZeroSetLoop\name :
VSUB dYi6,dVi2,dVr6
VSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
VADD dYi2,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
.ENDIF
@ -287,17 +291,17 @@ radix8fsGrpZeroSetLoop\name :
VSUB dVr1,dUr1,dUi5
@// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep
VLD2 {dXr0,dXi0},[pSrc :128],pointStep
VADD dVi1,dUi1,dUr5
VADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VSUB dVi3,dUi1,dUr5
VSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VADD dVi5,dUi3,dUr7
VADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VSUB dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
@ -307,14 +311,14 @@ radix8fsGrpZeroSetLoop\name :
@// calculate a*v5
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VMUL dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate b*v7
VMUL dT1,dVr7,dT0[0]
@ -331,33 +335,33 @@ radix8fsGrpZeroSetLoop\name :
@// On the last iteration, this will read past the end of pSrc,
@// so skip this read.
BEQ radix8SkipLastUpdateInv\name
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
radix8SkipLastUpdateInv\name:
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst, :128] @// store y7
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst :128] @// store y7
ADD pDst, pDst, #16
.ELSE
@// calculate b*v7
VMUL dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VMUL dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate a*v5
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
@ -373,20 +377,20 @@ radix8SkipLastUpdateInv\name:
@// On the last iteration, this will read past the end of pSrc,
@// so skip this read.
BEQ radix8SkipLastUpdateFwd\name
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
radix8SkipLastUpdateFwd\name:
VSUB qY5,qV1,qV5
VSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VSUB dYi3,dVi3,dVi7
VADD qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
.ENDIF
@ -411,12 +415,12 @@ radix8SkipLastUpdateFwd\name:
M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
FFTSTAGE "FALSE","FALSE",FWD
M_END
ONEBYSQRT2FWD: .float 0.7071067811865476e0
M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
FFTSTAGE "FALSE","TRUE",INV
M_END
ONEBYSQRT2INV: .float 0.7071067811865476e0
.end

View File

@ -121,7 +121,7 @@
@// Update pSubFFTSize and pSubFFTNum regs
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#2
MOV subFFTNum,grpSize
@ -130,19 +130,19 @@
@// pT0+1 increments pT0 by 4 bytes
@// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
@// Note: outPointStep = pointStep for firststage
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
@// Calculate the step of input data for the next set
@//MOV setStep,pointStep,LSL #1
MOV setStep,grpSize,LSL #3
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
MOV step1,setStep
ADD setStep,setStep,pointStep @// setStep = 3*pointStep
RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3]
MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage
@ -166,36 +166,36 @@ grpZeroSetLoop\name:
VHSUB qY2,qX0,qX2 @// u1
SUBS setCount,setCount,#4 @// decrement the set loop counter
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
VHADD qY1,qX1,qX3 @// u2
VLD2 {dXr2,dXi2},[pSrc, :128],step3
VLD2 {dXr2,dXi2},[pSrc :128],step3
VHSUB qY3,qX1,qX3 @// u3
@// finish second stage of 4 point FFT
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VHADD qZ0,qY0,qY1 @// y0
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2 {dXr3,dXi3},[pSrc :128],setStep
.ifeqs "\inverse", "TRUE"
VHSUB dZr3,dYr2,dYi3 @// y3
VHADD dZi3,dYi2,dYr3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB qZ1,qY0,qY1 @// y2
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VHADD dZr2,dYr2,dYi3 @// y1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHSUB dZi2,dYi2,dYr3
VHADD qY0,qX0,qX2 @// u0 (next loop)
VST2 {dZr2,dZi2},[pDst, :128],setStep
VST2 {dZr2,dZi2},[pDst :128],setStep
.ELSE
@ -203,15 +203,15 @@ grpZeroSetLoop\name:
VHADD dZr2,dYr2,dYi3 @// y1
VHSUB dZi2,dYi2,dYr3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB qZ1,qY0,qY1 @// y2
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHSUB dZr3,dYr2,dYi3 @// y3
VHADD dZi3,dYi2,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD qY0,qX0,qX2 @// u0 (next loop)
VST2 {dZr3,dZi3},[pDst, :128],setStep
VST2 {dZr3,dZi3},[pDst :128],setStep
.ENDIF
@ -223,36 +223,36 @@ grpZeroSetLoop\name:
VSUB qY2,qX0,qX2 @// u1
SUBS setCount,setCount,#4 @// decrement the set loop counter
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
VADD qY1,qX1,qX3 @// u2
VLD2 {dXr2,dXi2},[pSrc, :128],step3
VLD2 {dXr2,dXi2},[pSrc :128],step3
VSUB qY3,qX1,qX3 @// u3
@// finish second stage of 4 point FFT
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1 @// y0
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2 {dXr3,dXi3},[pSrc :128],setStep
.ifeqs "\inverse", "TRUE"
VSUB dZr3,dYr2,dYi3 @// y3
VADD dZi3,dYi2,dYr3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB qZ1,qY0,qY1 @// y2
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VADD dZr2,dYr2,dYi3 @// y1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 (next loop)
VST2 {dZr2,dZi2},[pDst, :128],setStep
VST2 {dZr2,dZi2},[pDst :128],setStep
.ELSE
@ -260,15 +260,15 @@ grpZeroSetLoop\name:
VADD dZr2,dYr2,dYi3 @// y1
VSUB dZi2,dYi2,dYr3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB qZ1,qY0,qY1 @// y2
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZr3,dYr2,dYi3 @// y3
VADD dZi3,dYi2,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD qY0,qX0,qX2 @// u0 (next loop)
VST2 {dZr3,dZi3},[pDst, :128],setStep
VST2 {dZr3,dZi3},[pDst :128],setStep
.ENDIF

View File

@ -163,7 +163,7 @@
@// Define stack arguments
MOV pw2,pTwiddle
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
MOV pw3,pTwiddle
MOV pw1,pTwiddle
@ -171,26 +171,26 @@
@// pOut0+outPointStep == increment of 4*outPointStep bytes
MOV outPointStep,subFFTSize,LSL #2
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
MOV subFFTNum,#1 @//after the last stage
LSL grpCount,subFFTSize,#2
@// Update grpCount and grpSize right away
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
@// update subFFTSize for the next stage
MOV subFFTSize,grpCount
MOV dstStep,outPointStep,LSL #1
VLD2 {dW1r,dW1i}, [pw1, :128]!
VLD2 {dW1r,dW1i}, [pw1 :128]!
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
@// Process 4 groups at a time
@ -225,7 +225,7 @@ grpLoop\name:
@// Load the first twiddle for 4 groups : w^1
@// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
VLD2 {dW1r,dW1i}, [pw1, :128]!
VLD2 {dW1r,dW1i}, [pw1 :128]!
.ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2r
@ -262,7 +262,7 @@ grpLoop\name:
@// Load the second twiddle for 4 groups : w^2
@// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2, :256]!
VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
VRSHRN dZr2,qT2,#15
@ -271,12 +271,12 @@ grpLoop\name:
@// Load the third twiddle for 4 groups : w^3
@// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3, :64]!
VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3, :64]!
VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
.ifeqs "\scaled", "TRUE"
@ -285,7 +285,7 @@ grpLoop\name:
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
VHADD qY1,qZ1,qZ3
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
VHSUB qY3,qZ1,qZ3
@ -293,20 +293,20 @@ grpLoop\name:
VHSUB qZ0,qY2,qY1
VHADD qZ2,qY2,qY1
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\inverse", "TRUE"
VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VHSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
VHADD dZi1,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
.ELSE
@ -314,11 +314,11 @@ grpLoop\name:
VHADD dZi1,dYi0,dYr3
VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
.ENDIF
@ -329,7 +329,7 @@ grpLoop\name:
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
VADD qY1,qZ1,qZ3
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
VSUB qY3,qZ1,qZ3
@ -337,20 +337,20 @@ grpLoop\name:
VSUB qZ0,qY2,qY1
VADD qZ2,qY2,qY1
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\inverse", "TRUE"
VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
VADD dZi1,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
.ELSE
@ -358,11 +358,11 @@ grpLoop\name:
VADD dZi1,dYi0,dYr3
VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -3*outPointStep + 16
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
.ENDIF

View File

@ -150,12 +150,12 @@
LSL pointStep,subFFTNum,#2 @// 2*grpSize
VLD1 dW1,[pTwiddle, :64] @//[wi | wr]
VLD1 dW1,[pTwiddle :64] @//[wi | wr]
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
VLD1 dW2,[pTwiddle, :64] @//[wi | wr]
VLD1 dW2,[pTwiddle :64] @//[wi | wr]
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
VLD1 dW3,[pTwiddle, :64]
VLD1 dW3,[pTwiddle :64]
@//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
RSB setStep,setStep,#0 @// setStep = - 3*pointStep
@ -167,13 +167,13 @@
grpLoop\name:
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
ADD stepTwiddle,stepTwiddle,pointStep
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
MOV twStep,stepTwiddle,LSL #2
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & reset pSrc
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
@ -202,7 +202,7 @@ setLoop\name:
.ENDIF
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
.ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2[0]
@ -222,7 +222,7 @@ setLoop\name:
VRSHRN dZi1,qT1,#15
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
.ifeqs "\inverse", "TRUE"
VMULL qT0,dXr3,dW3[0]
@ -244,7 +244,7 @@ setLoop\name:
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
.ifeqs "\scaled", "TRUE"
@ -253,7 +253,7 @@ setLoop\name:
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
VLD2 {dXr0,dXi0},[pSrc, :128]! @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]
VHADD qY1,qZ1,qZ3
VHSUB qY3,qZ1,qZ3
@ -265,16 +265,16 @@ setLoop\name:
VHSUB qZ0,qY2,qY1
VHADD dZr2,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi2,dYi0,dYr3
VHADD qZ1,qY2,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHSUB dZr3,dYr0,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.ELSE
@ -282,16 +282,16 @@ setLoop\name:
VHSUB qZ0,qY2,qY1
VHSUB dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHADD dZi3,dYi0,dYr3
VHADD qZ1,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VHADD dZr2,dYr0,dYi3
VHSUB dZi2,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],dstStep
.ENDIF
@ -316,16 +316,16 @@ setLoop\name:
VSUB qZ0,qY2,qY1
VADD dZr2,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi2,dYi0,dYr3
VADD qZ1,qY2,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZr3,dYr0,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.ELSE
@ -333,16 +333,16 @@ setLoop\name:
VSUB qZ0,qY2,qY1
VSUB dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi3,dYi0,dYr3
VADD qZ1,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VADD dZr2,dYr0,dYi3
VSUB dZi2,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],dstStep
.ENDIF
@ -354,11 +354,11 @@ setLoop\name:
ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
BGT setLoop\name
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]

View File

@ -218,22 +218,22 @@
@// Calculate the step of input data for the next set
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
MOV step1,grpSize,LSL #3
MOV step2,pointStep,LSL #3
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
SUB step2,step2,pointStep @// step2 = 7*pointStep
RSB setStep,step2,#16 @// setStep = - 7*pointStep+16
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7] & update pSrc for the next set
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
@// setStep = -7*pointStep + 16
@// grp = 0 a special case since all the twiddle factors are 1
@// Loop on the sets : 4 sets at a time
@ -263,7 +263,7 @@ grpZeroSetLoop\name:
VHADD qY0,qV0,qV4
VHSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
@ -271,15 +271,15 @@ grpZeroSetLoop\name:
VHADD dYi2,dVi2,dVr6
VHADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
VHSUB dYi6,dVi2,dVr6
VHSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VHSUB qU3,qX1,qX5
VHSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
.ELSE
@ -287,15 +287,15 @@ grpZeroSetLoop\name:
VHSUB dYi6,dVi2,dVr6
VHSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
VHADD dYi2,dVi2,dVr6
VHSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VHSUB qU3,qX1,qX5
VHSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
.ENDIF
@ -308,17 +308,17 @@ grpZeroSetLoop\name:
@// finish second stage of 8 point FFT
VHSUB dVr1,dUr1,dUi5
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VHADD dVi1,dUi1,dUr5
VHADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VHSUB dVi3,dUi1,dUr5
VHSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VHADD dVi5,dUi3,dUr7
VHADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VHSUB dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
@ -327,14 +327,14 @@ grpZeroSetLoop\name:
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
@ -348,35 +348,35 @@ grpZeroSetLoop\name:
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VHSUB dYr3,dVr3,dVr7
VHSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VHADD dYr7,dVr3,dVr7
VHADD dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
#if 0
VST2 {dYr7,dYi7},[pDst, :128],#16 @// store y7
VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
#else
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
#endif
.ELSE
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
@ -388,22 +388,22 @@ grpZeroSetLoop\name:
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VHSUB qY5,qV1,qV5
VHSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VHSUB dYi3,dVi3,dVi7
VHADD qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
#if 0
VST2 {dYr1,dYi1},[pDst, :128],#16 @// store y7
VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
#else
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
#endif
.ENDIF
@ -429,7 +429,7 @@ grpZeroSetLoop\name:
VADD qY0,qV0,qV4
VSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
@ -437,15 +437,15 @@ grpZeroSetLoop\name:
VADD dYi2,dVi2,dVr6
VADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
VSUB dYi6,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
.ELSE
@ -453,15 +453,15 @@ grpZeroSetLoop\name:
VSUB dYi6,dVi2,dVr6
VSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
VADD dYi2,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
.ENDIF
@ -474,17 +474,17 @@ grpZeroSetLoop\name:
@// finish second stage of 8 point FFT
VSUB dVr1,dUr1,dUi5
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VADD dVi1,dUi1,dUr5
VADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VSUB dVi3,dUi1,dUr5
VSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VADD dVi5,dUi3,dUr7
VADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VSUB dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
@ -493,14 +493,14 @@ grpZeroSetLoop\name:
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
@ -514,35 +514,35 @@ grpZeroSetLoop\name:
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
#if 0
VST2 {dYr7,dYi7},[pDst, :128],#16 @// store y7
VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
#else
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
#endif
.ELSE
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
@ -554,22 +554,22 @@ grpZeroSetLoop\name:
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VSUB qY5,qV1,qV5
VSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VSUB dYi3,dVi3,dVi7
VADD qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
#if 0
VST2 {dYr1,dYi1},[pDst, :128],#16 @// store y7
VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
#else
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
#endif
.ENDIF

View File

@ -100,9 +100,9 @@
@// Loop on 2 grps at a time for the last stage
grpLoop\name :
VLD2 {dWr,dWi},[pTwiddle, :64]!
VLD2 {dWr,dWi},[pTwiddle :64]!
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
.ifeqs "\inverse", "TRUE"

View File

@ -126,23 +126,23 @@
@// Update pSubFFTSize and pSubFFTNum regs
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#2
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
MOV subFFTNum,grpSize
@// Calculate the step of input data for the next set
@//MOV setStep,pointStep,LSL #1
MOV setStep,grpSize,LSL #4
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
ADD setStep,setStep,pointStep @// setStep = 3*pointStep
RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
.ifeqs "\scaled", "TRUE"
@ -169,9 +169,9 @@ grpZeroSetLoop\name :
VHSUB qY2,qX0,qX2
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
VHADD qY1,qX1,qX3
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
VHSUB qY3,qX1,qX3
@ -179,46 +179,46 @@ grpZeroSetLoop\name :
.ifeqs "\inverse", "TRUE"
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VHADD qZ0,qY0,qY1
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
VHSUB dZr3,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHADD dZi3,dYi2,dYr3
VHSUB qZ1,qY0,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VHADD dZr2,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHSUB dZi2,dYi2,dYr3
VHADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr2,dZi2},[pDst, :128],setStep
VST2 {dZr2,dZi2},[pDst :128],setStep
.else
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VHADD qZ0,qY0,qY1
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
VHADD dZr2,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi2,dYi2,dYr3
VHSUB qZ1,qY0,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHSUB dZr3,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD dZi3,dYi2,dYr3
VHADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr3,dZi3},[pDst, :128],setStep
VST2 {dZr3,dZi3},[pDst :128],setStep
.endif
@ -231,9 +231,9 @@ grpZeroSetLoop\name :
VSUB qY2,qX0,qX2
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
VADD qY1,qX1,qX3
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],step3 @// data[2]
VSUB qY3,qX1,qX3
@ -241,46 +241,46 @@ grpZeroSetLoop\name :
.ifeqs "\inverse", "TRUE"
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
VSUB dZr3,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi3,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VADD dZr2,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr2,dZi2},[pDst, :128],setStep
VST2 {dZr2,dZi2},[pDst :128],setStep
.else
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1
VLD2 {dXr3,dXi3},[pSrc, :128],setStep @// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
VADD dZr2,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZr3,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZi3,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr3,dZi3},[pDst, :128],setStep
VST2 {dZr3,dZi3},[pDst :128],setStep
.endif

View File

@ -147,30 +147,30 @@
@// Update grpCount and grpSize right away
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
MOV step16,#16
LSL grpCount,subFFTSize,#2
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
MOV subFFTNum,#1 @//after the last stage
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
MOV stepTwiddle,#0
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
@// update subFFTSize for the next stage
MOV subFFTSize,grpCount
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
MOV dstStep,outPointStep,LSL #1
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
MOV step24,#24
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
@// Process two groups at a time
@ -209,13 +209,13 @@ grpLoop\name :
.endif
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
.ifeqs "\inverse", "TRUE"
VMULL qT2,dW2r,dXr2
VMLAL qT2,dW2i,dXi2 @// real part
VMULL qT3,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
VMLSL qT3,dW2i,dXr2 @// imag part
.else
@ -223,25 +223,25 @@ grpLoop\name :
VMULL qT2,dW2r,dXr2
VMLSL qT2,dW2i,dXi2 @// real part
VMULL qT3,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
VMLAL qT3,dW2i,dXr2 @// imag part
.endif
VRSHRN dZr1,qT0,#31
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
VRSHRN dZi1,qT1,#31
VMOV qZ0,qX0 @// move qX0 so as to load for the next iteration
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\inverse", "TRUE"
VMULL qT4,dW3r,dXr3
VMLAL qT4,dW3i,dXi3 @// real part
VMULL qT5,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VLD1 dW3r,[pTwiddle :64],step24
VMLSL qT5,dW3i,dXr3 @// imag part
.else
@ -249,18 +249,18 @@ grpLoop\name :
VMULL qT4,dW3r,dXr3
VMLSL qT4,dW3i,dXi3 @// real part
VMULL qT5,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VLD1 dW3r,[pTwiddle :64],step24
VMLAL qT5,dW3i,dXr3 @// imag part
.endif
VRSHRN dZr2,qT2,#31
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
VRSHRN dZi2,qT3,#31
VRSHRN dZr3,qT4,#31
VRSHRN dZi3,qT5,#31
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]! @// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\scaled", "TRUE"
@ -280,17 +280,17 @@ grpLoop\name :
VHSUB qZ0,qY2,qY1
VHADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VHADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VHSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -outPointStep + 16
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -outPointStep + 16
.else
@ -298,17 +298,17 @@ grpLoop\name :
VHSUB qZ0,qY2,qY1
VHSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHADD dZi1,dYi0,dYr3
VHADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -outPointStep + 16
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -outPointStep + 16
.endif
@ -332,17 +332,17 @@ grpLoop\name :
VSUB qZ0,qY2,qY1
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep @// dstStep = -outPointStep + 16
VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -outPointStep + 16
.else
@ -350,17 +350,17 @@ grpLoop\name :
VSUB qZ0,qY2,qY1
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep @// dstStep = -outPointStep + 16
VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -outPointStep + 16
.endif

View File

@ -268,33 +268,33 @@ setLoop\name :
.ifeqs "\inverse", "TRUE"
VHADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VHADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VHSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],dstStep
.else
VHSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHADD dZi1,dYi0,dYr3
VHADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.endif
@ -306,7 +306,7 @@ setLoop\name :
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
VLD2 {dXr0,dXi0},[pSrc, :128]! @// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] for next iteration
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
@ -318,33 +318,33 @@ setLoop\name :
.ifeqs "\inverse", "TRUE"
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2 {dZr1,dZi1},[pDst :128],dstStep
.else
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2 {dZr3,dZi3},[pDst :128],dstStep
.endif
@ -355,11 +355,11 @@ setLoop\name :
BGT setLoop\name
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
BGT grpLoop\name

View File

@ -213,20 +213,20 @@
@// Calculate the step of input data for the next set
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
MOV step1,grpSize,LSL #4
MOV step2,pointStep,LSL #3
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
SUB step2,step2,pointStep @// step2 = 7*pointStep
RSB setStep,step2,#16 @// setStep = - 7*pointStep+16
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7] & update pSrc for the next set
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
@// setStep = -7*pointStep + 16
@// grp = 0 a special case since all the twiddle factors are 1
@// Loop on the sets
@ -256,7 +256,7 @@ grpZeroSetLoop\name :
VHADD qY0,qV0,qV4
VHSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
@ -264,15 +264,15 @@ grpZeroSetLoop\name :
VHADD dYi2,dVi2,dVr6
VHADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
VHSUB dYi6,dVi2,dVr6
VHSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VHSUB qU3,qX1,qX5
VHSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
.ELSE
@ -280,15 +280,15 @@ grpZeroSetLoop\name :
VHSUB dYi6,dVi2,dVr6
VHSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
VHADD dYi2,dVi2,dVr6
VHSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VHSUB qU3,qX1,qX5
VHSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
.ENDIF
@ -301,17 +301,17 @@ grpZeroSetLoop\name :
@// finish second stage of 8 point FFT
VHSUB dVr1,dUr1,dUi5
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VHADD dVi1,dUi1,dUr5
VHADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VHSUB dVi3,dUi1,dUr5
VHSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VHADD dVi5,dUi3,dUr7
VHADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VHSUB dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
@ -320,14 +320,14 @@ grpZeroSetLoop\name :
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
@ -341,32 +341,32 @@ grpZeroSetLoop\name :
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VHSUB dYr3,dVr3,dVr7
VHSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VHADD dYr7,dVr3,dVr7
VHADD dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
.ELSE
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
@ -378,19 +378,19 @@ grpZeroSetLoop\name :
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VHSUB qY5,qV1,qV5
VHSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VHSUB dYi3,dVi3,dVi7
VHADD qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
.ENDIF
@ -415,7 +415,7 @@ grpZeroSetLoop\name :
VADD qY0,qV0,qV4
VSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
@ -423,15 +423,15 @@ grpZeroSetLoop\name :
VADD dYi2,dVi2,dVr6
VADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
VSUB dYi6,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
.ELSE
@ -439,15 +439,15 @@ grpZeroSetLoop\name :
VSUB dYi6,dVi2,dVr6
VSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
VADD dYi2,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
.ENDIF
@ -460,17 +460,17 @@ grpZeroSetLoop\name :
@// finish second stage of 8 point FFT
VSUB dVr1,dUr1,dUi5
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VADD dVi1,dUi1,dUr5
VADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VSUB dVi3,dUi1,dUr5
VSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VADD dVi5,dUi3,dUr7
VADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VSUB dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
@ -479,14 +479,14 @@ grpZeroSetLoop\name :
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
@ -500,32 +500,32 @@ grpZeroSetLoop\name :
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst, :128]! @// store y7
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst :128]! @// store y7
.ELSE
@// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
@// calculate a*v5
VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
@ -537,19 +537,19 @@ grpZeroSetLoop\name :
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VSUB qY5,qV1,qV5
VSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VSUB dYi3,dVi3,dVi7
VADD qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst :128]! @// store y7
.ENDIF

View File

@ -128,6 +128,8 @@
#define half d0.f32
HALF: .float 0.5
@// Allocate stack memory required by the function
@// Write function header
@ -298,7 +300,7 @@ finalComplexToRealFixup:
@// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
ADR t0, HALF
LDR t0, =HALF
VLD1 half[0], [t0]
evenOddButterflyLoop:
@ -400,5 +402,5 @@ End:
@// Write function tail
M_END
HALF: .float 0.5
.end

View File

@ -198,10 +198,10 @@ FFTEnd: @// Does only the scaling
@// N = subFFTSize ; dataptr = pDst ; scale = diff
scaleFFTData:
VLD1 {qX0},[pSrc, :128] @// pSrc contains pDst pointer
VLD1 {qX0},[pSrc :128] @// pSrc contains pDst pointer
SUBS subFFTSize,subFFTSize,#2
VMUL qX0, qX0, dScale[0]
VST1 {qX0},[pSrc, :128]!
VST1 {qX0},[pSrc :128]!
BGT scaleFFTData
End: