| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z)
@ Entry point: dispatch to the fixed-size transform selected by the
@ context's nbits field, tail-calling it so it returns to our caller.
function ff_fft_calc_vfp, export=1
        ldr     ip, [a1, #0]            @ ip = s->nbits (assumes nbits is the first member - TODO confirm against FFTContext layout)
        mov     a1, a2                  @ a1 = z; the fft*_vfp routines take the buffer in a1
        movrel  a2, (fft_tab_vfp - 8)   @ bias by 2 entries: table index 0 corresponds to nbits == 2 (4-point FFT)
        ldr     pc, [a2, ip, lsl #2]    @ tail-call fft(1<<nbits)_vfp; lr still holds our caller
endfunc
@ Jump table of transform entry points, indexed by nbits (biased by -2,
@ see the "- 8" above). Sizes 4 .. 65536 points.
const fft_tab_vfp, relocate=1
        .word fft4_vfp                  @ nbits ==  2
        .word fft8_vfp                  @ nbits ==  3
        .word X(ff_fft16_vfp)           @ nbits ==  4 (exported symbol)
        .word fft32_vfp                 @ nbits ==  5
        .word fft64_vfp                 @ nbits ==  6
        .word fft128_vfp                @ nbits ==  7
        .word fft256_vfp                @ nbits ==  8
        .word fft512_vfp                @ nbits ==  9
        .word fft1024_vfp               @ nbits == 10
        .word fft2048_vfp               @ nbits == 11
        .word fft4096_vfp               @ nbits == 12
        .word fft8192_vfp               @ nbits == 13
        .word fft16384_vfp              @ nbits == 14
        .word fft32768_vfp              @ nbits == 15
        .word fft65536_vfp              @ nbits == 16
endconst
|
|
@ 4-point FFT, in-place on a1 = z (4 interleaved re/im float pairs).
@ Runs entirely in plain scalar VFP mode (FPSCR is not touched here);
@ uses only caller-saved registers s0-s15, so no stacking is needed.
function fft4_vfp
        vldr    d0, [a1, #0*2*4]        @ s0,s1   = z[0].re, z[0].im
        vldr    d4, [a1, #1*2*4]        @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]        @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]        @ s10,s11 = z[3]

        @ stage 1: radix-2 butterflies, sums in s12-s15, differences in s8-s11
        vadd.f  s12, s0, s8             @ s12 = z[0].re + z[1].re
        vadd.f  s13, s1, s9             @ s13 = z[0].im + z[1].im
        vadd.f  s14, s2, s10            @ s14 = z[2].re + z[3].re
        vadd.f  s15, s3, s11            @ s15 = z[2].im + z[3].im
        vsub.f  s8, s0, s8              @ s8  = z[0].re - z[1].re
        vsub.f  s9, s1, s9              @ s9  = z[0].im - z[1].im
        vsub.f  s10, s2, s10            @ s10 = z[2].re - z[3].re
        vsub.f  s11, s3, s11            @ s11 = z[2].im - z[3].im

        @ stage 2: combine; the difference path mixes re/im (multiply by -i)
        vadd.f  s0, s12, s14            @ z[0].re = sum of all four re
        vsub.f  s4, s12, s14            @ z[2].re
        vadd.f  s1, s13, s15            @ z[0].im
        vsub.f  s5, s13, s15            @ z[2].im
        vadd.f  s7, s9, s10             @ z[3].im
        vsub.f  s3, s9, s10             @ z[1].im
        vadd.f  s2, s8, s11             @ z[1].re
        vsub.f  s6, s8, s11             @ z[3].re

        @ store results back in natural order
        vstr    d0, [a1, #0*2*4]        @ z[0] = s0,s1
        vstr    d2, [a1, #2*2*4]        @ z[2] = s4,s5
        vstr    d1, [a1, #1*2*4]        @ z[1] = s2,s3
        vstr    d3, [a1, #3*2*4]        @ z[3] = s6,s7

        bx      lr
endfunc
|
|
@ First part of an 8-point FFT. Must be entered with VFP short-vector
@ mode active (vector length 4, stride 1 - set by the fft8/fft16
@ wrappers via FPSCR). Per the VFP short-vector rules, an operation
@ whose destination is in bank 0 (s0-s7) is scalar; a destination in
@ s8-s31 makes it a 4-long vector op, with an s0-s7 second operand
@ broadcast as a scalar (used for the cos(pi/4) multiply below).
@ Leaves the z[1]/z[3] results in s16-s19 for macro_fft8_tail to store;
@ the load/store interleaving is hand-scheduled - do not reorder.
.macro macro_fft8_head
        @ s8-s11 = z[0],z[2]; s12-s15 = z[1],z[3]
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        @ s24-s27 = z[4],z[6]; s28-s31 = z[5],z[7]
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12            @ vector: s16-s19 = {z0+z1, z2+z3}
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12            @ vector: s20-s23 = {z0-z1, z2-z3}
        @ scalar second stage of the 4-point FFT of z[0..3]
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
        vsub.f  s20, s24, s28           @ vector: s20-s23 = {z4-z5, z6-z7}
        vstr    d0, [a1, #0 * 2*4]      @ spill fft4(z[0..3]) results to memory...
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4             @ s0 = cos(pi/4) = sqrt(2)/2
        vadd.f  s16, s24, s28           @ vector: s16-s19 = {z4+z5, z6+z7}
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]     @ ...and reload them into s24-s31
        vmul.f  s20, s20, s0            @ vector * bank-0 scalar: scale odd-half differences by cos(pi/4)
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ scalar combine of the z[4..7] quarter (twiddles +-1, +-i, (1+-i)/sqrt2)
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
        vadd.f  s4, s21, s20
        vsub.f  s5, s21, s20
        vadd.f  s6, s22, s23
        vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24             @ vector: even results + odd contributions
        vstr    d0, [a1, #0 * 2*4]
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vadd.f  s1, s5, s6
        vadd.f  s0, s7, s4
        vsub.f  s2, s5, s6
        vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12           @ vector: even results - odd contributions
        vsub.f  s5, s29, s1
        vsub.f  s4, s28, s0
        vsub.f  s6, s30, s2
        vsub.f  s7, s31, s3
        vadd.f  s16, s0, s28            @ vector: s16-s19 = z[1]/z[3] outputs, stored by macro_fft8_tail
        @ final stores of z[0],z[2],z[4..7]
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.endm
|
|
@ Second part of the 8-point FFT: store the z[1] and z[3] results that
@ macro_fft8_head left in s16-s19. Split out so fft16 can overlap these
@ stores with its own loads.
.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]      @ z[1] = s16,s17
        vstr    d9, [a1, #3 * 2*4]      @ z[3] = s18,s19
.endm
|
|
@ 8-point FFT kernel: head + tail with no interleaving. Callers must
@ have set vector length 4 in FPSCR and saved s16-s31 (see fft8_vfp).
function .Lfft8_internal_vfp
        macro_fft8_head
        macro_fft8_tail
        bx      lr
endfunc
|
|
@ Public-shape 8-point entry: establishes the VFP environment the kernel
@ needs, then restores the caller's state.
function fft8_vfp
        ldr     a3, =0x03030000         @ DN+FZ set, short-vector length 4, stride 1
        fmrx    a2, FPSCR               @ save caller's FPSCR (a2 is free: z arrives in a1)
        fmxr    FPSCR, a3
        vpush   {s16-s31}               @ s16-s31 are callee-saved under AAPCS
        mov     ip, lr                  @ keep return address; bl below clobbers lr
        bl      .Lfft8_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2               @ restore caller's FP mode
        bx      ip
endfunc
|
|
@ Twiddle constants. The 8-byte alignment matters: fft16 loads the
@ cos1pi4/cos1pi8 pair with a single "vldr d1, cos1pi4".
        .align 3
cos1pi4:                                @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:                                @ cos(1*pi/8)
        .float  0.92387950420379638671875
cos3pi8:                                @ cos(3*pi/8) = sin(1*pi/8)
        .float  0.3826834261417388916015625
|
|
@ 16-point FFT kernel: fft8 on z[0..7], fft4 on z[8..11] and z[12..15],
@ then the combination pass with pi/8-family twiddles. Entered with VFP
@ vector length 4 active (destinations in s8-s31 are 4-long vector ops,
@ bank-0 operands broadcast as scalars); s16-s31 already saved by the
@ wrapper. The instruction order hides load/store latency - do not
@ reorder without re-checking the dataflow.
function .Lfft16_internal_vfp
        macro_fft8_head                 @ fft8 of z[0..7]; z[1],z[3] results still in s16-s19
        @ overlap: load z[8..11] before the fft8 tail stores
        vldr    d10, [a1, #8 * 2*4]     @ s20,s21 = z[8]
        vldr    d12, [a1, #9 * 2*4]     @ s24,s25 = z[9]
        vldr    d11, [a1, #10 * 2*4]    @ s22,s23 = z[10]
        vldr    d13, [a1, #11 * 2*4]    @ s26,s27 = z[11]
        macro_fft8_tail                 @ store z[1],z[3] of the fft8
        vadd.f  s16, s20, s24           @ vector: s16-s19 = {z8+z9, z10+z11}
        @ load z[12..15] interleaved with the z[8..11] butterflies
        vldr    d4, [a1, #12 * 2*4]     @ s8,s9   = z[12]
        vldr    d6, [a1, #13 * 2*4]     @ s12,s13 = z[13]
        vldr    d5, [a1, #14 * 2*4]     @ s10,s11 = z[14]
        vsub.f  s20, s20, s24           @ vector: s20-s23 = {z8-z9, z10-z11}
        vldr    d7, [a1, #15 * 2*4]     @ s14,s15 = z[15]
        @ scalar 4-point FFT of z[8..11]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
        vadd.f  s16, s8, s12            @ vector: s16-s19 = {z12+z13, z14+z15}
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vsub.f  s20, s8, s12            @ vector: s20-s23 = {z12-z13, z14-z15}
        vstr    d3, [a1, #11 * 2*4]
        @ scalar 4-point FFT of z[12..15] (note the different output permutation)
        vldr    d12, [a1, #10 * 2*4]    @ s24,s25 = fft4 result z[10]
        vadd.f  s0, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s6, s16, s18
        vsub.f  s7, s17, s19
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vadd.f  s5, s21, s22
        vsub.f  s4, s20, s23
        vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6                  @ keep z[14].re clear of the d1 load below
        @ combination pass: apply twiddles to the two quarter transforms
        vldr    d6, [a1, #9 * 2*4]      @ s12,s13 = z[9]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4             @ s2 = cos(pi/4), s3 = cos(pi/8) (pair load, 8-byte aligned)
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]     @ s14,s15 = z[13]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
        vmul.f  s20, s12, s3            @ vector * scalar: {z9,z13} * cos(pi/8)
        vldr    d4, [a1, #11 * 2*4]     @ s8,s9   = z[11]
        vldr    d5, [a1, #15 * 2*4]     @ s10,s11 = z[15]
        vldr    s1, cos3pi8             @ s1 = cos(3pi/8)
        vmul.f  s24, s4, s2             @ vector * scalar: * cos(pi/4)
        vmul.f  s28, s12, s1            @ vector * scalar: {z9,z13} * cos(3pi/8)
        vmul.f  s12, s8, s1             @ vector * scalar: {z11,z15} * cos(3pi/8)
        @ cross-combine the scaled re/im parts (rotation by the twiddle angles)
        vadd.f  s4, s20, s29
        vsub.f  s5, s21, s28
        vsub.f  s6, s22, s31
        vadd.f  s7, s23, s30
        vmul.f  s8, s8, s3              @ vector * scalar: {z11,z15} * cos(pi/8)
        @ load the fft8 outputs that pair with these twiddled values
        vldr    d8, [a1, #1 * 2*4]      @ s16,s17 = z[1]
        vldr    d9, [a1, #5 * 2*4]      @ s18,s19 = z[5]
        vldr    d10, [a1, #3 * 2*4]     @ s20,s21 = z[3]
        vldr    d11, [a1, #7 * 2*4]     @ s22,s23 = z[7]
        vldr    d14, [a1, #2 * 2*4]     @ s28,s29 = z[2]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vadd.f  s4, s12, s9
        vsub.f  s5, s13, s8
        vsub.f  s6, s14, s11
        vadd.f  s7, s15, s10
        vadd.f  s12, s0, s16            @ vector: butterfly sums with z[1],z[5]
        vstr    d0, [a1, #1 * 2*4]      @ spill scalar temps through memory...
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]      @ ...and reload into s8-s11
        vldr    d5, [a1, #5 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vsub.f  s8, s16, s8             @ vector: butterfly differences
        vstr    d6, [a1, #1 * 2*4]      @ final z[1], z[5]
        vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]     @ s30,s31 = z[6]
        vsub.f  s4, s20, s0
        vsub.f  s5, s21, s1
        vsub.f  s6, s22, s2
        vsub.f  s7, s23, s3
        vadd.f  s20, s0, s20            @ vector: sums with z[3],z[7]
        vstr    d4, [a1, #9 * 2*4]      @ final z[9]
        vldr    d6, [a1, #8 * 2*4]      @ s12,s13 = z[8]
        vstr    d5, [a1, #13 * 2*4]     @ final z[13]
        vldr    d7, [a1, #12 * 2*4]     @ s14,s15 = z[12]
        vstr    d2, [a1, #11 * 2*4]     @ final z[11]
        vldr    d8, [a1, #0 * 2*4]      @ s16,s17 = z[0]
        vstr    d3, [a1, #15 * 2*4]     @ final z[15]
        vldr    d9, [a1, #4 * 2*4]      @ s18,s19 = z[4]
        @ +-i-twiddle butterflies for the even output lanes
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
        vadd.f  s4, s14, s12
        vadd.f  s5, s13, s15
        vsub.f  s6, s13, s15
        vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28             @ vector: sums with z[2],z[6]
        vstr    d0, [a1, #3 * 2*4]
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]      @ reload temps into s12-s15
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s0, s16, s4
        vsub.f  s1, s17, s5
        vsub.f  s2, s18, s6
        vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12           @ vector: differences with z[2],z[6]
        vadd.f  s16, s4, s16            @ vector: sums with z[0],z[4]
        @ final stores of all remaining outputs
        vstr    d10, [a1, #3 * 2*4]
        vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]

        bx      lr
endfunc
|
|
@ void ff_fft16_vfp(FFTComplex *z)  (exported; also reached via fft_tab_vfp)
@ Same environment dance as fft8_vfp: enable short-vector mode, preserve
@ the callee-saved VFP registers and the caller's FPSCR.
function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000         @ DN+FZ set, short-vector length 4, stride 1
        fmrx    a2, FPSCR               @ save caller's FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}               @ callee-saved VFP registers
        mov     ip, lr                  @ bl clobbers lr
        bl      .Lfft16_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2               @ restore caller's FP mode
        bx      ip
endfunc
|
|
@ Combination pass: merges four already-transformed quarter blocks into
@ one transform of 4*n complex points, applying twiddles from a cosine
@ table. Runs with VFP vector length 4 active.
@   \n          number of butterfly-group iterations (loop runs \n-1 times
@               plus one unrolled trailing iteration)
@   \z0..\z3    pointers to the four quarters; for small sizes def_fft
@               passes the same register several times, with the element
@               offsets o1/o2/o3 (assembler .set symbols) providing the
@               separation instead - hence the conditional pointer
@               advances inside the loop
@   v5          twiddle table, read forwards (vldmia)  - "wre" values
@   v6          derived below, read backwards (vldmdb) - "wim" values
@ Clobbers a4 (loop counter) and v6.
.macro pass n, z0, z1, z2, z3
        add     v6, v5, #4*2*\n         @ v6 = one past the table slice; walked down
        @ software-pipelined prologue: first loads and twiddle multiplies
        vldr    d8, [\z2, #8*(o2+1)]
        vldmdb  v6!, {s2}               @ s2 = next wim (backwards)
        vldr    d9, [\z3, #8*(o3+1)]
        vldmia  v5!, {s0,s1}            @ s0,s1 = next wre pair (forwards)
        vldr    s7, [\z2, #8*o2]
        vmul.f  s20, s16, s2            @ vector * scalar: wim product
        vldr    s0, [\z3, #8*o3]
        vldr    s6, [\z2, #8*o2+4]
        vldr    s3, [\z3, #8*o3+4]
        vmul.f  s16, s16, s1            @ vector * scalar: wre product
        ldr     a4, =\n-1               @ loop iterations (last one peeled below)
1:      add     \z0, \z0, #8*2          @ advance block 0 by 2 complex elements
    .if \n*4*2 >= 512
        add     \z1, \z1, #8*2          @ blocks 1-3 only advance when they are
    .endif                              @ distinct registers (larger transforms)
    .if \n*4*2 >= 256
        add     \z2, \z2, #8*2
    .endif
    .if \n*4*2 >= 512
        add     \z3, \z3, #8*2
    .endif
        @ butterfly using the twiddled z2/z3 values computed last time round
        vadd.f  s4, s0, s7
        vadd.f  s5, s6, s3
        vsub.f  s6, s6, s3
        vsub.f  s7, s0, s7
        vldr    d6, [\z0, #8*0-8*2]
        vadd.f  s0, s16, s21
        vldr    d7, [\z1, #8*o1-8*2]
        vsub.f  s1, s18, s23
        vadd.f  s8, s4, s12             @ vector: z0/z1 sums
        vsub.f  s4, s12, s4             @ vector: z0/z1 differences
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vsub.f  s2, s17, s20
        vadd.f  s3, s19, s22
        vstr    d4, [\z0, #8*0-8*2]
        vstr    d5, [\z1, #8*o1-8*2]
        vstr    d2, [\z2, #8*o2-8*2]
        vadd.f  s4, s1, s0
        vstr    d3, [\z3, #8*o3-8*2]
        vsub.f  s7, s1, s0
        vadd.f  s5, s2, s3
        vsub.f  s6, s2, s3
        @ second element of the pair, overlapped with next iteration's loads
        vldr    d6, [\z0, #8*1-8*2]
        vldr    d7, [\z1, #8*(o1+1)-8*2]
        vldr    d4, [\z2, #8*o2]
        vldmdb  v6!, {s2,s3}            @ next wim pair (backwards)
        vldr    d5, [\z3, #8*o3]
        vadd.f  s20, s4, s12            @ vector
        vldmia  v5!, {s0,s1}            @ next wre pair (forwards)
        vldr    d8, [\z2, #8*(o2+1)]
        vsub.f  s4, s12, s4             @ vector
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vmul.f  s12, s8, s3             @ vector * scalar
        vstr    d10, [\z0, #8*1-8*2]
        vldr    d9, [\z3, #8*(o3+1)]
        vstr    d11, [\z1, #8*(o1+1)-8*2]
        vmul.f  s8, s8, s0              @ vector * scalar
        vstr    d2, [\z2, #8*(o2+1)-8*2]
        vstr    d3, [\z3, #8*(o3+1)-8*2]
        vmul.f  s20, s16, s2            @ vector * scalar
        @ rotate next z2/z3 inputs by the twiddle
        vadd.f  s7, s8, s13
        vsub.f  s6, s9, s12
        vsub.f  s0, s10, s15
        vadd.f  s3, s11, s14
        vmul.f  s16, s16, s1            @ vector * scalar
        subs    a4, a4, #1
        bne     1b
        @ epilogue: final (peeled) iteration, no further loads of twiddles
        vadd.f  s4, s0, s7
        vadd.f  s5, s6, s3
        vsub.f  s6, s6, s3
        vsub.f  s7, s0, s7
        vldr    d6, [\z0, #8*0]
        vadd.f  s0, s16, s21
        vldr    d7, [\z1, #8*o1]
        vsub.f  s1, s18, s23
        vadd.f  s8, s4, s12             @ vector
        vsub.f  s4, s12, s4             @ vector
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vsub.f  s2, s17, s20
        vadd.f  s3, s19, s22
        vstr    d4, [\z0, #8*0]
        vstr    d5, [\z1, #8*o1]
        vstr    d2, [\z2, #8*o2]
        vadd.f  s4, s1, s0
        vstr    d3, [\z3, #8*o3]
        vsub.f  s7, s1, s0
        vadd.f  s5, s2, s3
        vsub.f  s6, s2, s3
        vldr    d6, [\z0, #8*1]
        vldr    d7, [\z1, #8*(o1+1)]
        vadd.f  s20, s4, s12            @ vector
        vsub.f  s4, s12, s4             @ vector
        vsub.f  s5, s13, s5
        vsub.f  s6, s14, s6
        vsub.f  s7, s15, s7
        vstr    d10, [\z0, #8*1]
        vstr    d11, [\z1, #8*(o1+1)]
        vstr    d2, [\z2, #8*(o2+1)]
        vstr    d3, [\z3, #8*(o3+1)]
.endm
|
|
@ Instantiate an \n-point transform from the next two smaller sizes:
@   fft\n(z) = fft\n2(z), fft\n4(z + n/2), fft\n4(z + 3n/4), then one
@   combination "pass" using the ff_cos_\n twiddle table.
@ For sizes below 256 all four pass pointers alias v1 and the o1/o2/o3
@ offsets separate the quarters; 256 <= n < 512 uses two pointers;
@ n >= 512 uses four (hence the different push/pop sets and the
@ conditional pointer advances inside "pass").
.macro def_fft n, n2, n4
function .Lfft\n\()_internal_vfp
    .if \n >= 512
        push    {v1-v6,lr}
    .elseif \n >= 256
        push    {v1-v2,v5-v6,lr}
    .else
        push    {v1,v5-v6,lr}
    .endif
        mov     v1, a1                  @ v1 = z, preserved across the recursive calls
        bl      .Lfft\n2\()_internal_vfp
        add     a1, v1, #8*(\n/4)*2     @ second half, first quarter
        bl      .Lfft\n4\()_internal_vfp
        movrelx v5, X(ff_cos_\n), a1    @ v5 = twiddle table (a1 used as scratch here)
        add     a1, v1, #8*(\n/4)*3     @ second half, second quarter
        bl      .Lfft\n4\()_internal_vfp
    .if \n >= 512
        .set    o1, 0*(\n/4/2)          @ distinct pointers: no element offsets needed
        .set    o2, 0*(\n/4/2)
        .set    o3, 0*(\n/4/2)
        add     v2, v1, #8*2*(\n/4/2)
        add     v3, v1, #8*4*(\n/4/2)
        add     v4, v1, #8*6*(\n/4/2)
        pass    (\n/4/2), v1, v2, v3, v4
        pop     {v1-v6,pc}
    .elseif \n >= 256
        .set    o1, 2*(\n/4/2)          @ z1 shares v1, z3 shares v2
        .set    o2, 0*(\n/4/2)
        .set    o3, 2*(\n/4/2)
        add     v2, v1, #8*4*(\n/4/2)
        pass    (\n/4/2), v1, v1, v2, v2
        pop     {v1-v2,v5-v6,pc}
    .else
        .set    o1, 2*(\n/4/2)          @ all four quarters addressed off v1
        .set    o2, 4*(\n/4/2)
        .set    o3, 6*(\n/4/2)
        pass    (\n/4/2), v1, v1, v1, v1
        pop     {v1,v5-v6,pc}
    .endif
endfunc

@ Wrapper with the standard environment setup (same as fft8_vfp /
@ ff_fft16_vfp): enable vector length 4, save s16-s31 and FPSCR.
function fft\n\()_vfp
        ldr     a3, =0x03030000         @ DN+FZ set, short-vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        mov     ip, lr
        bl      .Lfft\n\()_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      ip
endfunc

        .ltorg                          @ flush literal pool (the =0x03030000 word) while in range
.endm
|
|
@ Instantiate all sizes above 16; each builds on the two next-smaller
@ transforms (args: n, n/2, n/4). fft4/fft8/fft16 are hand-written above.
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384
|
|