Browse code

ARM: 10l: fix large FFTs

Originally committed as revision 19846 to svn://svn.ffmpeg.org/ffmpeg/trunk

Måns Rullgård authored on 2009/09/15 06:37:41
Showing 3 changed files
... ...
@@ -327,8 +327,10 @@ function ff_fft_permute_neon, export=1
327 327
 1:
328 328
         vld1.32         {d0-d1}, [r1,:128]!
329 329
         ldr             r4,  [r0], #4
330
-        uxtah           lr,  r3,  r4
331
-        uxtah           r4,  r3,  r4,  ror #16
330
+        uxth            lr,  r4
331
+        uxth            r4,  r4,  ror #16
332
+        add             lr,  r3,  lr,  lsl #3
333
+        add             r4,  r3,  r4,  lsl #3
332 334
         vst1.32         {d0}, [lr,:64]
333 335
         vst1.32         {d1}, [r4,:64]
334 336
         subs            r12, r12, #2
... ...
@@ -52,8 +52,10 @@ function ff_imdct_half_neon, export=1
52 52
         vmul.f32        d5,  d17, d3
53 53
         vsub.f32        d4,  d6,  d4
54 54
         vadd.f32        d5,  d5,  d7
55
-        uxtah           r8,  r1,  r6,  ror #16
56
-        uxtah           r6,  r1,  r6
55
+        uxth            r8,  r6,  ror #16
56
+        uxth            r6,  r6
57
+        add             r8,  r1,  r8,  lsl #3
58
+        add             r6,  r1,  r6,  lsl #3
57 59
         beq             1f
58 60
         vld2.32         {d16-d17},[r7,:128],r12
59 61
         vld2.32         {d0-d1},  [r2,:128]!
... ...
@@ -198,8 +200,10 @@ function ff_mdct_calc_neon, export=1
198 198
         subs            lr,  lr,  #16
199 199
         vsub.f32        d6,  d6,  d7            @ -R*c-I*s
200 200
         vadd.f32        d7,  d4,  d5            @ -R*s+I*c
201
-        uxtah           r10, r1,  r6,  ror #16
202
-        uxtah           r6,  r1,  r6
201
+        uxth            r10, r6,  ror #16
202
+        uxth            r6,  r6
203
+        add             r10, r1,  r10, lsl #3
204
+        add             r6,  r1,  r6,  lsl #3
203 205
         beq             1f
204 206
         vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
205 207
         vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
... ...
@@ -245,8 +249,10 @@ function ff_mdct_calc_neon, export=1
245 245
         subs            lr,  lr,  #16
246 246
         vsub.f32        d6,  d7,  d6            @ I*s-R*c
247 247
         vadd.f32        d7,  d4,  d5            @ R*s-I*c
248
-        uxtah           r10, r1,  r6,  ror #16
249
-        uxtah           r6,  r1,  r6
248
+        uxth            r10, r6,  ror #16
249
+        uxth            r6,  r6
250
+        add             r10, r1,  r10, lsl #3
251
+        add             r6,  r1,  r6,  lsl #3
250 252
         beq             1f
251 253
         vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
252 254
         vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
... ...
@@ -64,7 +64,6 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
64 64
     float alpha, c1, s1, s2;
65 65
     int split_radix = 1;
66 66
     int av_unused has_vectors;
67
-    int revtab_shift = 0;
68 67
 
69 68
     if (nbits < 2 || nbits > 16)
70 69
         goto fail;
... ...
@@ -120,7 +119,6 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
120 120
     s->imdct_calc  = ff_imdct_calc_neon;
121 121
     s->imdct_half  = ff_imdct_half_neon;
122 122
     s->mdct_calc   = ff_mdct_calc_neon;
123
-    revtab_shift = 3;
124 123
 #endif
125 124
 
126 125
     if (split_radix) {
... ...
@@ -134,8 +132,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
134 134
                 tab[m/2-i] = tab[i];
135 135
         }
136 136
         for(i=0; i<n; i++)
137
-            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] =
138
-                i << revtab_shift;
137
+            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
139 138
         s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
140 139
     } else {
141 140
         int np, nblocks, np2, l;