Originally committed as revision 19819 to svn://svn.ffmpeg.org/ffmpeg/trunk
| ... | ... |
@@ -164,3 +164,163 @@ function ff_imdct_calc_neon, export=1 |
| 164 | 164 |
|
| 165 | 165 |
pop {r4-r6,pc}
|
| 166 | 166 |
.endfunc |
| 167 |
+ |
|
| 168 |
+ @ ff_mdct_calc_neon: NEON implementation installed as the mdct_calc hook. |
+ @ C prototype: void ff_mdct_calc_neon(MDCTContext *s, FFTSample *output, |
+ @                                     const FFTSample *input); |
+ @ On entry (assuming the standard ARM procedure call convention): |
+ @ r0 = s, r1 = output, r2 = input. |
+ @ Phases: |
+ @  1. two pre-rotation loops combine input samples read upwards and |
+ @     downwards, multiply by the tcos/tsin tables and scatter complex |
+ @     pairs into output at byte offsets taken from revtab; |
+ @  2. ff_fft_calc_neon is called with r0 = s + 16 and r1 = output; |
+ @  3. a post-rotation loop rewrites output in place, walking away from |
+ @     output + n8*8 bytes in both directions. |
+ @ All loops are software pipelined: operands for the next pass are loaded |
+ @ before the results of the current pass are stored. |
+ @ The :128 and :64 qualifiers assert 16-byte and 8-byte aligned addresses. |
+ @ NOTE(review): every loop leaves only when its counter reaches exactly |
+ @ zero (subs then beq), so n = 1 << nbits presumably is always a multiple |
+ @ of 16 -- confirm that callers never pass nbits < 4. |
+function ff_mdct_calc_neon, export=1 |
|
| 169 |
+ push {r4-r10,lr} @ restored (lr into pc) by the pop at the end
|
|
| 170 |
+ @ ---- pre-rotation, first loop ---- |
|
| 171 |
+ mov r12, #1 |
|
| 172 |
+ ldr lr, [r0, #4] @ nbits |
|
| 173 |
+ ldr r4, [r0, #8] @ tcos |
|
| 174 |
+ ldr r5, [r0, #12] @ tsin |
|
| 175 |
+ ldr r3, [r0, #24] @ revtab |
|
| 176 |
+ lsl lr, r12, lr @ n = 1 << nbits |
|
| 177 |
+ add r7, r2, lr @ in4u = input + n bytes, read upwards |
|
| 178 |
+ sub r9, r7, #16 @ in4d = in4u - 16, read downwards |
|
| 179 |
+ add r2, r7, lr, lsl #1 @ in3u = in4u + 2*n bytes, read upwards |
|
| 180 |
+ add r8, r9, lr, lsl #1 @ in3d = in4d + 2*n bytes, read downwards |
|
| 181 |
+ mov r12, #-16 @ post-index step for the downward pointers |
|
| 182 |
+ @ prime the pipeline: operands of the first pass |
|
| 183 |
+ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
|
|
| 184 |
+ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
|
|
| 185 |
+ vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
|
|
| 186 |
+ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 |
|
| 187 |
+ vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
|
|
| 188 |
+ vsub.f32 d20, d18, d20 @ in4d-in4u I |
|
| 189 |
+ vld1.32 {d2}, [r4,:64]! @ c0,c1
|
|
| 190 |
+ vadd.f32 d0, d0, d19 @ in3u+in3d -R |
|
| 191 |
+ vld1.32 {d3}, [r5,:64]! @ s0,s1
|
|
| 192 |
+1: @ two complex results per pass; lr counts down from n in steps of 16 |
|
| 193 |
+ vmul.f32 d7, d20, d3 @ I*s |
|
| 194 |
+ vmul.f32 d6, d0, d2 @ -R*c |
|
| 195 |
+ ldr r6, [r3], #4 @ next 32-bit revtab word; its two 16-bit halves are used below |
|
| 196 |
+ vmul.f32 d4, d0, d3 @ -R*s |
|
| 197 |
+ vmul.f32 d5, d20, d2 @ I*c |
|
| 198 |
+ subs lr, lr, #16 @ Z flag is consumed by the beq below |
|
| 199 |
+ vsub.f32 d6, d6, d7 @ -R*c-I*s |
|
| 200 |
+ vadd.f32 d7, d4, d5 @ -R*s+I*c |
|
| 201 |
+ uxtah r10, r1, r6, ror #16 @ r10 = output + high halfword of r6 (lane 1 destination) |
|
| 202 |
+ uxtah r6, r1, r6 @ r6 = output + low halfword of r6 (lane 0 destination) |
|
| 203 |
+ beq 1f @ final pass: skip the loads for a further pass |
|
| 204 |
+ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
|
|
| 205 |
+ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
|
|
| 206 |
+ vneg.f32 d7, d7 @ R*s-I*c |
|
| 207 |
+ vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
|
|
| 208 |
+ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 |
|
| 209 |
+ vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
|
|
| 210 |
+ vsub.f32 d20, d18, d20 @ in4d-in4u I |
|
| 211 |
+ vld1.32 {d2}, [r4,:64]! @ c0,c1
|
|
| 212 |
+ vadd.f32 d0, d0, d19 @ in3u+in3d -R |
|
| 213 |
+ vld1.32 {d3}, [r5,:64]! @ s0,s1
|
|
| 214 |
+ vst2.32 {d6[0],d7[0]}, [r6,:64] @ lane 0 pair (d6[0], d7[0]) to r6
|
|
| 215 |
+ vst2.32 {d6[1],d7[1]}, [r10,:64] @ lane 1 pair (d6[1], d7[1]) to r10
|
|
| 216 |
+ b 1b |
|
| 217 |
+1: @ tail of the first loop: negate and store the final pass |
|
| 218 |
+ vneg.f32 d7, d7 @ R*s-I*c |
|
| 219 |
+ vst2.32 {d6[0],d7[0]}, [r6,:64]
|
|
| 220 |
+ vst2.32 {d6[1],d7[1]}, [r10,:64]
|
|
| 221 |
+ @ ---- pre-rotation, second loop: r3, r4, r5 and r7 carry on from the first loop ---- |
|
| 222 |
+ mov r12, #1 |
|
| 223 |
+ ldr lr, [r0, #4] @ nbits |
|
| 224 |
+ lsl lr, r12, lr @ n = 1 << nbits |
|
| 225 |
+ sub r8, r2, #16 @ in1d |
|
| 226 |
+ add r2, r9, #16 @ in0u |
|
| 227 |
+ sub r9, r7, #16 @ in2d |
|
| 228 |
+ mov r12, #-16 @ post-index step for the downward pointers |
|
| 229 |
+ @ prime the pipeline: operands of the first pass |
|
| 230 |
+ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
|
|
| 231 |
+ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
|
|
| 232 |
+ vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
|
|
| 233 |
+ vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1 |
|
| 234 |
+ vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
|
|
| 235 |
+ vsub.f32 d0, d0, d18 @ in0u-in2d R |
|
| 236 |
+ vld1.32 {d2}, [r4,:64]! @ c0,c1
|
|
| 237 |
+ vadd.f32 d20, d20, d19 @ in2u+in1d -I |
|
| 238 |
+ vld1.32 {d3}, [r5,:64]! @ s0,s1
|
|
| 239 |
+1: @ same structure as the first loop, but no negation of d7 is needed |
|
| 240 |
+ vmul.f32 d6, d0, d2 @ R*c |
|
| 241 |
+ vmul.f32 d7, d20, d3 @ -I*s |
|
| 242 |
+ ldr r6, [r3], #4 @ next 32-bit revtab word; its two 16-bit halves are used below |
|
| 243 |
+ vmul.f32 d4, d0, d3 @ R*s |
|
| 244 |
+ vmul.f32 d5, d20, d2 @ -I*c (d20 holds -I) |
|
| 245 |
+ subs lr, lr, #16 @ Z flag is consumed by the beq below |
|
| 246 |
+ vsub.f32 d6, d7, d6 @ -I*s-R*c |
|
| 247 |
+ vadd.f32 d7, d4, d5 @ R*s-I*c |
|
| 248 |
+ uxtah r10, r1, r6, ror #16 @ r10 = output + high halfword of r6 (lane 1 destination) |
|
| 249 |
+ uxtah r6, r1, r6 @ r6 = output + low halfword of r6 (lane 0 destination) |
|
| 250 |
+ beq 1f @ final pass: skip the loads for a further pass |
|
| 251 |
+ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
|
|
| 252 |
+ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
|
|
| 253 |
+ vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
|
|
| 254 |
+ vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1 |
|
| 255 |
+ vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
|
|
| 256 |
+ vsub.f32 d0, d0, d18 @ in0u-in2d R |
|
| 257 |
+ vld1.32 {d2}, [r4,:64]! @ c0,c1
|
|
| 258 |
+ vadd.f32 d20, d20, d19 @ in2u+in1d -I |
|
| 259 |
+ vld1.32 {d3}, [r5,:64]! @ s0,s1
|
|
| 260 |
+ vst2.32 {d6[0],d7[0]}, [r6,:64] @ lane 0 pair (d6[0], d7[0]) to r6
|
|
| 261 |
+ vst2.32 {d6[1],d7[1]}, [r10,:64] @ lane 1 pair (d6[1], d7[1]) to r10
|
|
| 262 |
+ b 1b |
|
| 263 |
+1: @ tail of the second loop: store the final pass |
|
| 264 |
+ vst2.32 {d6[0],d7[0]}, [r6,:64]
|
|
| 265 |
+ vst2.32 {d6[1],d7[1]}, [r10,:64]
|
|
| 266 |
+ @ ---- FFT over the data just written to output ---- |
|
| 267 |
+ mov r4, r0 @ s is needed after the call; r4 is read again below |
|
| 268 |
+ mov r6, r1 @ output is needed after the call; r6 is read again below |
|
| 269 |
+ add r0, r0, #16 @ r0 = s + 16 (presumably the embedded FFT context -- confirm) |
|
| 270 |
+ bl ff_fft_calc_neon @ r1 still holds output |
|
| 271 |
+ @ ---- post-rotation ---- |
|
| 272 |
+ mov r12, #1 |
|
| 273 |
+ ldr lr, [r4, #4] @ nbits |
|
| 274 |
+ ldr r5, [r4, #12] @ tsin |
|
| 275 |
+ ldr r4, [r4, #8] @ tcos |
|
| 276 |
+ lsl r12, r12, lr @ n = 1 << nbits |
|
| 277 |
+ lsr lr, r12, #3 @ n8 = n >> 3 |
|
| 278 |
+ @ set up one upward and one downward cursor per array, meeting at index n8 |
|
| 279 |
+ add r4, r4, lr, lsl #2 @ r4 = tcos + n8*4 bytes, read upwards |
|
| 280 |
+ add r5, r5, lr, lsl #2 @ r5 = tsin + n8*4 bytes, read upwards |
|
| 281 |
+ add r6, r6, lr, lsl #3 @ r6 = output + n8*8 bytes, read upwards |
|
| 282 |
+ sub r1, r4, #8 @ r1 = tcos cursor, read downwards |
|
| 283 |
+ sub r2, r5, #8 @ r2 = tsin cursor, read downwards |
|
| 284 |
+ sub r3, r6, #16 @ r3 = output cursor, read downwards |
|
| 285 |
+ @ strides and store cursors |
|
| 286 |
+ mov r7, #-16 @ step for the downward 16-byte accesses |
|
| 287 |
+ mov r12, #-8 @ step for the downward 8-byte accesses |
|
| 288 |
+ mov r8, r6 @ store cursor, upwards (starts where r6 starts) |
|
| 289 |
+ mov r0, r3 @ store cursor, downwards (starts where r3 starts) |
|
| 290 |
+ @ prime the pipeline: operands of the first pass |
|
| 291 |
+ vld2.32 {d0-d1}, [r3,:128], r7 @ de-interleaved: d0 =r1,r0 d1 =i1,i0
|
|
| 292 |
+ vld2.32 {d20-d21},[r6,:128]! @ de-interleaved: d20=r2,r3 d21=i2,i3
|
|
| 293 |
+ vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
|
|
| 294 |
+1: @ two complex values from each side per pass; lr counts down from n8 in steps of 2 |
|
| 295 |
+ subs lr, lr, #2 @ Z flag is consumed by the beq after the arithmetic |
|
| 296 |
+ vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 |
|
| 297 |
+ vld1.32 {d19}, [r5,:64]! @ s2,s3
|
|
| 298 |
+ vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 |
|
| 299 |
+ vld1.32 {d16}, [r1,:64], r12 @ c1,c0
|
|
| 300 |
+ vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 |
|
| 301 |
+ vld1.32 {d17}, [r4,:64]! @ c2,c3
|
|
| 302 |
+ vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 |
|
| 303 |
+ vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 |
|
| 304 |
+ vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 |
|
| 305 |
+ vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 |
|
| 306 |
+ vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 |
|
| 307 |
+ vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 |
|
| 308 |
+ vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 |
|
| 309 |
+ vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 |
|
| 310 |
+ vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 |
|
| 311 |
+ vneg.f32 q2, q2 @ negate the sums held in d4 and d5 |
|
| 312 |
+ beq 1f @ final pass: skip the loads for a further pass |
|
| 313 |
+ vld2.32 {d0-d1}, [r3,:128], r7 @ operands of the next pass, downward side
|
|
| 314 |
+ vld2.32 {d20-d21},[r6,:128]! @ operands of the next pass, upward side
|
|
| 315 |
+ vld1.32 {d18}, [r2,:64], r12 @ next sine pair, downward side
|
|
| 316 |
+ vrev64.32 q3, q3 @ swap the two lanes within d6 and within d7 |
|
| 317 |
+ vst2.32 {d4,d6}, [r0,:128], r7 @ interleave d4 with d6, store on the downward side
|
|
| 318 |
+ vst2.32 {d5,d7}, [r8,:128]! @ interleave d5 with d7, store on the upward side
|
|
| 319 |
+ b 1b |
|
| 320 |
+1: @ tail of the post-rotation loop: store the final pass |
|
| 321 |
+ vrev64.32 q3, q3 @ swap the two lanes within d6 and within d7 |
|
| 322 |
+ vst2.32 {d4,d6}, [r0,:128]
|
|
| 323 |
+ vst2.32 {d5,d7}, [r8,:128]
|
|
| 324 |
+ |
|
| 325 |
+ pop {r4-r10,pc} @ restore the saved registers and return
|
|
| 326 |
+.endfunc |
| ... | ... |
@@ -780,6 +780,7 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input |
| 780 | 780 |
void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input); |
| 781 | 781 |
void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input); |
| 782 | 782 |
void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input); |
| 783 |
+void ff_mdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input); /* ARM NEON implementation installed as the mdct_calc hook */ |
|
| 783 | 784 |
void ff_mdct_end(MDCTContext *s); |
| 784 | 785 |
|
| 785 | 786 |
/* Real Discrete Fourier Transform */ |
| ... | ... |
@@ -119,6 +119,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) |
| 119 | 119 |
s->fft_calc = ff_fft_calc_neon; |
| 120 | 120 |
s->imdct_calc = ff_imdct_calc_neon; |
| 121 | 121 |
s->imdct_half = ff_imdct_half_neon; |
| 122 |
+ s->mdct_calc = ff_mdct_calc_neon; |
|
| 122 | 123 |
revtab_shift = 3; |
| 123 | 124 |
#endif |
| 124 | 125 |
|