Originally committed as revision 24077 to svn://svn.ffmpeg.org/ffmpeg/trunk
| ... | ... |
@@ -30,6 +30,7 @@ |
| 30 | 30 |
#include <math.h> |
| 31 | 31 |
#include "libavutil/mathematics.h" |
| 32 | 32 |
#include "fft.h" |
| 33 |
+#include "x86/fft.h" |
|
| 33 | 34 |
|
| 34 | 35 |
#define DCT32_FLOAT |
| 35 | 36 |
#include "dct32.c" |
| ... | ... |
@@ -213,6 +214,7 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse) |
| 213 | 213 |
s->dct_calc = dct32_func; |
| 214 | 214 |
|
| 215 | 215 |
s->dct32 = dct32; |
| 216 |
+ if (HAVE_MMX) ff_dct_init_mmx(s); |
|
| 216 | 217 |
|
| 217 | 218 |
return 0; |
| 218 | 219 |
} |
| ... | ... |
@@ -112,6 +112,7 @@ void ff_fft_calc_c(FFTContext *s, FFTComplex *z); |
| 112 | 112 |
void ff_fft_init_altivec(FFTContext *s); |
| 113 | 113 |
void ff_fft_init_mmx(FFTContext *s); |
| 114 | 114 |
void ff_fft_init_arm(FFTContext *s); |
| 115 |
+void ff_dct_init_mmx(DCTContext *s); |
|
| 115 | 116 |
|
| 116 | 117 |
/** |
| 117 | 118 |
* Do the permutation needed BEFORE calling ff_fft_calc(). |
| ... | ... |
@@ -32,5 +32,6 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input |
| 32 | 32 |
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); |
| 33 | 33 |
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); |
| 34 | 34 |
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); |
| 35 |
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); |
|
| 35 | 36 |
|
| 36 | 37 |
#endif |
| ... | ... |
@@ -20,6 +20,7 @@ |
| 20 | 20 |
*/ |
| 21 | 21 |
|
| 22 | 22 |
#include "libavutil/x86_cpu.h" |
| 23 |
+#include "libavutil/common.h" |
|
| 23 | 24 |
#include "libavcodec/dsputil.h" |
| 24 | 25 |
#include "fft.h" |
| 25 | 26 |
|
| ... | ... |
@@ -201,3 +202,268 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) |
| 201 | 201 |
); |
| 202 | 202 |
} |
| 203 | 203 |
|
| 204 |
/*
 * Multiplier table for ff_dct32_float_sse; it is passed to the asm block as
 * operand %2 and read four floats at a time (one row below per load).
 * Byte offsets 0/16/32/48 are used by pass 1, 64/80 by pass 2, 96 by pass 3,
 * 112 by pass 4 and 128 by pass 5.
 * NOTE(review): the values look like rounded butterfly scale factors; their
 * derivation is not shown in this file -- confirm against the C dct32.
 */
+DECLARE_ALIGNED(16, static const float, b1)[] = {
|
|
| 205 |
+ 0.500603, 0.505471, 0.515447, 0.531043, |
|
| 206 |
+ 0.553104, 0.582935, 0.622504, 0.674808, |
|
| 207 |
+ -1.169440, -0.972568, -0.839350, -0.744536, |
|
| 208 |
+ -10.190008, -3.407609, -2.057781, -1.484165, |
|
| 209 |
+ 0.502419, 0.522499, 0.566944, 0.646822, |
|
| 210 |
+ 0.788155, 1.060678, 1.722447, 5.101149, |
|
| 211 |
+ 0.509796, 0.601345, 0.899976, 2.562916, |
|
| 212 |
+ 1.000000, 1.000000, 1.306563, 0.541196, |
|
| 213 |
+ 1.000000, 0.707107, 1.000000, -0.707107 |
|
| 214 |
+}; |
|
| 215 |
+ |
|
| 216 |
/*
 * XOR mask with only the top bit set in lanes 2 and 3. ff_dct32_float_sse
 * loads it into xmm3 at the start of pass 4 and hands it to
 * BUTTERFLY2/BUTTERFLY3, where xorps uses it to flip signs. Before pass 5 it
 * is rearranged in-register with shufps $0xCC, which moves the set bits to
 * lanes 1 and 3.
 */
+DECLARE_ALIGNED(16, static const int32_t, smask)[4] = {
|
|
| 217 |
+ 0, 0, 0x80000000, 0x80000000 |
|
| 218 |
+}; |
|
| 219 |
+ |
|
| 220 |
+/* butterfly operator on two separate registers:
    tmp = a;  a = (a - b) * c;  b = b + tmp
   a, b and tmp are bare register names (the macro adds the %% prefix itself);
   c is pasted verbatim, so callers pass either a memory operand such as
   16(%2) or an already %%-prefixed register. */ |
|
| 221 |
+#define BUTTERFLY(a,b,c,tmp) \ |
|
| 222 |
+ "movaps %%" #a ", %%" #tmp " \n\t" \ |
|
| 223 |
+ "subps %%" #b ", %%" #a " \n\t" \ |
|
| 224 |
+ "addps %%" #tmp ", %%" #b " \n\t" \ |
|
| 225 |
+ "mulps " #c ", %%" #a " \n\t" |
|
| 226 |
+ |
|
| 227 |
+// Same as BUTTERFLY when vectors a and b overlap, i.e. both live in the single register val: |
//   tmp = val;  val = shufps(val, val, shuf);  tmp ^= mask;  val = (val + tmp) * cos
// mask selects the lanes whose unshuffled value is negated before the add,
// cos holds the per-lane multipliers, and shuf is the shufps immediate
// (passed with its leading '$'). val, mask, cos and tmp are bare register names.
|
| 228 |
+#define BUTTERFLY0(val, mask, cos, tmp, shuf) \ |
|
| 229 |
+ "movaps %%" #val ", %%" #tmp " \n\t" \ |
|
| 230 |
+ "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \ |
|
| 231 |
+ "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \ |
|
| 232 |
+ "addps %%" #tmp ", %%" #val " \n\t" \ |
|
| 233 |
+ "mulps %%" #cos ", %%" #val " \n\t" |
|
| 234 |
+ |
|
| 235 |
/* BUTTERFLY0 with shufps $0x1b: the shuffled copy of val has its four lanes
 * in reverse order (lane i receives lane 3 - i). */
+#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b) |
|
| 236 |
/* BUTTERFLY0 with shufps $0xb1: the shuffled copy of val has the two lanes of
 * each adjacent pair swapped (0 <-> 1, 2 <-> 3). */
+#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1) |
|
| 237 |
+ |
|
| 238 |
/*
 * SSE inline-asm transform from `in` to `out`.
 * NOTE(review): presumably the 32-point float DCT the name suggests -- the
 * reference C implementation is not visible here, confirm against dct32.c.
 *
 * Asm operands: %0 = tmp1 (scratch integer register), %1 = out,
 * %2 = b1 (multiplier table), %3 = smask (sign mask), %4 = in.
 *
 * - All eight 16-byte vectors of `in` (byte offsets 0..127) are loaded with
 *   movaps, and `out` is stored to with movaps over the same offset range, so
 *   both pointers must be 16-byte aligned and cover 128 bytes.
 * - Passes 1-3 combine pairs of registers with BUTTERFLY; passes 4-5 work
 *   inside one register at a time with BUTTERFLY2/BUTTERFLY3. `out` doubles as
 *   spill space for intermediate vectors during these passes.
 * - Pass 6 is scalar: movss/addss sum individual elements of `out` in place,
 *   while tmp1 (%0) carries the elements that are only relocated (movl).
 */
+void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) |
|
| 239 |
+{
|
|
| 240 |
+ int32_t tmp1 = 0; |
|
| 241 |
+ __asm__ volatile( |
|
| 242 |
+ /* pass 1 */ |
|
| 243 |
+ |
|
| 244 |
+ "movaps (%4), %%xmm0 \n\t" |
|
| 245 |
+ "movaps 112(%4), %%xmm1 \n\t" |
|
| 246 |
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t" |
|
| 247 |
+ BUTTERFLY(xmm0, xmm1, (%2), xmm3) |
|
| 248 |
+ |
|
| 249 |
+ "movaps 64(%4), %%xmm7 \n\t" |
|
| 250 |
+ "movaps 48(%4), %%xmm4 \n\t" |
|
| 251 |
+ "shufps $0x1b, %%xmm4, %%xmm4 \n\t" |
|
| 252 |
+ BUTTERFLY(xmm7, xmm4, 48(%2), xmm3) |
|
| 253 |
+ |
|
| 254 |
+ |
|
| 255 |
+ /* pass 2 */ |
|
| 256 |
+ "movaps 64(%2), %%xmm2 \n\t" |
|
| 257 |
+ BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) |
|
| 258 |
+ "movaps %%xmm1, 48(%1) \n\t" |
|
| 259 |
+ "movaps %%xmm4, (%1) \n\t" |
|
| 260 |
+ |
|
| 261 |
+ /* pass 1 */ |
|
| 262 |
+ "movaps 16(%4), %%xmm1 \n\t" |
|
| 263 |
+ "movaps 96(%4), %%xmm6 \n\t" |
|
| 264 |
+ "shufps $0x1b, %%xmm6, %%xmm6 \n\t" |
|
| 265 |
+ BUTTERFLY(xmm1, xmm6, 16(%2), xmm3) |
|
| 266 |
+ |
|
| 267 |
+ "movaps 80(%4), %%xmm4 \n\t" |
|
| 268 |
+ "movaps 32(%4), %%xmm5 \n\t" |
|
| 269 |
+ "shufps $0x1b, %%xmm5, %%xmm5 \n\t" |
|
| 270 |
+ BUTTERFLY(xmm4, xmm5, 32(%2), xmm3) |
|
| 271 |
+ |
|
| 272 |
+ /* pass 2 */ |
|
| 273 |
+ BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3) |
|
| 274 |
+ |
|
| 275 |
+ "movaps 80(%2), %%xmm2 \n\t" |
|
| 276 |
+ BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3) |
|
| 277 |
+ |
|
| 278 |
+ BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3) |
|
| 279 |
+ |
|
| 280 |
+ /* pass 3 */ |
|
| 281 |
+ "movaps 96(%2), %%xmm2 \n\t" |
|
| 282 |
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t" |
|
| 283 |
+ BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3) |
|
| 284 |
+ "movaps %%xmm0, 112(%1) \n\t" |
|
| 285 |
+ "movaps %%xmm1, 96(%1) \n\t" |
|
| 286 |
+ |
|
| 287 |
+ "movaps 0(%1), %%xmm0 \n\t" |
|
| 288 |
+ "shufps $0x1b, %%xmm5, %%xmm5 \n\t" |
|
| 289 |
+ BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3) |
|
| 290 |
+ |
|
| 291 |
+ "movaps 48(%1), %%xmm1 \n\t" |
|
| 292 |
+ "shufps $0x1b, %%xmm6, %%xmm6 \n\t" |
|
| 293 |
+ BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3) |
|
| 294 |
+ "movaps %%xmm1, 48(%1) \n\t" |
|
| 295 |
+ |
|
| 296 |
+ "shufps $0x1b, %%xmm4, %%xmm4 \n\t" |
|
| 297 |
+ BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3) |
|
| 298 |
+ |
|
| 299 |
+ /* pass 4 */ |
|
| 300 |
+ "movaps (%3), %%xmm3 \n\t" |
|
| 301 |
+ "movaps 112(%2), %%xmm2 \n\t" |
|
| 302 |
+ |
|
| 303 |
+ BUTTERFLY2(xmm5, xmm3, xmm2, xmm1) |
|
| 304 |
+ |
|
| 305 |
+ BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) |
|
| 306 |
+ "movaps %%xmm0, 16(%1) \n\t" |
|
| 307 |
+ |
|
| 308 |
+ BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) |
|
| 309 |
+ "movaps %%xmm6, 32(%1) \n\t" |
|
| 310 |
+ |
|
| 311 |
+ "movaps 48(%1), %%xmm0 \n\t" |
|
| 312 |
+ BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) |
|
| 313 |
+ "movaps %%xmm0, 48(%1) \n\t" |
|
| 314 |
+ |
|
| 315 |
+ BUTTERFLY2(xmm4, xmm3, xmm2, xmm1) |
|
| 316 |
+ |
|
| 317 |
+ BUTTERFLY2(xmm7, xmm3, xmm2, xmm1) |
|
| 318 |
+ |
|
| 319 |
+ "movaps 96(%1), %%xmm6 \n\t" |
|
| 320 |
+ BUTTERFLY2(xmm6, xmm3, xmm2, xmm1) |
|
| 321 |
+ |
|
| 322 |
+ "movaps 112(%1), %%xmm0 \n\t" |
|
| 323 |
+ BUTTERFLY2(xmm0, xmm3, xmm2, xmm1) |
|
| 324 |
+ |
|
| 325 |
+ /* pass 5 */ |
|
| 326 |
+ "movaps 128(%2), %%xmm2 \n\t" |
|
| 327 |
+ "shufps $0xCC, %%xmm3,%%xmm3 \n\t" |
|
| 328 |
+ |
|
| 329 |
+ BUTTERFLY3(xmm5, xmm3, xmm2, xmm1) |
|
| 330 |
+ "movaps %%xmm5, (%1) \n\t" |
|
| 331 |
+ |
|
| 332 |
+ "movaps 16(%1), %%xmm1 \n\t" |
|
| 333 |
+ BUTTERFLY3(xmm1, xmm3, xmm2, xmm5) |
|
| 334 |
+ "movaps %%xmm1, 16(%1) \n\t" |
|
| 335 |
+ |
|
| 336 |
+ BUTTERFLY3(xmm4, xmm3, xmm2, xmm5) |
|
| 337 |
+ "movaps %%xmm4, 64(%1) \n\t" |
|
| 338 |
+ |
|
| 339 |
+ BUTTERFLY3(xmm7, xmm3, xmm2, xmm5) |
|
| 340 |
+ "movaps %%xmm7, 80(%1) \n\t" |
|
| 341 |
+ |
|
| 342 |
+ "movaps 32(%1), %%xmm5 \n\t" |
|
| 343 |
+ BUTTERFLY3(xmm5, xmm3, xmm2, xmm7) |
|
| 344 |
+ "movaps %%xmm5, 32(%1) \n\t" |
|
| 345 |
+ |
|
| 346 |
+ "movaps 48(%1), %%xmm4 \n\t" |
|
| 347 |
+ BUTTERFLY3(xmm4, xmm3, xmm2, xmm7) |
|
| 348 |
+ "movaps %%xmm4, 48(%1) \n\t" |
|
| 349 |
+ |
|
| 350 |
+ BUTTERFLY3(xmm6, xmm3, xmm2, xmm7) |
|
| 351 |
+ "movaps %%xmm6, 96(%1) \n\t" |
|
| 352 |
+ |
|
| 353 |
+ BUTTERFLY3(xmm0, xmm3, xmm2, xmm7) |
|
| 354 |
+ "movaps %%xmm0, 112(%1) \n\t" |
|
| 355 |
+ |
|
| 356 |
+ |
|
| 357 |
+ /* pass 6, no SIMD... */ |
|
| 358 |
+ "movss 56(%1), %%xmm3 \n\t" |
|
| 359 |
+ "movl 4(%1), %0 \n\t" |
|
| 360 |
+ "addss 60(%1), %%xmm3 \n\t" |
|
| 361 |
+ "movss 72(%1), %%xmm7 \n\t" |
|
| 362 |
+ "addss %%xmm3, %%xmm4 \n\t" |
|
| 363 |
+ "movss 52(%1), %%xmm2 \n\t" |
|
| 364 |
+ "addss %%xmm3, %%xmm2 \n\t" |
|
| 365 |
+ "movss 24(%1), %%xmm3 \n\t" |
|
| 366 |
+ "addss 28(%1), %%xmm3 \n\t" |
|
| 367 |
+ "addss 76(%1), %%xmm7 \n\t" |
|
| 368 |
+ "addss %%xmm3, %%xmm1 \n\t" |
|
| 369 |
+ "addss %%xmm4, %%xmm5 \n\t" |
|
| 370 |
+ "movss %%xmm1, 16(%1) \n\t" |
|
| 371 |
+ "movss 20(%1), %%xmm1 \n\t" |
|
| 372 |
+ "addss %%xmm3, %%xmm1 \n\t" |
|
| 373 |
+ "movss 40(%1), %%xmm3 \n\t" |
|
| 374 |
+ "movss %%xmm1, 48(%1) \n\t" |
|
| 375 |
+ "addss 44(%1), %%xmm3 \n\t" |
|
| 376 |
+ "movss 20(%1), %%xmm1 \n\t" |
|
| 377 |
+ "addss %%xmm3, %%xmm4 \n\t" |
|
| 378 |
+ "addss %%xmm2, %%xmm3 \n\t" |
|
| 379 |
+ "addss 28(%1), %%xmm1 \n\t" |
|
| 380 |
+ "movss %%xmm3, 40(%1) \n\t" |
|
| 381 |
+ "addss 36(%1), %%xmm2 \n\t" |
|
| 382 |
+ "movss 8(%1), %%xmm3 \n\t" |
|
| 383 |
+ "movss %%xmm2, 56(%1) \n\t" |
|
| 384 |
+ "addss 12(%1), %%xmm3 \n\t" |
|
| 385 |
+ "movss %%xmm5, 8(%1) \n\t" |
|
| 386 |
+ "movss %%xmm3, 32(%1) \n\t" |
|
| 387 |
+ "movss 52(%1), %%xmm2 \n\t" |
|
| 388 |
+ "movss 80(%1), %%xmm3 \n\t" |
|
| 389 |
+ "movss 120(%1), %%xmm5 \n\t" |
|
| 390 |
+ "movss %%xmm1, 80(%1) \n\t" |
|
| 391 |
+ "movss %%xmm4, 24(%1) \n\t" |
|
| 392 |
+ "addss 124(%1), %%xmm5 \n\t" |
|
| 393 |
+ "movss 64(%1), %%xmm1 \n\t" |
|
| 394 |
+ "addss 60(%1), %%xmm2 \n\t" |
|
| 395 |
+ "addss %%xmm5, %%xmm0 \n\t" |
|
| 396 |
+ "addss 116(%1), %%xmm5 \n\t" |
|
| 397 |
+ "movl %0, 64(%1) \n\t" |
|
| 398 |
+ "addss %%xmm0, %%xmm6 \n\t" |
|
| 399 |
+ "addss %%xmm6, %%xmm1 \n\t" |
|
| 400 |
+ "movl 12(%1), %0 \n\t" |
|
| 401 |
+ "movss %%xmm1, 4(%1) \n\t" |
|
| 402 |
+ "movss 88(%1), %%xmm1 \n\t" |
|
| 403 |
+ "movl %0, 96(%1) \n\t" |
|
| 404 |
+ "addss 92(%1), %%xmm1 \n\t" |
|
| 405 |
+ "movss 104(%1), %%xmm4 \n\t" |
|
| 406 |
+ "movl 28(%1), %0 \n\t" |
|
| 407 |
+ "addss 108(%1), %%xmm4 \n\t" |
|
| 408 |
+ "addss %%xmm4, %%xmm0 \n\t" |
|
| 409 |
+ "addss %%xmm1, %%xmm3 \n\t" |
|
| 410 |
+ "addss 84(%1), %%xmm1 \n\t" |
|
| 411 |
+ "addss %%xmm5, %%xmm4 \n\t" |
|
| 412 |
+ "addss %%xmm3, %%xmm6 \n\t" |
|
| 413 |
+ "addss %%xmm0, %%xmm3 \n\t" |
|
| 414 |
+ "addss %%xmm7, %%xmm0 \n\t" |
|
| 415 |
+ "addss 100(%1), %%xmm5 \n\t" |
|
| 416 |
+ "addss %%xmm4, %%xmm7 \n\t" |
|
| 417 |
+ "movl %0, 112(%1) \n\t" |
|
| 418 |
+ "movss %%xmm0, 28(%1) \n\t" |
|
| 419 |
+ "movss 36(%1), %%xmm0 \n\t" |
|
| 420 |
+ "movss %%xmm7, 36(%1) \n\t" |
|
| 421 |
+ "addss %%xmm1, %%xmm4 \n\t" |
|
| 422 |
+ "movss 116(%1), %%xmm7 \n\t" |
|
| 423 |
+ "addss %%xmm2, %%xmm0 \n\t" |
|
| 424 |
+ "addss 124(%1), %%xmm7 \n\t" |
|
| 425 |
+ "movss %%xmm0, 72(%1) \n\t" |
|
| 426 |
+ "movss 44(%1), %%xmm0 \n\t" |
|
| 427 |
+ "movss %%xmm6, 12(%1) \n\t" |
|
| 428 |
+ "movss %%xmm3, 20(%1) \n\t" |
|
| 429 |
+ "addss %%xmm0, %%xmm2 \n\t" |
|
| 430 |
+ "movss %%xmm4, 44(%1) \n\t" |
|
| 431 |
+ "movss %%xmm2, 88(%1) \n\t" |
|
| 432 |
+ "addss 60(%1), %%xmm0 \n\t" |
|
| 433 |
+ "movl 60(%1), %0 \n\t" |
|
| 434 |
+ "movl %0, 120(%1) \n\t" |
|
| 435 |
+ "movss %%xmm0, 104(%1) \n\t" |
|
| 436 |
+ "addss %%xmm5, %%xmm1 \n\t" |
|
| 437 |
+ "addss 68(%1), %%xmm5 \n\t" |
|
| 438 |
+ "movss %%xmm1, 52(%1) \n\t" |
|
| 439 |
+ "movss %%xmm5, 60(%1) \n\t" |
|
| 440 |
+ "movss 68(%1), %%xmm1 \n\t" |
|
| 441 |
+ "movss 100(%1), %%xmm5 \n\t" |
|
| 442 |
+ "addss %%xmm7, %%xmm5 \n\t" |
|
| 443 |
+ "addss 108(%1), %%xmm7 \n\t" |
|
| 444 |
+ "addss %%xmm5, %%xmm1 \n\t" |
|
| 445 |
+ "movss 84(%1), %%xmm2 \n\t" |
|
| 446 |
+ "addss 92(%1), %%xmm2 \n\t" |
|
| 447 |
+ "addss %%xmm2, %%xmm5 \n\t" |
|
| 448 |
+ "movss %%xmm1, 68(%1) \n\t" |
|
| 449 |
+ "addss %%xmm7, %%xmm2 \n\t" |
|
| 450 |
+ "movss 76(%1), %%xmm1 \n\t" |
|
| 451 |
+ "movss %%xmm2, 84(%1) \n\t" |
|
| 452 |
+ "movss %%xmm5, 76(%1) \n\t" |
|
| 453 |
+ "movss 108(%1), %%xmm2 \n\t" |
|
| 454 |
+ "addss %%xmm1, %%xmm7 \n\t" |
|
| 455 |
+ "addss 124(%1), %%xmm2 \n\t" |
|
| 456 |
+ "addss %%xmm2, %%xmm1 \n\t" |
|
| 457 |
+ "addss 92(%1), %%xmm2 \n\t" |
|
| 458 |
+ "movss %%xmm1, 100(%1) \n\t" |
|
| 459 |
+ "movss %%xmm2, 108(%1) \n\t" |
|
| 460 |
+ "movss 92(%1), %%xmm2 \n\t" |
|
| 461 |
+ "movss %%xmm7, 92(%1) \n\t" |
|
| 462 |
+ "addss 124(%1), %%xmm2 \n\t" |
|
| 463 |
+ "movss %%xmm2, 116(%1) \n\t" |
|
| 464 |
+ :"+&r"(tmp1) |
|
| 465 |
+ :"r"(out), "r"(b1), "r"(smask), "r"(in) |
|
| 466 |
+ :"memory" |
/* NOTE(review): the asm above overwrites xmm0-xmm7, but "memory" is the only
 * clobber listed; check whether the xmm registers need to be declared too. */
|
| 467 |
+ ); |
|
| 468 |
+} |