This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.
Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)
... | ... |
@@ -35,6 +35,7 @@ |
35 | 35 |
#include "fft.h" |
36 | 36 |
#include "mpeg4audio.h" |
37 | 37 |
#include "sbr.h" |
38 |
+#include "fmtconvert.h" |
|
38 | 39 |
|
39 | 40 |
#include <stdint.h> |
40 | 41 |
|
... | ... |
@@ -268,6 +269,7 @@ typedef struct { |
268 | 268 |
FFTContext mdct; |
269 | 269 |
FFTContext mdct_small; |
270 | 270 |
DSPContext dsp; |
271 |
+ FmtConvertContext fmt_conv; |
|
271 | 272 |
int random_state; |
272 | 273 |
/** @} */ |
273 | 274 |
|
... | ... |
@@ -85,6 +85,7 @@ |
85 | 85 |
#include "get_bits.h" |
86 | 86 |
#include "dsputil.h" |
87 | 87 |
#include "fft.h" |
88 |
+#include "fmtconvert.h" |
|
88 | 89 |
#include "lpc.h" |
89 | 90 |
|
90 | 91 |
#include "aac.h" |
... | ... |
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx) |
562 | 562 |
ff_aac_sbr_init(); |
563 | 563 |
|
564 | 564 |
dsputil_init(&ac->dsp, avctx); |
565 |
+ ff_fmt_convert_init(&ac->fmt_conv, avctx); |
|
565 | 566 |
|
566 | 567 |
ac->random_state = 0x1f2e3d4c; |
567 | 568 |
|
... | ... |
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data, |
2032 | 2032 |
*data_size = data_size_tmp; |
2033 | 2033 |
|
2034 | 2034 |
if (samples) |
2035 |
- ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); |
|
2035 |
+ ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); |
|
2036 | 2036 |
|
2037 | 2037 |
if (ac->output_configured) |
2038 | 2038 |
ac->output_configured = OC_LOCKED; |
... | ... |
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) |
193 | 193 |
ff_mdct_init(&s->imdct_512, 9, 1, 1.0); |
194 | 194 |
ff_kbd_window_init(s->window, 5.0, 256); |
195 | 195 |
dsputil_init(&s->dsp, avctx); |
196 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
196 | 197 |
av_lfg_init(&s->dith_state, 0); |
197 | 198 |
|
198 | 199 |
/* set scale value for float to int16 conversion */ |
... | ... |
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) |
1255 | 1255 |
} else { |
1256 | 1256 |
gain *= s->dynamic_range[0]; |
1257 | 1257 |
} |
1258 |
- s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); |
|
1258 |
+ s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); |
|
1259 | 1259 |
} |
1260 | 1260 |
|
1261 | 1261 |
/* apply spectral extension to high frequency bins */ |
... | ... |
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, |
1407 | 1407 |
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); |
1408 | 1408 |
err = 1; |
1409 | 1409 |
} |
1410 |
- s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels); |
|
1410 |
+ s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels); |
|
1411 | 1411 |
out_samples += 256 * s->out_channels; |
1412 | 1412 |
} |
1413 | 1413 |
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); |
... | ... |
@@ -55,6 +55,7 @@ |
55 | 55 |
#include "get_bits.h" |
56 | 56 |
#include "dsputil.h" |
57 | 57 |
#include "fft.h" |
58 |
+#include "fmtconvert.h" |
|
58 | 59 |
|
59 | 60 |
/* override ac3.h to include coupling channel */ |
60 | 61 |
#undef AC3_MAX_CHANNELS |
... | ... |
@@ -190,6 +191,7 @@ typedef struct { |
190 | 190 |
|
191 | 191 |
///@defgroup opt optimization |
192 | 192 |
DSPContext dsp; ///< for optimization |
193 |
+ FmtConvertContext fmt_conv; ///< optimized conversion functions |
|
193 | 194 |
float mul_bias; ///< scaling for float_to_int16 conversion |
194 | 195 |
///@} |
195 | 196 |
|
... | ... |
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o |
9 | 9 |
OBJS += arm/dsputil_init_arm.o \ |
10 | 10 |
arm/dsputil_arm.o \ |
11 | 11 |
arm/fft_init_arm.o \ |
12 |
+ arm/fmtconvert_init_arm.o \ |
|
12 | 13 |
arm/jrevdct_arm.o \ |
13 | 14 |
arm/mpegvideo_arm.o \ |
14 | 15 |
arm/simple_idct_arm.o \ |
... | ... |
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ |
22 | 22 |
arm/dsputil_armv6.o \ |
23 | 23 |
arm/simple_idct_armv6.o \ |
24 | 24 |
|
25 |
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ |
|
26 |
+ |
|
25 | 27 |
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ |
26 | 28 |
arm/dsputil_init_vfp.o \ |
29 |
+ $(VFP-OBJS-yes) |
|
27 | 30 |
|
28 | 31 |
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ |
29 | 32 |
arm/mpegvideo_iwmmxt.o \ |
... | ... |
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ |
52 | 52 |
|
53 | 53 |
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ |
54 | 54 |
arm/dsputil_neon.o \ |
55 |
+ arm/fmtconvert_neon.o \ |
|
55 | 56 |
arm/int_neon.o \ |
56 | 57 |
arm/mpegvideo_neon.o \ |
57 | 58 |
arm/simple_idct_neon.o \ |
... | ... |
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, |
153 | 153 |
int len); |
154 | 154 |
void ff_butterflies_float_neon(float *v1, float *v2, int len); |
155 | 155 |
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); |
156 |
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, |
|
157 |
- float mul, int len); |
|
158 | 156 |
void ff_vector_fmul_reverse_neon(float *dst, const float *src0, |
159 | 157 |
const float *src1, int len); |
160 | 158 |
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, |
... | ... |
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, |
162 | 162 |
|
163 | 163 |
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, |
164 | 164 |
int len); |
165 |
-void ff_float_to_int16_neon(int16_t *, const float *, long); |
|
166 |
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); |
|
167 | 165 |
|
168 | 166 |
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); |
169 | 167 |
|
... | ... |
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) |
308 | 308 |
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; |
309 | 309 |
c->butterflies_float = ff_butterflies_float_neon; |
310 | 310 |
c->scalarproduct_float = ff_scalarproduct_float_neon; |
311 |
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; |
|
312 | 311 |
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; |
313 | 312 |
c->vector_fmul_add = ff_vector_fmul_add_neon; |
314 | 313 |
c->vector_clipf = ff_vector_clipf_neon; |
... | ... |
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) |
319 | 319 |
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; |
320 | 320 |
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; |
321 | 321 |
|
322 |
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
323 |
- c->float_to_int16 = ff_float_to_int16_neon; |
|
324 |
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; |
|
325 |
- } |
|
326 |
- |
|
327 | 322 |
if (CONFIG_VORBIS_DECODER) |
328 | 323 |
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; |
329 | 324 |
|
... | ... |
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, |
25 | 25 |
const float *src1, int len); |
26 | 26 |
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, |
27 | 27 |
const float *src1, int len); |
28 |
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
|
29 | 28 |
|
30 | 29 |
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) |
31 | 30 |
{ |
32 | 31 |
c->vector_fmul = ff_vector_fmul_vfp; |
33 | 32 |
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; |
34 |
-#if HAVE_ARMV6 |
|
35 |
- c->float_to_int16 = ff_float_to_int16_vfp; |
|
36 |
-#endif |
|
37 | 33 |
} |
... | ... |
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 |
400 | 400 |
bx lr |
401 | 401 |
endfunc |
402 | 402 |
|
403 |
-function ff_float_to_int16_neon, export=1 |
|
404 |
- subs r2, r2, #8 |
|
405 |
- vld1.64 {d0-d1}, [r1,:128]! |
|
406 |
- vcvt.s32.f32 q8, q0, #16 |
|
407 |
- vld1.64 {d2-d3}, [r1,:128]! |
|
408 |
- vcvt.s32.f32 q9, q1, #16 |
|
409 |
- beq 3f |
|
410 |
- bics ip, r2, #15 |
|
411 |
- beq 2f |
|
412 |
-1: subs ip, ip, #16 |
|
413 |
- vshrn.s32 d4, q8, #16 |
|
414 |
- vld1.64 {d0-d1}, [r1,:128]! |
|
415 |
- vcvt.s32.f32 q0, q0, #16 |
|
416 |
- vshrn.s32 d5, q9, #16 |
|
417 |
- vld1.64 {d2-d3}, [r1,:128]! |
|
418 |
- vcvt.s32.f32 q1, q1, #16 |
|
419 |
- vshrn.s32 d6, q0, #16 |
|
420 |
- vst1.64 {d4-d5}, [r0,:128]! |
|
421 |
- vshrn.s32 d7, q1, #16 |
|
422 |
- vld1.64 {d16-d17},[r1,:128]! |
|
423 |
- vcvt.s32.f32 q8, q8, #16 |
|
424 |
- vld1.64 {d18-d19},[r1,:128]! |
|
425 |
- vcvt.s32.f32 q9, q9, #16 |
|
426 |
- vst1.64 {d6-d7}, [r0,:128]! |
|
427 |
- bne 1b |
|
428 |
- ands r2, r2, #15 |
|
429 |
- beq 3f |
|
430 |
-2: vld1.64 {d0-d1}, [r1,:128]! |
|
431 |
- vshrn.s32 d4, q8, #16 |
|
432 |
- vcvt.s32.f32 q0, q0, #16 |
|
433 |
- vld1.64 {d2-d3}, [r1,:128]! |
|
434 |
- vshrn.s32 d5, q9, #16 |
|
435 |
- vcvt.s32.f32 q1, q1, #16 |
|
436 |
- vshrn.s32 d6, q0, #16 |
|
437 |
- vst1.64 {d4-d5}, [r0,:128]! |
|
438 |
- vshrn.s32 d7, q1, #16 |
|
439 |
- vst1.64 {d6-d7}, [r0,:128]! |
|
440 |
- bx lr |
|
441 |
-3: vshrn.s32 d4, q8, #16 |
|
442 |
- vshrn.s32 d5, q9, #16 |
|
443 |
- vst1.64 {d4-d5}, [r0,:128]! |
|
444 |
- bx lr |
|
445 |
-endfunc |
|
446 |
- |
|
447 |
-function ff_float_to_int16_interleave_neon, export=1 |
|
448 |
- cmp r3, #2 |
|
449 |
- ldrlt r1, [r1] |
|
450 |
- blt ff_float_to_int16_neon |
|
451 |
- bne 4f |
|
452 |
- |
|
453 |
- ldr r3, [r1] |
|
454 |
- ldr r1, [r1, #4] |
|
455 |
- |
|
456 |
- subs r2, r2, #8 |
|
457 |
- vld1.64 {d0-d1}, [r3,:128]! |
|
458 |
- vcvt.s32.f32 q8, q0, #16 |
|
459 |
- vld1.64 {d2-d3}, [r3,:128]! |
|
460 |
- vcvt.s32.f32 q9, q1, #16 |
|
461 |
- vld1.64 {d20-d21},[r1,:128]! |
|
462 |
- vcvt.s32.f32 q10, q10, #16 |
|
463 |
- vld1.64 {d22-d23},[r1,:128]! |
|
464 |
- vcvt.s32.f32 q11, q11, #16 |
|
465 |
- beq 3f |
|
466 |
- bics ip, r2, #15 |
|
467 |
- beq 2f |
|
468 |
-1: subs ip, ip, #16 |
|
469 |
- vld1.64 {d0-d1}, [r3,:128]! |
|
470 |
- vcvt.s32.f32 q0, q0, #16 |
|
471 |
- vsri.32 q10, q8, #16 |
|
472 |
- vld1.64 {d2-d3}, [r3,:128]! |
|
473 |
- vcvt.s32.f32 q1, q1, #16 |
|
474 |
- vld1.64 {d24-d25},[r1,:128]! |
|
475 |
- vcvt.s32.f32 q12, q12, #16 |
|
476 |
- vld1.64 {d26-d27},[r1,:128]! |
|
477 |
- vsri.32 q11, q9, #16 |
|
478 |
- vst1.64 {d20-d21},[r0,:128]! |
|
479 |
- vcvt.s32.f32 q13, q13, #16 |
|
480 |
- vst1.64 {d22-d23},[r0,:128]! |
|
481 |
- vsri.32 q12, q0, #16 |
|
482 |
- vld1.64 {d16-d17},[r3,:128]! |
|
483 |
- vsri.32 q13, q1, #16 |
|
484 |
- vst1.64 {d24-d25},[r0,:128]! |
|
485 |
- vcvt.s32.f32 q8, q8, #16 |
|
486 |
- vld1.64 {d18-d19},[r3,:128]! |
|
487 |
- vcvt.s32.f32 q9, q9, #16 |
|
488 |
- vld1.64 {d20-d21},[r1,:128]! |
|
489 |
- vcvt.s32.f32 q10, q10, #16 |
|
490 |
- vld1.64 {d22-d23},[r1,:128]! |
|
491 |
- vcvt.s32.f32 q11, q11, #16 |
|
492 |
- vst1.64 {d26-d27},[r0,:128]! |
|
493 |
- bne 1b |
|
494 |
- ands r2, r2, #15 |
|
495 |
- beq 3f |
|
496 |
-2: vsri.32 q10, q8, #16 |
|
497 |
- vld1.64 {d0-d1}, [r3,:128]! |
|
498 |
- vcvt.s32.f32 q0, q0, #16 |
|
499 |
- vld1.64 {d2-d3}, [r3,:128]! |
|
500 |
- vcvt.s32.f32 q1, q1, #16 |
|
501 |
- vld1.64 {d24-d25},[r1,:128]! |
|
502 |
- vcvt.s32.f32 q12, q12, #16 |
|
503 |
- vsri.32 q11, q9, #16 |
|
504 |
- vld1.64 {d26-d27},[r1,:128]! |
|
505 |
- vcvt.s32.f32 q13, q13, #16 |
|
506 |
- vst1.64 {d20-d21},[r0,:128]! |
|
507 |
- vsri.32 q12, q0, #16 |
|
508 |
- vst1.64 {d22-d23},[r0,:128]! |
|
509 |
- vsri.32 q13, q1, #16 |
|
510 |
- vst1.64 {d24-d27},[r0,:128]! |
|
511 |
- bx lr |
|
512 |
-3: vsri.32 q10, q8, #16 |
|
513 |
- vsri.32 q11, q9, #16 |
|
514 |
- vst1.64 {d20-d23},[r0,:128]! |
|
515 |
- bx lr |
|
516 |
- |
|
517 |
-4: push {r4-r8,lr} |
|
518 |
- cmp r3, #4 |
|
519 |
- lsl ip, r3, #1 |
|
520 |
- blt 4f |
|
521 |
- |
|
522 |
- @ 4 channels |
|
523 |
-5: ldmia r1!, {r4-r7} |
|
524 |
- mov lr, r2 |
|
525 |
- mov r8, r0 |
|
526 |
- vld1.64 {d16-d17},[r4,:128]! |
|
527 |
- vcvt.s32.f32 q8, q8, #16 |
|
528 |
- vld1.64 {d18-d19},[r5,:128]! |
|
529 |
- vcvt.s32.f32 q9, q9, #16 |
|
530 |
- vld1.64 {d20-d21},[r6,:128]! |
|
531 |
- vcvt.s32.f32 q10, q10, #16 |
|
532 |
- vld1.64 {d22-d23},[r7,:128]! |
|
533 |
- vcvt.s32.f32 q11, q11, #16 |
|
534 |
-6: subs lr, lr, #8 |
|
535 |
- vld1.64 {d0-d1}, [r4,:128]! |
|
536 |
- vcvt.s32.f32 q0, q0, #16 |
|
537 |
- vsri.32 q9, q8, #16 |
|
538 |
- vld1.64 {d2-d3}, [r5,:128]! |
|
539 |
- vcvt.s32.f32 q1, q1, #16 |
|
540 |
- vsri.32 q11, q10, #16 |
|
541 |
- vld1.64 {d4-d5}, [r6,:128]! |
|
542 |
- vcvt.s32.f32 q2, q2, #16 |
|
543 |
- vzip.32 d18, d22 |
|
544 |
- vld1.64 {d6-d7}, [r7,:128]! |
|
545 |
- vcvt.s32.f32 q3, q3, #16 |
|
546 |
- vzip.32 d19, d23 |
|
547 |
- vst1.64 {d18}, [r8], ip |
|
548 |
- vsri.32 q1, q0, #16 |
|
549 |
- vst1.64 {d22}, [r8], ip |
|
550 |
- vsri.32 q3, q2, #16 |
|
551 |
- vst1.64 {d19}, [r8], ip |
|
552 |
- vzip.32 d2, d6 |
|
553 |
- vst1.64 {d23}, [r8], ip |
|
554 |
- vzip.32 d3, d7 |
|
555 |
- beq 7f |
|
556 |
- vld1.64 {d16-d17},[r4,:128]! |
|
557 |
- vcvt.s32.f32 q8, q8, #16 |
|
558 |
- vst1.64 {d2}, [r8], ip |
|
559 |
- vld1.64 {d18-d19},[r5,:128]! |
|
560 |
- vcvt.s32.f32 q9, q9, #16 |
|
561 |
- vst1.64 {d6}, [r8], ip |
|
562 |
- vld1.64 {d20-d21},[r6,:128]! |
|
563 |
- vcvt.s32.f32 q10, q10, #16 |
|
564 |
- vst1.64 {d3}, [r8], ip |
|
565 |
- vld1.64 {d22-d23},[r7,:128]! |
|
566 |
- vcvt.s32.f32 q11, q11, #16 |
|
567 |
- vst1.64 {d7}, [r8], ip |
|
568 |
- b 6b |
|
569 |
-7: vst1.64 {d2}, [r8], ip |
|
570 |
- vst1.64 {d6}, [r8], ip |
|
571 |
- vst1.64 {d3}, [r8], ip |
|
572 |
- vst1.64 {d7}, [r8], ip |
|
573 |
- subs r3, r3, #4 |
|
574 |
- popeq {r4-r8,pc} |
|
575 |
- cmp r3, #4 |
|
576 |
- add r0, r0, #8 |
|
577 |
- bge 5b |
|
578 |
- |
|
579 |
- @ 2 channels |
|
580 |
-4: cmp r3, #2 |
|
581 |
- blt 4f |
|
582 |
- ldmia r1!, {r4-r5} |
|
583 |
- mov lr, r2 |
|
584 |
- mov r8, r0 |
|
585 |
- tst lr, #8 |
|
586 |
- vld1.64 {d16-d17},[r4,:128]! |
|
587 |
- vcvt.s32.f32 q8, q8, #16 |
|
588 |
- vld1.64 {d18-d19},[r5,:128]! |
|
589 |
- vcvt.s32.f32 q9, q9, #16 |
|
590 |
- vld1.64 {d20-d21},[r4,:128]! |
|
591 |
- vcvt.s32.f32 q10, q10, #16 |
|
592 |
- vld1.64 {d22-d23},[r5,:128]! |
|
593 |
- vcvt.s32.f32 q11, q11, #16 |
|
594 |
- beq 6f |
|
595 |
- subs lr, lr, #8 |
|
596 |
- beq 7f |
|
597 |
- vsri.32 d18, d16, #16 |
|
598 |
- vsri.32 d19, d17, #16 |
|
599 |
- vld1.64 {d16-d17},[r4,:128]! |
|
600 |
- vcvt.s32.f32 q8, q8, #16 |
|
601 |
- vst1.32 {d18[0]}, [r8], ip |
|
602 |
- vsri.32 d22, d20, #16 |
|
603 |
- vst1.32 {d18[1]}, [r8], ip |
|
604 |
- vsri.32 d23, d21, #16 |
|
605 |
- vst1.32 {d19[0]}, [r8], ip |
|
606 |
- vst1.32 {d19[1]}, [r8], ip |
|
607 |
- vld1.64 {d18-d19},[r5,:128]! |
|
608 |
- vcvt.s32.f32 q9, q9, #16 |
|
609 |
- vst1.32 {d22[0]}, [r8], ip |
|
610 |
- vst1.32 {d22[1]}, [r8], ip |
|
611 |
- vld1.64 {d20-d21},[r4,:128]! |
|
612 |
- vcvt.s32.f32 q10, q10, #16 |
|
613 |
- vst1.32 {d23[0]}, [r8], ip |
|
614 |
- vst1.32 {d23[1]}, [r8], ip |
|
615 |
- vld1.64 {d22-d23},[r5,:128]! |
|
616 |
- vcvt.s32.f32 q11, q11, #16 |
|
617 |
-6: subs lr, lr, #16 |
|
618 |
- vld1.64 {d0-d1}, [r4,:128]! |
|
619 |
- vcvt.s32.f32 q0, q0, #16 |
|
620 |
- vsri.32 d18, d16, #16 |
|
621 |
- vld1.64 {d2-d3}, [r5,:128]! |
|
622 |
- vcvt.s32.f32 q1, q1, #16 |
|
623 |
- vsri.32 d19, d17, #16 |
|
624 |
- vld1.64 {d4-d5}, [r4,:128]! |
|
625 |
- vcvt.s32.f32 q2, q2, #16 |
|
626 |
- vld1.64 {d6-d7}, [r5,:128]! |
|
627 |
- vcvt.s32.f32 q3, q3, #16 |
|
628 |
- vst1.32 {d18[0]}, [r8], ip |
|
629 |
- vsri.32 d22, d20, #16 |
|
630 |
- vst1.32 {d18[1]}, [r8], ip |
|
631 |
- vsri.32 d23, d21, #16 |
|
632 |
- vst1.32 {d19[0]}, [r8], ip |
|
633 |
- vsri.32 d2, d0, #16 |
|
634 |
- vst1.32 {d19[1]}, [r8], ip |
|
635 |
- vsri.32 d3, d1, #16 |
|
636 |
- vst1.32 {d22[0]}, [r8], ip |
|
637 |
- vsri.32 d6, d4, #16 |
|
638 |
- vst1.32 {d22[1]}, [r8], ip |
|
639 |
- vsri.32 d7, d5, #16 |
|
640 |
- vst1.32 {d23[0]}, [r8], ip |
|
641 |
- vst1.32 {d23[1]}, [r8], ip |
|
642 |
- beq 6f |
|
643 |
- vld1.64 {d16-d17},[r4,:128]! |
|
644 |
- vcvt.s32.f32 q8, q8, #16 |
|
645 |
- vst1.32 {d2[0]}, [r8], ip |
|
646 |
- vst1.32 {d2[1]}, [r8], ip |
|
647 |
- vld1.64 {d18-d19},[r5,:128]! |
|
648 |
- vcvt.s32.f32 q9, q9, #16 |
|
649 |
- vst1.32 {d3[0]}, [r8], ip |
|
650 |
- vst1.32 {d3[1]}, [r8], ip |
|
651 |
- vld1.64 {d20-d21},[r4,:128]! |
|
652 |
- vcvt.s32.f32 q10, q10, #16 |
|
653 |
- vst1.32 {d6[0]}, [r8], ip |
|
654 |
- vst1.32 {d6[1]}, [r8], ip |
|
655 |
- vld1.64 {d22-d23},[r5,:128]! |
|
656 |
- vcvt.s32.f32 q11, q11, #16 |
|
657 |
- vst1.32 {d7[0]}, [r8], ip |
|
658 |
- vst1.32 {d7[1]}, [r8], ip |
|
659 |
- bgt 6b |
|
660 |
-6: vst1.32 {d2[0]}, [r8], ip |
|
661 |
- vst1.32 {d2[1]}, [r8], ip |
|
662 |
- vst1.32 {d3[0]}, [r8], ip |
|
663 |
- vst1.32 {d3[1]}, [r8], ip |
|
664 |
- vst1.32 {d6[0]}, [r8], ip |
|
665 |
- vst1.32 {d6[1]}, [r8], ip |
|
666 |
- vst1.32 {d7[0]}, [r8], ip |
|
667 |
- vst1.32 {d7[1]}, [r8], ip |
|
668 |
- b 8f |
|
669 |
-7: vsri.32 d18, d16, #16 |
|
670 |
- vsri.32 d19, d17, #16 |
|
671 |
- vst1.32 {d18[0]}, [r8], ip |
|
672 |
- vsri.32 d22, d20, #16 |
|
673 |
- vst1.32 {d18[1]}, [r8], ip |
|
674 |
- vsri.32 d23, d21, #16 |
|
675 |
- vst1.32 {d19[0]}, [r8], ip |
|
676 |
- vst1.32 {d19[1]}, [r8], ip |
|
677 |
- vst1.32 {d22[0]}, [r8], ip |
|
678 |
- vst1.32 {d22[1]}, [r8], ip |
|
679 |
- vst1.32 {d23[0]}, [r8], ip |
|
680 |
- vst1.32 {d23[1]}, [r8], ip |
|
681 |
-8: subs r3, r3, #2 |
|
682 |
- add r0, r0, #4 |
|
683 |
- popeq {r4-r8,pc} |
|
684 |
- |
|
685 |
- @ 1 channel |
|
686 |
-4: ldr r4, [r1],#4 |
|
687 |
- tst r2, #8 |
|
688 |
- mov lr, r2 |
|
689 |
- mov r5, r0 |
|
690 |
- vld1.64 {d0-d1}, [r4,:128]! |
|
691 |
- vcvt.s32.f32 q0, q0, #16 |
|
692 |
- vld1.64 {d2-d3}, [r4,:128]! |
|
693 |
- vcvt.s32.f32 q1, q1, #16 |
|
694 |
- bne 8f |
|
695 |
-6: subs lr, lr, #16 |
|
696 |
- vld1.64 {d4-d5}, [r4,:128]! |
|
697 |
- vcvt.s32.f32 q2, q2, #16 |
|
698 |
- vld1.64 {d6-d7}, [r4,:128]! |
|
699 |
- vcvt.s32.f32 q3, q3, #16 |
|
700 |
- vst1.16 {d0[1]}, [r5,:16], ip |
|
701 |
- vst1.16 {d0[3]}, [r5,:16], ip |
|
702 |
- vst1.16 {d1[1]}, [r5,:16], ip |
|
703 |
- vst1.16 {d1[3]}, [r5,:16], ip |
|
704 |
- vst1.16 {d2[1]}, [r5,:16], ip |
|
705 |
- vst1.16 {d2[3]}, [r5,:16], ip |
|
706 |
- vst1.16 {d3[1]}, [r5,:16], ip |
|
707 |
- vst1.16 {d3[3]}, [r5,:16], ip |
|
708 |
- beq 7f |
|
709 |
- vld1.64 {d0-d1}, [r4,:128]! |
|
710 |
- vcvt.s32.f32 q0, q0, #16 |
|
711 |
- vld1.64 {d2-d3}, [r4,:128]! |
|
712 |
- vcvt.s32.f32 q1, q1, #16 |
|
713 |
-7: vst1.16 {d4[1]}, [r5,:16], ip |
|
714 |
- vst1.16 {d4[3]}, [r5,:16], ip |
|
715 |
- vst1.16 {d5[1]}, [r5,:16], ip |
|
716 |
- vst1.16 {d5[3]}, [r5,:16], ip |
|
717 |
- vst1.16 {d6[1]}, [r5,:16], ip |
|
718 |
- vst1.16 {d6[3]}, [r5,:16], ip |
|
719 |
- vst1.16 {d7[1]}, [r5,:16], ip |
|
720 |
- vst1.16 {d7[3]}, [r5,:16], ip |
|
721 |
- bgt 6b |
|
722 |
- pop {r4-r8,pc} |
|
723 |
-8: subs lr, lr, #8 |
|
724 |
- vst1.16 {d0[1]}, [r5,:16], ip |
|
725 |
- vst1.16 {d0[3]}, [r5,:16], ip |
|
726 |
- vst1.16 {d1[1]}, [r5,:16], ip |
|
727 |
- vst1.16 {d1[3]}, [r5,:16], ip |
|
728 |
- vst1.16 {d2[1]}, [r5,:16], ip |
|
729 |
- vst1.16 {d2[3]}, [r5,:16], ip |
|
730 |
- vst1.16 {d3[1]}, [r5,:16], ip |
|
731 |
- vst1.16 {d3[3]}, [r5,:16], ip |
|
732 |
- popeq {r4-r8,pc} |
|
733 |
- vld1.64 {d0-d1}, [r4,:128]! |
|
734 |
- vcvt.s32.f32 q0, q0, #16 |
|
735 |
- vld1.64 {d2-d3}, [r4,:128]! |
|
736 |
- vcvt.s32.f32 q1, q1, #16 |
|
737 |
- b 6b |
|
738 |
-endfunc |
|
739 |
- |
|
740 | 403 |
function ff_vector_fmul_neon, export=1 |
741 | 404 |
subs r3, r3, #8 |
742 | 405 |
vld1.64 {d0-d3}, [r1,:128]! |
... | ... |
@@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] |
1050 | 1050 |
bx lr |
1051 | 1051 |
endfunc |
1052 | 1052 |
|
1053 |
-function ff_int32_to_float_fmul_scalar_neon, export=1 |
|
1054 |
-VFP vdup.32 q0, d0[0] |
|
1055 |
-VFP len .req r2 |
|
1056 |
-NOVFP vdup.32 q0, r2 |
|
1057 |
-NOVFP len .req r3 |
|
1058 |
- |
|
1059 |
- vld1.32 {q1},[r1,:128]! |
|
1060 |
- vcvt.f32.s32 q3, q1 |
|
1061 |
- vld1.32 {q2},[r1,:128]! |
|
1062 |
- vcvt.f32.s32 q8, q2 |
|
1063 |
-1: subs len, len, #8 |
|
1064 |
- pld [r1, #16] |
|
1065 |
- vmul.f32 q9, q3, q0 |
|
1066 |
- vmul.f32 q10, q8, q0 |
|
1067 |
- beq 2f |
|
1068 |
- vld1.32 {q1},[r1,:128]! |
|
1069 |
- vcvt.f32.s32 q3, q1 |
|
1070 |
- vld1.32 {q2},[r1,:128]! |
|
1071 |
- vcvt.f32.s32 q8, q2 |
|
1072 |
- vst1.32 {q9}, [r0,:128]! |
|
1073 |
- vst1.32 {q10},[r0,:128]! |
|
1074 |
- b 1b |
|
1075 |
-2: vst1.32 {q9}, [r0,:128]! |
|
1076 |
- vst1.32 {q10},[r0,:128]! |
|
1077 |
- bx lr |
|
1078 |
- .unreq len |
|
1079 |
-endfunc |
|
1080 |
- |
|
1081 | 1053 |
function ff_vector_fmul_reverse_neon, export=1 |
1082 | 1054 |
add r2, r2, r3, lsl #2 |
1083 | 1055 |
sub r2, r2, #32 |
... | ... |
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 |
131 | 131 |
vpop {d8-d15} |
132 | 132 |
bx lr |
133 | 133 |
endfunc |
134 |
- |
|
135 |
-#if HAVE_ARMV6 |
|
136 |
-/** |
|
137 |
- * ARM VFP optimized float to int16 conversion. |
|
138 |
- * Assume that len is a positive number and is multiple of 8, destination |
|
139 |
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for |
|
140 |
- * performance), little endian byte sex |
|
141 |
- */ |
|
142 |
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) |
|
143 |
-function ff_float_to_int16_vfp, export=1 |
|
144 |
- push {r4-r8,lr} |
|
145 |
- vpush {d8-d11} |
|
146 |
- vldmia r1!, {s16-s23} |
|
147 |
- vcvt.s32.f32 s0, s16 |
|
148 |
- vcvt.s32.f32 s1, s17 |
|
149 |
- vcvt.s32.f32 s2, s18 |
|
150 |
- vcvt.s32.f32 s3, s19 |
|
151 |
- vcvt.s32.f32 s4, s20 |
|
152 |
- vcvt.s32.f32 s5, s21 |
|
153 |
- vcvt.s32.f32 s6, s22 |
|
154 |
- vcvt.s32.f32 s7, s23 |
|
155 |
-1: |
|
156 |
- subs r2, r2, #8 |
|
157 |
- vmov r3, r4, s0, s1 |
|
158 |
- vmov r5, r6, s2, s3 |
|
159 |
- vmov r7, r8, s4, s5 |
|
160 |
- vmov ip, lr, s6, s7 |
|
161 |
- vldmiagt r1!, {s16-s23} |
|
162 |
- ssat r4, #16, r4 |
|
163 |
- ssat r3, #16, r3 |
|
164 |
- ssat r6, #16, r6 |
|
165 |
- ssat r5, #16, r5 |
|
166 |
- pkhbt r3, r3, r4, lsl #16 |
|
167 |
- pkhbt r4, r5, r6, lsl #16 |
|
168 |
- vcvtgt.s32.f32 s0, s16 |
|
169 |
- vcvtgt.s32.f32 s1, s17 |
|
170 |
- vcvtgt.s32.f32 s2, s18 |
|
171 |
- vcvtgt.s32.f32 s3, s19 |
|
172 |
- vcvtgt.s32.f32 s4, s20 |
|
173 |
- vcvtgt.s32.f32 s5, s21 |
|
174 |
- vcvtgt.s32.f32 s6, s22 |
|
175 |
- vcvtgt.s32.f32 s7, s23 |
|
176 |
- ssat r8, #16, r8 |
|
177 |
- ssat r7, #16, r7 |
|
178 |
- ssat lr, #16, lr |
|
179 |
- ssat ip, #16, ip |
|
180 |
- pkhbt r5, r7, r8, lsl #16 |
|
181 |
- pkhbt r6, ip, lr, lsl #16 |
|
182 |
- stmia r0!, {r3-r6} |
|
183 |
- bgt 1b |
|
184 |
- |
|
185 |
- vpop {d8-d11} |
|
186 |
- pop {r4-r8,pc} |
|
187 |
-endfunc |
|
188 |
-#endif |
189 | 134 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,48 @@ |
0 |
+/* |
|
1 |
+ * ARM optimized Format Conversion Utils |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include <stdint.h> |
|
21 |
+ |
|
22 |
+#include "libavcodec/avcodec.h" |
|
23 |
+#include "libavcodec/fmtconvert.h" |
|
24 |
+ |
|
25 |
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, |
|
26 |
+ float mul, int len); |
|
27 |
+ |
|
28 |
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); |
|
29 |
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); |
|
30 |
+ |
|
31 |
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
|
32 |
+ |
|
33 |
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) |
|
34 |
+{ |
|
35 |
+ if (HAVE_ARMVFP && HAVE_ARMV6) { |
|
36 |
+ c->float_to_int16 = ff_float_to_int16_vfp; |
|
37 |
+ } |
|
38 |
+ |
|
39 |
+ if (HAVE_NEON) { |
|
40 |
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; |
|
41 |
+ |
|
42 |
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
43 |
+ c->float_to_int16 = ff_float_to_int16_neon; |
|
44 |
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; |
|
45 |
+ } |
|
46 |
+ } |
|
47 |
+} |
0 | 48 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,391 @@ |
0 |
+/* |
|
1 |
+ * ARM NEON optimised Format Conversion Utils |
|
2 |
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
|
3 |
+ * |
|
4 |
+ * This file is part of FFmpeg. |
|
5 |
+ * |
|
6 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
7 |
+ * modify it under the terms of the GNU Lesser General Public |
|
8 |
+ * License as published by the Free Software Foundation; either |
|
9 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
10 |
+ * |
|
11 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
+ * Lesser General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU Lesser General Public |
|
17 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+#include "config.h" |
|
22 |
+#include "asm.S" |
|
23 |
+ |
|
24 |
+ preserve8 |
|
25 |
+ .text |
|
26 |
+ |
|
27 |
+function ff_float_to_int16_neon, export=1 |
|
28 |
+ subs r2, r2, #8 |
|
29 |
+ vld1.64 {d0-d1}, [r1,:128]! |
|
30 |
+ vcvt.s32.f32 q8, q0, #16 |
|
31 |
+ vld1.64 {d2-d3}, [r1,:128]! |
|
32 |
+ vcvt.s32.f32 q9, q1, #16 |
|
33 |
+ beq 3f |
|
34 |
+ bics ip, r2, #15 |
|
35 |
+ beq 2f |
|
36 |
+1: subs ip, ip, #16 |
|
37 |
+ vshrn.s32 d4, q8, #16 |
|
38 |
+ vld1.64 {d0-d1}, [r1,:128]! |
|
39 |
+ vcvt.s32.f32 q0, q0, #16 |
|
40 |
+ vshrn.s32 d5, q9, #16 |
|
41 |
+ vld1.64 {d2-d3}, [r1,:128]! |
|
42 |
+ vcvt.s32.f32 q1, q1, #16 |
|
43 |
+ vshrn.s32 d6, q0, #16 |
|
44 |
+ vst1.64 {d4-d5}, [r0,:128]! |
|
45 |
+ vshrn.s32 d7, q1, #16 |
|
46 |
+ vld1.64 {d16-d17},[r1,:128]! |
|
47 |
+ vcvt.s32.f32 q8, q8, #16 |
|
48 |
+ vld1.64 {d18-d19},[r1,:128]! |
|
49 |
+ vcvt.s32.f32 q9, q9, #16 |
|
50 |
+ vst1.64 {d6-d7}, [r0,:128]! |
|
51 |
+ bne 1b |
|
52 |
+ ands r2, r2, #15 |
|
53 |
+ beq 3f |
|
54 |
+2: vld1.64 {d0-d1}, [r1,:128]! |
|
55 |
+ vshrn.s32 d4, q8, #16 |
|
56 |
+ vcvt.s32.f32 q0, q0, #16 |
|
57 |
+ vld1.64 {d2-d3}, [r1,:128]! |
|
58 |
+ vshrn.s32 d5, q9, #16 |
|
59 |
+ vcvt.s32.f32 q1, q1, #16 |
|
60 |
+ vshrn.s32 d6, q0, #16 |
|
61 |
+ vst1.64 {d4-d5}, [r0,:128]! |
|
62 |
+ vshrn.s32 d7, q1, #16 |
|
63 |
+ vst1.64 {d6-d7}, [r0,:128]! |
|
64 |
+ bx lr |
|
65 |
+3: vshrn.s32 d4, q8, #16 |
|
66 |
+ vshrn.s32 d5, q9, #16 |
|
67 |
+ vst1.64 {d4-d5}, [r0,:128]! |
|
68 |
+ bx lr |
|
69 |
+endfunc |
|
70 |
+ |
|
71 |
+function ff_float_to_int16_interleave_neon, export=1 |
|
72 |
+ cmp r3, #2 |
|
73 |
+ ldrlt r1, [r1] |
|
74 |
+ blt ff_float_to_int16_neon |
|
75 |
+ bne 4f |
|
76 |
+ |
|
77 |
+ ldr r3, [r1] |
|
78 |
+ ldr r1, [r1, #4] |
|
79 |
+ |
|
80 |
+ subs r2, r2, #8 |
|
81 |
+ vld1.64 {d0-d1}, [r3,:128]! |
|
82 |
+ vcvt.s32.f32 q8, q0, #16 |
|
83 |
+ vld1.64 {d2-d3}, [r3,:128]! |
|
84 |
+ vcvt.s32.f32 q9, q1, #16 |
|
85 |
+ vld1.64 {d20-d21},[r1,:128]! |
|
86 |
+ vcvt.s32.f32 q10, q10, #16 |
|
87 |
+ vld1.64 {d22-d23},[r1,:128]! |
|
88 |
+ vcvt.s32.f32 q11, q11, #16 |
|
89 |
+ beq 3f |
|
90 |
+ bics ip, r2, #15 |
|
91 |
+ beq 2f |
|
92 |
+1: subs ip, ip, #16 |
|
93 |
+ vld1.64 {d0-d1}, [r3,:128]! |
|
94 |
+ vcvt.s32.f32 q0, q0, #16 |
|
95 |
+ vsri.32 q10, q8, #16 |
|
96 |
+ vld1.64 {d2-d3}, [r3,:128]! |
|
97 |
+ vcvt.s32.f32 q1, q1, #16 |
|
98 |
+ vld1.64 {d24-d25},[r1,:128]! |
|
99 |
+ vcvt.s32.f32 q12, q12, #16 |
|
100 |
+ vld1.64 {d26-d27},[r1,:128]! |
|
101 |
+ vsri.32 q11, q9, #16 |
|
102 |
+ vst1.64 {d20-d21},[r0,:128]! |
|
103 |
+ vcvt.s32.f32 q13, q13, #16 |
|
104 |
+ vst1.64 {d22-d23},[r0,:128]! |
|
105 |
+ vsri.32 q12, q0, #16 |
|
106 |
+ vld1.64 {d16-d17},[r3,:128]! |
|
107 |
+ vsri.32 q13, q1, #16 |
|
108 |
+ vst1.64 {d24-d25},[r0,:128]! |
|
109 |
+ vcvt.s32.f32 q8, q8, #16 |
|
110 |
+ vld1.64 {d18-d19},[r3,:128]! |
|
111 |
+ vcvt.s32.f32 q9, q9, #16 |
|
112 |
+ vld1.64 {d20-d21},[r1,:128]! |
|
113 |
+ vcvt.s32.f32 q10, q10, #16 |
|
114 |
+ vld1.64 {d22-d23},[r1,:128]! |
|
115 |
+ vcvt.s32.f32 q11, q11, #16 |
|
116 |
+ vst1.64 {d26-d27},[r0,:128]! |
|
117 |
+ bne 1b |
|
118 |
+ ands r2, r2, #15 |
|
119 |
+ beq 3f |
|
120 |
+2: vsri.32 q10, q8, #16 |
|
121 |
+ vld1.64 {d0-d1}, [r3,:128]! |
|
122 |
+ vcvt.s32.f32 q0, q0, #16 |
|
123 |
+ vld1.64 {d2-d3}, [r3,:128]! |
|
124 |
+ vcvt.s32.f32 q1, q1, #16 |
|
125 |
+ vld1.64 {d24-d25},[r1,:128]! |
|
126 |
+ vcvt.s32.f32 q12, q12, #16 |
|
127 |
+ vsri.32 q11, q9, #16 |
|
128 |
+ vld1.64 {d26-d27},[r1,:128]! |
|
129 |
+ vcvt.s32.f32 q13, q13, #16 |
|
130 |
+ vst1.64 {d20-d21},[r0,:128]! |
|
131 |
+ vsri.32 q12, q0, #16 |
|
132 |
+ vst1.64 {d22-d23},[r0,:128]! |
|
133 |
+ vsri.32 q13, q1, #16 |
|
134 |
+ vst1.64 {d24-d27},[r0,:128]! |
|
135 |
+ bx lr |
|
136 |
+3: vsri.32 q10, q8, #16 |
|
137 |
+ vsri.32 q11, q9, #16 |
|
138 |
+ vst1.64 {d20-d23},[r0,:128]! |
|
139 |
+ bx lr |
|
140 |
+ |
|
141 |
+4: push {r4-r8,lr} |
|
142 |
+ cmp r3, #4 |
|
143 |
+ lsl ip, r3, #1 |
|
144 |
+ blt 4f |
|
145 |
+ |
|
146 |
+ @ 4 channels |
|
147 |
+5: ldmia r1!, {r4-r7} |
|
148 |
+ mov lr, r2 |
|
149 |
+ mov r8, r0 |
|
150 |
+ vld1.64 {d16-d17},[r4,:128]! |
|
151 |
+ vcvt.s32.f32 q8, q8, #16 |
|
152 |
+ vld1.64 {d18-d19},[r5,:128]! |
|
153 |
+ vcvt.s32.f32 q9, q9, #16 |
|
154 |
+ vld1.64 {d20-d21},[r6,:128]! |
|
155 |
+ vcvt.s32.f32 q10, q10, #16 |
|
156 |
+ vld1.64 {d22-d23},[r7,:128]! |
|
157 |
+ vcvt.s32.f32 q11, q11, #16 |
|
158 |
+6: subs lr, lr, #8 |
|
159 |
+ vld1.64 {d0-d1}, [r4,:128]! |
|
160 |
+ vcvt.s32.f32 q0, q0, #16 |
|
161 |
+ vsri.32 q9, q8, #16 |
|
162 |
+ vld1.64 {d2-d3}, [r5,:128]! |
|
163 |
+ vcvt.s32.f32 q1, q1, #16 |
|
164 |
+ vsri.32 q11, q10, #16 |
|
165 |
+ vld1.64 {d4-d5}, [r6,:128]! |
|
166 |
+ vcvt.s32.f32 q2, q2, #16 |
|
167 |
+ vzip.32 d18, d22 |
|
168 |
+ vld1.64 {d6-d7}, [r7,:128]! |
|
169 |
+ vcvt.s32.f32 q3, q3, #16 |
|
170 |
+ vzip.32 d19, d23 |
|
171 |
+ vst1.64 {d18}, [r8], ip |
|
172 |
+ vsri.32 q1, q0, #16 |
|
173 |
+ vst1.64 {d22}, [r8], ip |
|
174 |
+ vsri.32 q3, q2, #16 |
|
175 |
+ vst1.64 {d19}, [r8], ip |
|
176 |
+ vzip.32 d2, d6 |
|
177 |
+ vst1.64 {d23}, [r8], ip |
|
178 |
+ vzip.32 d3, d7 |
|
179 |
+ beq 7f |
|
180 |
+ vld1.64 {d16-d17},[r4,:128]! |
|
181 |
+ vcvt.s32.f32 q8, q8, #16 |
|
182 |
+ vst1.64 {d2}, [r8], ip |
|
183 |
+ vld1.64 {d18-d19},[r5,:128]! |
|
184 |
+ vcvt.s32.f32 q9, q9, #16 |
|
185 |
+ vst1.64 {d6}, [r8], ip |
|
186 |
+ vld1.64 {d20-d21},[r6,:128]! |
|
187 |
+ vcvt.s32.f32 q10, q10, #16 |
|
188 |
+ vst1.64 {d3}, [r8], ip |
|
189 |
+ vld1.64 {d22-d23},[r7,:128]! |
|
190 |
+ vcvt.s32.f32 q11, q11, #16 |
|
191 |
+ vst1.64 {d7}, [r8], ip |
|
192 |
+ b 6b |
|
193 |
+7: vst1.64 {d2}, [r8], ip |
|
194 |
+ vst1.64 {d6}, [r8], ip |
|
195 |
+ vst1.64 {d3}, [r8], ip |
|
196 |
+ vst1.64 {d7}, [r8], ip |
|
197 |
+ subs r3, r3, #4 |
|
198 |
+ popeq {r4-r8,pc} |
|
199 |
+ cmp r3, #4 |
|
200 |
+ add r0, r0, #8 |
|
201 |
+ bge 5b |
|
202 |
+ |
|
203 |
+ @ 2 channels |
|
204 |
+4: cmp r3, #2 |
|
205 |
+ blt 4f |
|
206 |
+ ldmia r1!, {r4-r5} |
|
207 |
+ mov lr, r2 |
|
208 |
+ mov r8, r0 |
|
209 |
+ tst lr, #8 |
|
210 |
+ vld1.64 {d16-d17},[r4,:128]! |
|
211 |
+ vcvt.s32.f32 q8, q8, #16 |
|
212 |
+ vld1.64 {d18-d19},[r5,:128]! |
|
213 |
+ vcvt.s32.f32 q9, q9, #16 |
|
214 |
+ vld1.64 {d20-d21},[r4,:128]! |
|
215 |
+ vcvt.s32.f32 q10, q10, #16 |
|
216 |
+ vld1.64 {d22-d23},[r5,:128]! |
|
217 |
+ vcvt.s32.f32 q11, q11, #16 |
|
218 |
+ beq 6f |
|
219 |
+ subs lr, lr, #8 |
|
220 |
+ beq 7f |
|
221 |
+ vsri.32 d18, d16, #16 |
|
222 |
+ vsri.32 d19, d17, #16 |
|
223 |
+ vld1.64 {d16-d17},[r4,:128]! |
|
224 |
+ vcvt.s32.f32 q8, q8, #16 |
|
225 |
+ vst1.32 {d18[0]}, [r8], ip |
|
226 |
+ vsri.32 d22, d20, #16 |
|
227 |
+ vst1.32 {d18[1]}, [r8], ip |
|
228 |
+ vsri.32 d23, d21, #16 |
|
229 |
+ vst1.32 {d19[0]}, [r8], ip |
|
230 |
+ vst1.32 {d19[1]}, [r8], ip |
|
231 |
+ vld1.64 {d18-d19},[r5,:128]! |
|
232 |
+ vcvt.s32.f32 q9, q9, #16 |
|
233 |
+ vst1.32 {d22[0]}, [r8], ip |
|
234 |
+ vst1.32 {d22[1]}, [r8], ip |
|
235 |
+ vld1.64 {d20-d21},[r4,:128]! |
|
236 |
+ vcvt.s32.f32 q10, q10, #16 |
|
237 |
+ vst1.32 {d23[0]}, [r8], ip |
|
238 |
+ vst1.32 {d23[1]}, [r8], ip |
|
239 |
+ vld1.64 {d22-d23},[r5,:128]! |
|
240 |
+ vcvt.s32.f32 q11, q11, #16 |
|
241 |
+6: subs lr, lr, #16 |
|
242 |
+ vld1.64 {d0-d1}, [r4,:128]! |
|
243 |
+ vcvt.s32.f32 q0, q0, #16 |
|
244 |
+ vsri.32 d18, d16, #16 |
|
245 |
+ vld1.64 {d2-d3}, [r5,:128]! |
|
246 |
+ vcvt.s32.f32 q1, q1, #16 |
|
247 |
+ vsri.32 d19, d17, #16 |
|
248 |
+ vld1.64 {d4-d5}, [r4,:128]! |
|
249 |
+ vcvt.s32.f32 q2, q2, #16 |
|
250 |
+ vld1.64 {d6-d7}, [r5,:128]! |
|
251 |
+ vcvt.s32.f32 q3, q3, #16 |
|
252 |
+ vst1.32 {d18[0]}, [r8], ip |
|
253 |
+ vsri.32 d22, d20, #16 |
|
254 |
+ vst1.32 {d18[1]}, [r8], ip |
|
255 |
+ vsri.32 d23, d21, #16 |
|
256 |
+ vst1.32 {d19[0]}, [r8], ip |
|
257 |
+ vsri.32 d2, d0, #16 |
|
258 |
+ vst1.32 {d19[1]}, [r8], ip |
|
259 |
+ vsri.32 d3, d1, #16 |
|
260 |
+ vst1.32 {d22[0]}, [r8], ip |
|
261 |
+ vsri.32 d6, d4, #16 |
|
262 |
+ vst1.32 {d22[1]}, [r8], ip |
|
263 |
+ vsri.32 d7, d5, #16 |
|
264 |
+ vst1.32 {d23[0]}, [r8], ip |
|
265 |
+ vst1.32 {d23[1]}, [r8], ip |
|
266 |
+ beq 6f |
|
267 |
+ vld1.64 {d16-d17},[r4,:128]! |
|
268 |
+ vcvt.s32.f32 q8, q8, #16 |
|
269 |
+ vst1.32 {d2[0]}, [r8], ip |
|
270 |
+ vst1.32 {d2[1]}, [r8], ip |
|
271 |
+ vld1.64 {d18-d19},[r5,:128]! |
|
272 |
+ vcvt.s32.f32 q9, q9, #16 |
|
273 |
+ vst1.32 {d3[0]}, [r8], ip |
|
274 |
+ vst1.32 {d3[1]}, [r8], ip |
|
275 |
+ vld1.64 {d20-d21},[r4,:128]! |
|
276 |
+ vcvt.s32.f32 q10, q10, #16 |
|
277 |
+ vst1.32 {d6[0]}, [r8], ip |
|
278 |
+ vst1.32 {d6[1]}, [r8], ip |
|
279 |
+ vld1.64 {d22-d23},[r5,:128]! |
|
280 |
+ vcvt.s32.f32 q11, q11, #16 |
|
281 |
+ vst1.32 {d7[0]}, [r8], ip |
|
282 |
+ vst1.32 {d7[1]}, [r8], ip |
|
283 |
+ bgt 6b |
|
284 |
+6: vst1.32 {d2[0]}, [r8], ip |
|
285 |
+ vst1.32 {d2[1]}, [r8], ip |
|
286 |
+ vst1.32 {d3[0]}, [r8], ip |
|
287 |
+ vst1.32 {d3[1]}, [r8], ip |
|
288 |
+ vst1.32 {d6[0]}, [r8], ip |
|
289 |
+ vst1.32 {d6[1]}, [r8], ip |
|
290 |
+ vst1.32 {d7[0]}, [r8], ip |
|
291 |
+ vst1.32 {d7[1]}, [r8], ip |
|
292 |
+ b 8f |
|
293 |
+7: vsri.32 d18, d16, #16 |
|
294 |
+ vsri.32 d19, d17, #16 |
|
295 |
+ vst1.32 {d18[0]}, [r8], ip |
|
296 |
+ vsri.32 d22, d20, #16 |
|
297 |
+ vst1.32 {d18[1]}, [r8], ip |
|
298 |
+ vsri.32 d23, d21, #16 |
|
299 |
+ vst1.32 {d19[0]}, [r8], ip |
|
300 |
+ vst1.32 {d19[1]}, [r8], ip |
|
301 |
+ vst1.32 {d22[0]}, [r8], ip |
|
302 |
+ vst1.32 {d22[1]}, [r8], ip |
|
303 |
+ vst1.32 {d23[0]}, [r8], ip |
|
304 |
+ vst1.32 {d23[1]}, [r8], ip |
|
305 |
+8: subs r3, r3, #2 |
|
306 |
+ add r0, r0, #4 |
|
307 |
+ popeq {r4-r8,pc} |
|
308 |
+ |
|
309 |
+ @ 1 channel |
|
310 |
+4: ldr r4, [r1],#4 |
|
311 |
+ tst r2, #8 |
|
312 |
+ mov lr, r2 |
|
313 |
+ mov r5, r0 |
|
314 |
+ vld1.64 {d0-d1}, [r4,:128]! |
|
315 |
+ vcvt.s32.f32 q0, q0, #16 |
|
316 |
+ vld1.64 {d2-d3}, [r4,:128]! |
|
317 |
+ vcvt.s32.f32 q1, q1, #16 |
|
318 |
+ bne 8f |
|
319 |
+6: subs lr, lr, #16 |
|
320 |
+ vld1.64 {d4-d5}, [r4,:128]! |
|
321 |
+ vcvt.s32.f32 q2, q2, #16 |
|
322 |
+ vld1.64 {d6-d7}, [r4,:128]! |
|
323 |
+ vcvt.s32.f32 q3, q3, #16 |
|
324 |
+ vst1.16 {d0[1]}, [r5,:16], ip |
|
325 |
+ vst1.16 {d0[3]}, [r5,:16], ip |
|
326 |
+ vst1.16 {d1[1]}, [r5,:16], ip |
|
327 |
+ vst1.16 {d1[3]}, [r5,:16], ip |
|
328 |
+ vst1.16 {d2[1]}, [r5,:16], ip |
|
329 |
+ vst1.16 {d2[3]}, [r5,:16], ip |
|
330 |
+ vst1.16 {d3[1]}, [r5,:16], ip |
|
331 |
+ vst1.16 {d3[3]}, [r5,:16], ip |
|
332 |
+ beq 7f |
|
333 |
+ vld1.64 {d0-d1}, [r4,:128]! |
|
334 |
+ vcvt.s32.f32 q0, q0, #16 |
|
335 |
+ vld1.64 {d2-d3}, [r4,:128]! |
|
336 |
+ vcvt.s32.f32 q1, q1, #16 |
|
337 |
+7: vst1.16 {d4[1]}, [r5,:16], ip |
|
338 |
+ vst1.16 {d4[3]}, [r5,:16], ip |
|
339 |
+ vst1.16 {d5[1]}, [r5,:16], ip |
|
340 |
+ vst1.16 {d5[3]}, [r5,:16], ip |
|
341 |
+ vst1.16 {d6[1]}, [r5,:16], ip |
|
342 |
+ vst1.16 {d6[3]}, [r5,:16], ip |
|
343 |
+ vst1.16 {d7[1]}, [r5,:16], ip |
|
344 |
+ vst1.16 {d7[3]}, [r5,:16], ip |
|
345 |
+ bgt 6b |
|
346 |
+ pop {r4-r8,pc} |
|
347 |
+8: subs lr, lr, #8 |
|
348 |
+ vst1.16 {d0[1]}, [r5,:16], ip |
|
349 |
+ vst1.16 {d0[3]}, [r5,:16], ip |
|
350 |
+ vst1.16 {d1[1]}, [r5,:16], ip |
|
351 |
+ vst1.16 {d1[3]}, [r5,:16], ip |
|
352 |
+ vst1.16 {d2[1]}, [r5,:16], ip |
|
353 |
+ vst1.16 {d2[3]}, [r5,:16], ip |
|
354 |
+ vst1.16 {d3[1]}, [r5,:16], ip |
|
355 |
+ vst1.16 {d3[3]}, [r5,:16], ip |
|
356 |
+ popeq {r4-r8,pc} |
|
357 |
+ vld1.64 {d0-d1}, [r4,:128]! |
|
358 |
+ vcvt.s32.f32 q0, q0, #16 |
|
359 |
+ vld1.64 {d2-d3}, [r4,:128]! |
|
360 |
+ vcvt.s32.f32 q1, q1, #16 |
|
361 |
+ b 6b |
|
362 |
+endfunc |
|
363 |
+ |
|
364 |
+function ff_int32_to_float_fmul_scalar_neon, export=1 |
|
365 |
+VFP vdup.32 q0, d0[0] |
|
366 |
+VFP len .req r2 |
|
367 |
+NOVFP vdup.32 q0, r2 |
|
368 |
+NOVFP len .req r3 |
|
369 |
+ |
|
370 |
+ vld1.32 {q1},[r1,:128]! |
|
371 |
+ vcvt.f32.s32 q3, q1 |
|
372 |
+ vld1.32 {q2},[r1,:128]! |
|
373 |
+ vcvt.f32.s32 q8, q2 |
|
374 |
+1: subs len, len, #8 |
|
375 |
+ pld [r1, #16] |
|
376 |
+ vmul.f32 q9, q3, q0 |
|
377 |
+ vmul.f32 q10, q8, q0 |
|
378 |
+ beq 2f |
|
379 |
+ vld1.32 {q1},[r1,:128]! |
|
380 |
+ vcvt.f32.s32 q3, q1 |
|
381 |
+ vld1.32 {q2},[r1,:128]! |
|
382 |
+ vcvt.f32.s32 q8, q2 |
|
383 |
+ vst1.32 {q9}, [r0,:128]! |
|
384 |
+ vst1.32 {q10},[r0,:128]! |
|
385 |
+ b 1b |
|
386 |
+2: vst1.32 {q9}, [r0,:128]! |
|
387 |
+ vst1.32 {q10},[r0,:128]! |
|
388 |
+ bx lr |
|
389 |
+ .unreq len |
|
390 |
+endfunc |
0 | 391 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,77 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "config.h" |
|
21 |
+#include "asm.S" |
|
22 |
+ |
|
23 |
+ .syntax unified |
|
24 |
+ |
|
25 |
+/** |
|
26 |
+ * ARM VFP optimized float to int16 conversion. |
|
27 |
+ * Assume that len is a positive number and is multiple of 8, destination |
|
28 |
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for |
|
29 |
+ * performance), little endian byte sex |
|
30 |
+ */ |
|
31 |
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) |
|
32 |
+function ff_float_to_int16_vfp, export=1 |
|
33 |
+ push {r4-r8,lr} |
|
34 |
+ vpush {d8-d11} |
|
35 |
+ vldmia r1!, {s16-s23} |
|
36 |
+ vcvt.s32.f32 s0, s16 |
|
37 |
+ vcvt.s32.f32 s1, s17 |
|
38 |
+ vcvt.s32.f32 s2, s18 |
|
39 |
+ vcvt.s32.f32 s3, s19 |
|
40 |
+ vcvt.s32.f32 s4, s20 |
|
41 |
+ vcvt.s32.f32 s5, s21 |
|
42 |
+ vcvt.s32.f32 s6, s22 |
|
43 |
+ vcvt.s32.f32 s7, s23 |
|
44 |
+1: |
|
45 |
+ subs r2, r2, #8 |
|
46 |
+ vmov r3, r4, s0, s1 |
|
47 |
+ vmov r5, r6, s2, s3 |
|
48 |
+ vmov r7, r8, s4, s5 |
|
49 |
+ vmov ip, lr, s6, s7 |
|
50 |
+ vldmiagt r1!, {s16-s23} |
|
51 |
+ ssat r4, #16, r4 |
|
52 |
+ ssat r3, #16, r3 |
|
53 |
+ ssat r6, #16, r6 |
|
54 |
+ ssat r5, #16, r5 |
|
55 |
+ pkhbt r3, r3, r4, lsl #16 |
|
56 |
+ pkhbt r4, r5, r6, lsl #16 |
|
57 |
+ vcvtgt.s32.f32 s0, s16 |
|
58 |
+ vcvtgt.s32.f32 s1, s17 |
|
59 |
+ vcvtgt.s32.f32 s2, s18 |
|
60 |
+ vcvtgt.s32.f32 s3, s19 |
|
61 |
+ vcvtgt.s32.f32 s4, s20 |
|
62 |
+ vcvtgt.s32.f32 s5, s21 |
|
63 |
+ vcvtgt.s32.f32 s6, s22 |
|
64 |
+ vcvtgt.s32.f32 s7, s23 |
|
65 |
+ ssat r8, #16, r8 |
|
66 |
+ ssat r7, #16, r7 |
|
67 |
+ ssat lr, #16, lr |
|
68 |
+ ssat ip, #16, ip |
|
69 |
+ pkhbt r5, r7, r8, lsl #16 |
|
70 |
+ pkhbt r6, ip, lr, lsl #16 |
|
71 |
+ stmia r0!, {r3-r6} |
|
72 |
+ bgt 1b |
|
73 |
+ |
|
74 |
+ vpop {d8-d11} |
|
75 |
+ pop {r4-r8,pc} |
|
76 |
+endfunc |
... | ... |
@@ -33,6 +33,7 @@ |
33 | 33 |
#include "get_bits.h" |
34 | 34 |
#include "dsputil.h" |
35 | 35 |
#include "fft.h" |
36 |
+#include "fmtconvert.h" |
|
36 | 37 |
|
37 | 38 |
extern const uint16_t ff_wma_critical_freqs[25]; |
38 | 39 |
|
... | ... |
@@ -43,6 +44,7 @@ typedef struct { |
43 | 43 |
AVCodecContext *avctx; |
44 | 44 |
GetBitContext gb; |
45 | 45 |
DSPContext dsp; |
46 |
+ FmtConvertContext fmt_conv; |
|
46 | 47 |
int first; |
47 | 48 |
int channels; |
48 | 49 |
int frame_len; ///< transform size (samples) |
... | ... |
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx) |
71 | 71 |
|
72 | 72 |
s->avctx = avctx; |
73 | 73 |
dsputil_init(&s->dsp, avctx); |
74 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
74 | 75 |
|
75 | 76 |
/* determine frame length */ |
76 | 77 |
if (avctx->sample_rate < 22050) { |
... | ... |
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct) |
222 | 222 |
ff_rdft_calc(&s->trans.rdft, coeffs); |
223 | 223 |
} |
224 | 224 |
|
225 |
- s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels); |
|
225 |
+ s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, |
|
226 |
+ s->frame_len, s->channels); |
|
226 | 227 |
|
227 | 228 |
if (!s->first) { |
228 | 229 |
int count = s->overlap_len * s->channels; |
... | ... |
@@ -40,6 +40,7 @@ |
40 | 40 |
#include "dca.h" |
41 | 41 |
#include "synth_filter.h" |
42 | 42 |
#include "dcadsp.h" |
43 |
+#include "fmtconvert.h" |
|
43 | 44 |
|
44 | 45 |
//#define TRACE |
45 | 46 |
|
... | ... |
@@ -347,6 +348,7 @@ typedef struct { |
347 | 347 |
FFTContext imdct; |
348 | 348 |
SynthFilterContext synth; |
349 | 349 |
DCADSPContext dcadsp; |
350 |
+ FmtConvertContext fmt_conv; |
|
350 | 351 |
} DCAContext; |
351 | 352 |
|
352 | 353 |
static const uint16_t dca_vlc_offs[] = { |
... | ... |
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index) |
1115 | 1115 |
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); |
1116 | 1116 |
} |
1117 | 1117 |
|
1118 |
- s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l], |
|
1118 |
+ s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l], |
|
1119 | 1119 |
block, rscale, 8); |
1120 | 1120 |
} |
1121 | 1121 |
|
... | ... |
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx, |
1802 | 1802 |
} |
1803 | 1803 |
} |
1804 | 1804 |
|
1805 |
- s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); |
|
1805 |
+ s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); |
|
1806 | 1806 |
samples += 256 * channels; |
1807 | 1807 |
} |
1808 | 1808 |
|
... | ... |
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx) |
1835 | 1835 |
ff_mdct_init(&s->imdct, 6, 1, 1.0); |
1836 | 1836 |
ff_synth_filter_init(&s->synth); |
1837 | 1837 |
ff_dcadsp_init(&s->dcadsp); |
1838 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
1838 | 1839 |
|
1839 | 1840 |
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) |
1840 | 1841 |
s->samples_chanptr[i] = s->samples + i * 256; |
... | ... |
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len) |
3867 | 3867 |
return p; |
3868 | 3868 |
} |
3869 | 3869 |
|
3870 |
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ |
|
3871 |
- int i; |
|
3872 |
- for(i=0; i<len; i++) |
|
3873 |
- dst[i] = src[i] * mul; |
|
3874 |
-} |
|
3875 |
- |
|
3876 | 3870 |
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, |
3877 | 3871 |
uint32_t maxi, uint32_t maxisign) |
3878 | 3872 |
{ |
... | ... |
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i |
3918 | 3918 |
} |
3919 | 3919 |
} |
3920 | 3920 |
|
3921 |
-static av_always_inline int float_to_int16_one(const float *src){ |
|
3922 |
- return av_clip_int16(lrintf(*src)); |
|
3923 |
-} |
|
3924 |
- |
|
3925 |
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){ |
|
3926 |
- int i; |
|
3927 |
- for(i=0; i<len; i++) |
|
3928 |
- dst[i] = float_to_int16_one(src+i); |
|
3929 |
-} |
|
3930 |
- |
|
3931 |
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){ |
|
3932 |
- int i,j,c; |
|
3933 |
- if(channels==2){ |
|
3934 |
- for(i=0; i<len; i++){ |
|
3935 |
- dst[2*i] = float_to_int16_one(src[0]+i); |
|
3936 |
- dst[2*i+1] = float_to_int16_one(src[1]+i); |
|
3937 |
- } |
|
3938 |
- }else{ |
|
3939 |
- for(c=0; c<channels; c++) |
|
3940 |
- for(i=0, j=c; i<len; i++, j+=channels) |
|
3941 |
- dst[j] = float_to_int16_one(src[c]+i); |
|
3942 |
- } |
|
3943 |
-} |
|
3944 |
- |
|
3945 | 3921 |
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) |
3946 | 3922 |
{ |
3947 | 3923 |
int res = 0; |
... | ... |
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
4437 | 4437 |
c->vector_fmul_reverse = vector_fmul_reverse_c; |
4438 | 4438 |
c->vector_fmul_add = vector_fmul_add_c; |
4439 | 4439 |
c->vector_fmul_window = vector_fmul_window_c; |
4440 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
|
4441 | 4440 |
c->vector_clipf = vector_clipf_c; |
4442 |
- c->float_to_int16 = ff_float_to_int16_c; |
|
4443 |
- c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
|
4444 | 4441 |
c->scalarproduct_int16 = scalarproduct_int16_c; |
4445 | 4442 |
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; |
4446 | 4443 |
c->scalarproduct_float = scalarproduct_float_c; |
... | ... |
@@ -392,7 +392,6 @@ typedef struct DSPContext { |
392 | 392 |
/* assume len is a multiple of 4, and arrays are 16-byte aligned */ |
393 | 393 |
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); |
394 | 394 |
/* assume len is a multiple of 8, and arrays are 16-byte aligned */ |
395 |
- void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); |
|
396 | 395 |
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); |
397 | 396 |
/** |
398 | 397 |
* Multiply a vector of floats by a scalar float. Source and |
... | ... |
@@ -445,10 +444,6 @@ typedef struct DSPContext { |
445 | 445 |
*/ |
446 | 446 |
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); |
447 | 447 |
|
448 |
- /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ |
|
449 |
- void (*float_to_int16)(int16_t *dst, const float *src, long len); |
|
450 |
- void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); |
|
451 |
- |
|
452 | 448 |
/* (I)DCT */ |
453 | 449 |
void (*fdct)(DCTELEM *block/* align 16*/); |
454 | 450 |
void (*fdct248)(DCTELEM *block/* align 16*/); |
455 | 451 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,68 @@ |
0 |
+/* |
|
1 |
+ * Format Conversion Utils |
|
2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
4 |
+ * |
|
5 |
+ * This file is part of FFmpeg. |
|
6 |
+ * |
|
7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
9 |
+ * License as published by the Free Software Foundation; either |
|
10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
11 |
+ * |
|
12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+ * Lesser General Public License for more details. |
|
16 |
+ * |
|
17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#include "avcodec.h" |
|
23 |
+#include "fmtconvert.h" |
|
24 |
+ |
|
25 |
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ |
|
26 |
+ int i; |
|
27 |
+ for(i=0; i<len; i++) |
|
28 |
+ dst[i] = src[i] * mul; |
|
29 |
+} |
|
30 |
+ |
|
31 |
+static av_always_inline int float_to_int16_one(const float *src){ |
|
32 |
+ return av_clip_int16(lrintf(*src)); |
|
33 |
+} |
|
34 |
+ |
|
35 |
+static void float_to_int16_c(int16_t *dst, const float *src, long len) |
|
36 |
+{ |
|
37 |
+ int i; |
|
38 |
+ for(i=0; i<len; i++) |
|
39 |
+ dst[i] = float_to_int16_one(src+i); |
|
40 |
+} |
|
41 |
+ |
|
42 |
+static void float_to_int16_interleave_c(int16_t *dst, const float **src, |
|
43 |
+ long len, int channels) |
|
44 |
+{ |
|
45 |
+ int i,j,c; |
|
46 |
+ if(channels==2){ |
|
47 |
+ for(i=0; i<len; i++){ |
|
48 |
+ dst[2*i] = float_to_int16_one(src[0]+i); |
|
49 |
+ dst[2*i+1] = float_to_int16_one(src[1]+i); |
|
50 |
+ } |
|
51 |
+ }else{ |
|
52 |
+ for(c=0; c<channels; c++) |
|
53 |
+ for(i=0, j=c; i<len; i++, j+=channels) |
|
54 |
+ dst[j] = float_to_int16_one(src[c]+i); |
|
55 |
+ } |
|
56 |
+} |
|
57 |
+ |
|
58 |
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) |
|
59 |
+{ |
|
60 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
|
61 |
+ c->float_to_int16 = float_to_int16_c; |
|
62 |
+ c->float_to_int16_interleave = float_to_int16_interleave_c; |
|
63 |
+ |
|
64 |
+ if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); |
|
65 |
+ if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); |
|
66 |
+ if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); |
|
67 |
+} |
0 | 68 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,79 @@ |
0 |
+/* |
|
1 |
+ * Format Conversion Utils |
|
2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
4 |
+ * |
|
5 |
+ * This file is part of FFmpeg. |
|
6 |
+ * |
|
7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
9 |
+ * License as published by the Free Software Foundation; either |
|
10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
11 |
+ * |
|
12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+ * Lesser General Public License for more details. |
|
16 |
+ * |
|
17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#ifndef AVCODEC_FMTCONVERT_H |
|
23 |
+#define AVCODEC_FMTCONVERT_H |
|
24 |
+ |
|
25 |
+#include "avcodec.h" |
|
26 |
+ |
|
27 |
+typedef struct FmtConvertContext { |
|
28 |
+ /** |
|
29 |
+ * Convert an array of int32_t to float and multiply by a float value. |
|
30 |
+ * @param dst destination array of float. |
|
31 |
+ * constraints: 16-byte aligned |
|
32 |
+ * @param src source array of int32_t. |
|
33 |
+ * constraints: 16-byte aligned |
|
34 |
+ * @param len number of elements to convert. |
|
35 |
+ * constraints: multiple of 8 |
|
36 |
+ */ |
|
37 |
+ void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); |
|
38 |
+ |
|
39 |
+ /** |
|
40 |
+ * Convert an array of float to an array of int16_t. |
|
41 |
+ * |
|
42 |
+ * Convert floats from in the range [-32768.0,32767.0] to ints |
|
43 |
+ * without rescaling |
|
44 |
+ * |
|
45 |
+ * @param dst destination array of int16_t. |
|
46 |
+ * constraints: 16-byte aligned |
|
47 |
+ * @param src source array of float. |
|
48 |
+ * constraints: 16-byte aligned |
|
49 |
+ * @param len number of elements to convert. |
|
50 |
+ * constraints: multiple of 8 |
|
51 |
+ */ |
|
52 |
+ void (*float_to_int16)(int16_t *dst, const float *src, long len); |
|
53 |
+ |
|
54 |
+ /** |
|
55 |
+ * Convert multiple arrays of float to an interleaved array of int16_t. |
|
56 |
+ * |
|
57 |
+ * Convert floats from in the range [-32768.0,32767.0] to ints |
|
58 |
+ * without rescaling |
|
59 |
+ * |
|
60 |
+ * @param dst destination array of interleaved int16_t. |
|
61 |
+ * constraints: 16-byte aligned |
|
62 |
+ * @param src source array of float arrays, one for each channel. |
|
63 |
+ * constraints: 16-byte aligned |
|
64 |
+ * @param len number of elements to convert. |
|
65 |
+ * constraints: multiple of 8 |
|
66 |
+ * @param channels number of channels |
|
67 |
+ */ |
|
68 |
+ void (*float_to_int16_interleave)(int16_t *dst, const float **src, |
|
69 |
+ long len, int channels); |
|
70 |
+} FmtConvertContext; |
|
71 |
+ |
|
72 |
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); |
|
73 |
+ |
|
74 |
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); |
|
75 |
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); |
|
76 |
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); |
|
77 |
+ |
|
78 |
+#endif /* AVCODEC_FMTCONVERT_H */ |
... | ... |
@@ -38,6 +38,7 @@ |
38 | 38 |
#include "avcodec.h" |
39 | 39 |
#include "dsputil.h" |
40 | 40 |
#include "fft.h" |
41 |
+#include "fmtconvert.h" |
|
41 | 42 |
|
42 | 43 |
#define ALT_BITSTREAM_READER_LE |
43 | 44 |
#include "get_bits.h" |
... | ... |
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext { |
52 | 52 |
float scale_bias; |
53 | 53 |
DSPContext dsp; |
54 | 54 |
FFTContext imdct_ctx; |
55 |
+ FmtConvertContext fmt_conv; |
|
55 | 56 |
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; |
56 | 57 |
} NellyMoserDecodeContext; |
57 | 58 |
|
... | ... |
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) { |
134 | 134 |
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); |
135 | 135 |
|
136 | 136 |
dsputil_init(&s->dsp, avctx); |
137 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
137 | 138 |
|
138 | 139 |
s->scale_bias = 1.0/(1*8); |
139 | 140 |
|
... | ... |
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx, |
175 | 175 |
|
176 | 176 |
for (i=0 ; i<blocks ; i++) { |
177 | 177 |
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); |
178 |
- s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); |
|
178 |
+ s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); |
|
179 | 179 |
*data_size += NELLY_SAMPLES*sizeof(int16_t); |
180 | 180 |
} |
181 | 181 |
|
... | ... |
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa |
122 | 122 |
} |
123 | 123 |
} |
124 | 124 |
|
125 |
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) |
|
126 |
-{ |
|
127 |
- union { |
|
128 |
- vector float v; |
|
129 |
- float s[4]; |
|
130 |
- } mul_u; |
|
131 |
- int i; |
|
132 |
- vector float src1, src2, dst1, dst2, mul_v, zero; |
|
133 |
- |
|
134 |
- zero = (vector float)vec_splat_u32(0); |
|
135 |
- mul_u.s[0] = mul; |
|
136 |
- mul_v = vec_splat(mul_u.v, 0); |
|
137 |
- |
|
138 |
- for(i=0; i<len; i+=8) { |
|
139 |
- src1 = vec_ctf(vec_ld(0, src+i), 0); |
|
140 |
- src2 = vec_ctf(vec_ld(16, src+i), 0); |
|
141 |
- dst1 = vec_madd(src1, mul_v, zero); |
|
142 |
- dst2 = vec_madd(src2, mul_v, zero); |
|
143 |
- vec_st(dst1, 0, dst+i); |
|
144 |
- vec_st(dst2, 16, dst+i); |
|
145 |
- } |
|
146 |
-} |
|
147 |
- |
|
148 |
- |
|
149 |
-static vector signed short |
|
150 |
-float_to_int16_one_altivec(const float *src) |
|
151 |
-{ |
|
152 |
- vector float s0 = vec_ld(0, src); |
|
153 |
- vector float s1 = vec_ld(16, src); |
|
154 |
- vector signed int t0 = vec_cts(s0, 0); |
|
155 |
- vector signed int t1 = vec_cts(s1, 0); |
|
156 |
- return vec_packs(t0,t1); |
|
157 |
-} |
|
158 |
- |
|
159 |
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len) |
|
160 |
-{ |
|
161 |
- int i; |
|
162 |
- vector signed short d0, d1, d; |
|
163 |
- vector unsigned char align; |
|
164 |
- if(((long)dst)&15) //FIXME |
|
165 |
- for(i=0; i<len-7; i+=8) { |
|
166 |
- d0 = vec_ld(0, dst+i); |
|
167 |
- d = float_to_int16_one_altivec(src+i); |
|
168 |
- d1 = vec_ld(15, dst+i); |
|
169 |
- d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); |
|
170 |
- align = vec_lvsr(0, dst+i); |
|
171 |
- d0 = vec_perm(d1, d, align); |
|
172 |
- d1 = vec_perm(d, d1, align); |
|
173 |
- vec_st(d0, 0, dst+i); |
|
174 |
- vec_st(d1,15, dst+i); |
|
175 |
- } |
|
176 |
- else |
|
177 |
- for(i=0; i<len-7; i+=8) { |
|
178 |
- d = float_to_int16_one_altivec(src+i); |
|
179 |
- vec_st(d, 0, dst+i); |
|
180 |
- } |
|
181 |
-} |
|
182 |
- |
|
183 |
-static void |
|
184 |
-float_to_int16_interleave_altivec(int16_t *dst, const float **src, |
|
185 |
- long len, int channels) |
|
186 |
-{ |
|
187 |
- int i; |
|
188 |
- vector signed short d0, d1, d2, c0, c1, t0, t1; |
|
189 |
- vector unsigned char align; |
|
190 |
- if(channels == 1) |
|
191 |
- float_to_int16_altivec(dst, src[0], len); |
|
192 |
- else |
|
193 |
- if (channels == 2) { |
|
194 |
- if(((long)dst)&15) |
|
195 |
- for(i=0; i<len-7; i+=8) { |
|
196 |
- d0 = vec_ld(0, dst + i); |
|
197 |
- t0 = float_to_int16_one_altivec(src[0] + i); |
|
198 |
- d1 = vec_ld(31, dst + i); |
|
199 |
- t1 = float_to_int16_one_altivec(src[1] + i); |
|
200 |
- c0 = vec_mergeh(t0, t1); |
|
201 |
- c1 = vec_mergel(t0, t1); |
|
202 |
- d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); |
|
203 |
- align = vec_lvsr(0, dst + i); |
|
204 |
- d0 = vec_perm(d2, c0, align); |
|
205 |
- d1 = vec_perm(c0, c1, align); |
|
206 |
- vec_st(d0, 0, dst + i); |
|
207 |
- d0 = vec_perm(c1, d2, align); |
|
208 |
- vec_st(d1, 15, dst + i); |
|
209 |
- vec_st(d0, 31, dst + i); |
|
210 |
- dst+=8; |
|
211 |
- } |
|
212 |
- else |
|
213 |
- for(i=0; i<len-7; i+=8) { |
|
214 |
- t0 = float_to_int16_one_altivec(src[0] + i); |
|
215 |
- t1 = float_to_int16_one_altivec(src[1] + i); |
|
216 |
- d0 = vec_mergeh(t0, t1); |
|
217 |
- d1 = vec_mergel(t0, t1); |
|
218 |
- vec_st(d0, 0, dst + i); |
|
219 |
- vec_st(d1, 16, dst + i); |
|
220 |
- dst+=8; |
|
221 |
- } |
|
222 |
- } else { |
|
223 |
- DECLARE_ALIGNED(16, int16_t, tmp)[len]; |
|
224 |
- int c, j; |
|
225 |
- for (c = 0; c < channels; c++) { |
|
226 |
- float_to_int16_altivec(tmp, src[c], len); |
|
227 |
- for (i = 0, j = c; i < len; i++, j+=channels) { |
|
228 |
- dst[j] = tmp[i]; |
|
229 |
- } |
|
230 |
- } |
|
231 |
- } |
|
232 |
-} |
|
233 |
- |
|
234 | 125 |
void float_init_altivec(DSPContext* c, AVCodecContext *avctx) |
235 | 126 |
{ |
236 | 127 |
c->vector_fmul = vector_fmul_altivec; |
237 | 128 |
c->vector_fmul_reverse = vector_fmul_reverse_altivec; |
238 | 129 |
c->vector_fmul_add = vector_fmul_add_altivec; |
239 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; |
|
240 | 130 |
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
241 | 131 |
c->vector_fmul_window = vector_fmul_window_altivec; |
242 |
- c->float_to_int16 = float_to_int16_altivec; |
|
243 |
- c->float_to_int16_interleave = float_to_int16_interleave_altivec; |
|
244 | 132 |
} |
245 | 133 |
} |
246 | 134 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,142 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "libavcodec/fmtconvert.h" |
|
21 |
+ |
|
22 |
+#include "dsputil_altivec.h" |
|
23 |
+#include "util_altivec.h" |
|
24 |
+ |
|
25 |
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) |
|
26 |
+{ |
|
27 |
+ union { |
|
28 |
+ vector float v; |
|
29 |
+ float s[4]; |
|
30 |
+ } mul_u; |
|
31 |
+ int i; |
|
32 |
+ vector float src1, src2, dst1, dst2, mul_v, zero; |
|
33 |
+ |
|
34 |
+ zero = (vector float)vec_splat_u32(0); |
|
35 |
+ mul_u.s[0] = mul; |
|
36 |
+ mul_v = vec_splat(mul_u.v, 0); |
|
37 |
+ |
|
38 |
+ for(i=0; i<len; i+=8) { |
|
39 |
+ src1 = vec_ctf(vec_ld(0, src+i), 0); |
|
40 |
+ src2 = vec_ctf(vec_ld(16, src+i), 0); |
|
41 |
+ dst1 = vec_madd(src1, mul_v, zero); |
|
42 |
+ dst2 = vec_madd(src2, mul_v, zero); |
|
43 |
+ vec_st(dst1, 0, dst+i); |
|
44 |
+ vec_st(dst2, 16, dst+i); |
|
45 |
+ } |
|
46 |
+} |
|
47 |
+ |
|
48 |
+ |
|
49 |
+static vector signed short |
|
50 |
+float_to_int16_one_altivec(const float *src) |
|
51 |
+{ |
|
52 |
+ vector float s0 = vec_ld(0, src); |
|
53 |
+ vector float s1 = vec_ld(16, src); |
|
54 |
+ vector signed int t0 = vec_cts(s0, 0); |
|
55 |
+ vector signed int t1 = vec_cts(s1, 0); |
|
56 |
+ return vec_packs(t0,t1); |
|
57 |
+} |
|
58 |
+ |
|
59 |
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len) |
|
60 |
+{ |
|
61 |
+ int i; |
|
62 |
+ vector signed short d0, d1, d; |
|
63 |
+ vector unsigned char align; |
|
64 |
+ if(((long)dst)&15) //FIXME |
|
65 |
+ for(i=0; i<len-7; i+=8) { |
|
66 |
+ d0 = vec_ld(0, dst+i); |
|
67 |
+ d = float_to_int16_one_altivec(src+i); |
|
68 |
+ d1 = vec_ld(15, dst+i); |
|
69 |
+ d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); |
|
70 |
+ align = vec_lvsr(0, dst+i); |
|
71 |
+ d0 = vec_perm(d1, d, align); |
|
72 |
+ d1 = vec_perm(d, d1, align); |
|
73 |
+ vec_st(d0, 0, dst+i); |
|
74 |
+ vec_st(d1,15, dst+i); |
|
75 |
+ } |
|
76 |
+ else |
|
77 |
+ for(i=0; i<len-7; i+=8) { |
|
78 |
+ d = float_to_int16_one_altivec(src+i); |
|
79 |
+ vec_st(d, 0, dst+i); |
|
80 |
+ } |
|
81 |
+} |
|
82 |
+ |
|
83 |
+static void |
|
84 |
+float_to_int16_interleave_altivec(int16_t *dst, const float **src, |
|
85 |
+ long len, int channels) |
|
86 |
+{ |
|
87 |
+ int i; |
|
88 |
+ vector signed short d0, d1, d2, c0, c1, t0, t1; |
|
89 |
+ vector unsigned char align; |
|
90 |
+ if(channels == 1) |
|
91 |
+ float_to_int16_altivec(dst, src[0], len); |
|
92 |
+ else |
|
93 |
+ if (channels == 2) { |
|
94 |
+ if(((long)dst)&15) |
|
95 |
+ for(i=0; i<len-7; i+=8) { |
|
96 |
+ d0 = vec_ld(0, dst + i); |
|
97 |
+ t0 = float_to_int16_one_altivec(src[0] + i); |
|
98 |
+ d1 = vec_ld(31, dst + i); |
|
99 |
+ t1 = float_to_int16_one_altivec(src[1] + i); |
|
100 |
+ c0 = vec_mergeh(t0, t1); |
|
101 |
+ c1 = vec_mergel(t0, t1); |
|
102 |
+ d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); |
|
103 |
+ align = vec_lvsr(0, dst + i); |
|
104 |
+ d0 = vec_perm(d2, c0, align); |
|
105 |
+ d1 = vec_perm(c0, c1, align); |
|
106 |
+ vec_st(d0, 0, dst + i); |
|
107 |
+ d0 = vec_perm(c1, d2, align); |
|
108 |
+ vec_st(d1, 15, dst + i); |
|
109 |
+ vec_st(d0, 31, dst + i); |
|
110 |
+ dst+=8; |
|
111 |
+ } |
|
112 |
+ else |
|
113 |
+ for(i=0; i<len-7; i+=8) { |
|
114 |
+ t0 = float_to_int16_one_altivec(src[0] + i); |
|
115 |
+ t1 = float_to_int16_one_altivec(src[1] + i); |
|
116 |
+ d0 = vec_mergeh(t0, t1); |
|
117 |
+ d1 = vec_mergel(t0, t1); |
|
118 |
+ vec_st(d0, 0, dst + i); |
|
119 |
+ vec_st(d1, 16, dst + i); |
|
120 |
+ dst+=8; |
|
121 |
+ } |
|
122 |
+ } else { |
|
123 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; |
|
124 |
+ int c, j; |
|
125 |
+ for (c = 0; c < channels; c++) { |
|
126 |
+ float_to_int16_altivec(tmp, src[c], len); |
|
127 |
+ for (i = 0, j = c; i < len; i++, j+=channels) { |
|
128 |
+ dst[j] = tmp[i]; |
|
129 |
+ } |
|
130 |
+ } |
|
131 |
+ } |
|
132 |
+} |
|
133 |
+ |
|
134 |
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) |
|
135 |
+{ |
|
136 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; |
|
137 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
138 |
+ c->float_to_int16 = float_to_int16_altivec; |
|
139 |
+ c->float_to_int16_interleave = float_to_int16_interleave_altivec; |
|
140 |
+ } |
|
141 |
+} |
... | ... |
@@ -31,6 +31,7 @@ |
31 | 31 |
#include "get_bits.h" |
32 | 32 |
#include "dsputil.h" |
33 | 33 |
#include "fft.h" |
34 |
+#include "fmtconvert.h" |
|
34 | 35 |
|
35 | 36 |
#include "vorbis.h" |
36 | 37 |
#include "xiph.h" |
... | ... |
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s { |
127 | 127 |
AVCodecContext *avccontext; |
128 | 128 |
GetBitContext gb; |
129 | 129 |
DSPContext dsp; |
130 |
+ FmtConvertContext fmt_conv; |
|
130 | 131 |
|
131 | 132 |
FFTContext mdct[2]; |
132 | 133 |
uint_fast8_t first_frame; |
... | ... |
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext) |
961 | 961 |
|
962 | 962 |
vc->avccontext = avccontext; |
963 | 963 |
dsputil_init(&vc->dsp, avccontext); |
964 |
+ ff_fmt_convert_init(&vc->fmt_conv, avccontext); |
|
964 | 965 |
|
965 | 966 |
vc->scale_bias = 32768.0f; |
966 | 967 |
|
... | ... |
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext, |
1636 | 1636 |
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; |
1637 | 1637 |
} |
1638 | 1638 |
|
1639 |
- vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); |
|
1639 |
+ vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len, |
|
1640 |
+ vc->audio_channels); |
|
1640 | 1641 |
*data_size = len * 2 * vc->audio_channels; |
1641 | 1642 |
|
1642 | 1643 |
return buf_size ; |
... | ... |
@@ -26,6 +26,7 @@ |
26 | 26 |
#include "put_bits.h" |
27 | 27 |
#include "dsputil.h" |
28 | 28 |
#include "fft.h" |
29 |
+#include "fmtconvert.h" |
|
29 | 30 |
|
30 | 31 |
/* size of blocks */ |
31 | 32 |
#define BLOCK_MIN_BITS 7 |
... | ... |
@@ -134,6 +135,7 @@ typedef struct WMACodecContext { |
134 | 134 |
float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; |
135 | 135 |
float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; |
136 | 136 |
DSPContext dsp; |
137 |
+ FmtConvertContext fmt_conv; |
|
137 | 138 |
|
138 | 139 |
#ifdef TRACE |
139 | 140 |
int frame_count; |
... | ... |
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples) |
791 | 791 |
incr = s->nb_channels; |
792 | 792 |
for (ch = 0; ch < MAX_CHANNELS; ch++) |
793 | 793 |
output[ch] = s->frame_out[ch]; |
794 |
- s->dsp.float_to_int16_interleave(samples, output, n, incr); |
|
794 |
+ s->fmt_conv.float_to_int16_interleave(samples, output, n, incr); |
|
795 | 795 |
for (ch = 0; ch < incr; ch++) { |
796 | 796 |
/* prepare for next block */ |
797 | 797 |
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); |
... | ... |
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o |
39 | 39 |
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o |
40 | 40 |
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ |
41 | 41 |
x86/deinterlace.o \ |
42 |
+ x86/fmtconvert.o \ |
|
42 | 43 |
x86/h264_chromamc.o \ |
43 | 44 |
$(YASM-OBJS-yes) |
44 | 45 |
|
... | ... |
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o |
47 | 47 |
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ |
48 | 48 |
x86/dsputil_mmx.o \ |
49 | 49 |
x86/fdct_mmx.o \ |
50 |
+ x86/fmtconvert_mmx.o \ |
|
50 | 51 |
x86/idct_mmx_xvid.o \ |
51 | 52 |
x86/idct_sse2_xvid.o \ |
52 | 53 |
x86/motion_est_mmx.o \ |
... | ... |
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s |
2349 | 2349 |
} |
2350 | 2350 |
#endif /* HAVE_6REGS */ |
2351 | 2351 |
|
2352 |
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) |
|
2353 |
-{ |
|
2354 |
- x86_reg i = -4*len; |
|
2355 |
- __asm__ volatile( |
|
2356 |
- "movss %3, %%xmm4 \n" |
|
2357 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
2358 |
- "1: \n" |
|
2359 |
- "cvtpi2ps (%2,%0), %%xmm0 \n" |
|
2360 |
- "cvtpi2ps 8(%2,%0), %%xmm1 \n" |
|
2361 |
- "cvtpi2ps 16(%2,%0), %%xmm2 \n" |
|
2362 |
- "cvtpi2ps 24(%2,%0), %%xmm3 \n" |
|
2363 |
- "movlhps %%xmm1, %%xmm0 \n" |
|
2364 |
- "movlhps %%xmm3, %%xmm2 \n" |
|
2365 |
- "mulps %%xmm4, %%xmm0 \n" |
|
2366 |
- "mulps %%xmm4, %%xmm2 \n" |
|
2367 |
- "movaps %%xmm0, (%1,%0) \n" |
|
2368 |
- "movaps %%xmm2, 16(%1,%0) \n" |
|
2369 |
- "add $32, %0 \n" |
|
2370 |
- "jl 1b \n" |
|
2371 |
- :"+r"(i) |
|
2372 |
- :"r"(dst+len), "r"(src+len), "m"(mul) |
|
2373 |
- ); |
|
2374 |
-} |
|
2375 |
- |
|
2376 |
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) |
|
2377 |
-{ |
|
2378 |
- x86_reg i = -4*len; |
|
2379 |
- __asm__ volatile( |
|
2380 |
- "movss %3, %%xmm4 \n" |
|
2381 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
2382 |
- "1: \n" |
|
2383 |
- "cvtdq2ps (%2,%0), %%xmm0 \n" |
|
2384 |
- "cvtdq2ps 16(%2,%0), %%xmm1 \n" |
|
2385 |
- "mulps %%xmm4, %%xmm0 \n" |
|
2386 |
- "mulps %%xmm4, %%xmm1 \n" |
|
2387 |
- "movaps %%xmm0, (%1,%0) \n" |
|
2388 |
- "movaps %%xmm1, 16(%1,%0) \n" |
|
2389 |
- "add $32, %0 \n" |
|
2390 |
- "jl 1b \n" |
|
2391 |
- :"+r"(i) |
|
2392 |
- :"r"(dst+len), "r"(src+len), "m"(mul) |
|
2393 |
- ); |
|
2394 |
-} |
|
2395 |
- |
|
2396 | 2352 |
static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
2397 | 2353 |
int len) |
2398 | 2354 |
{ |
... | ... |
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
2427 | 2427 |
); |
2428 | 2428 |
} |
2429 | 2429 |
|
2430 |
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
|
2431 |
- x86_reg reglen = len; |
|
2432 |
- // not bit-exact: pf2id uses different rounding than C and SSE |
|
2433 |
- __asm__ volatile( |
|
2434 |
- "add %0 , %0 \n\t" |
|
2435 |
- "lea (%2,%0,2) , %2 \n\t" |
|
2436 |
- "add %0 , %1 \n\t" |
|
2437 |
- "neg %0 \n\t" |
|
2438 |
- "1: \n\t" |
|
2439 |
- "pf2id (%2,%0,2) , %%mm0 \n\t" |
|
2440 |
- "pf2id 8(%2,%0,2) , %%mm1 \n\t" |
|
2441 |
- "pf2id 16(%2,%0,2) , %%mm2 \n\t" |
|
2442 |
- "pf2id 24(%2,%0,2) , %%mm3 \n\t" |
|
2443 |
- "packssdw %%mm1 , %%mm0 \n\t" |
|
2444 |
- "packssdw %%mm3 , %%mm2 \n\t" |
|
2445 |
- "movq %%mm0 , (%1,%0) \n\t" |
|
2446 |
- "movq %%mm2 , 8(%1,%0) \n\t" |
|
2447 |
- "add $16 , %0 \n\t" |
|
2448 |
- " js 1b \n\t" |
|
2449 |
- "femms \n\t" |
|
2450 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
2451 |
- ); |
|
2452 |
-} |
|
2453 |
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){ |
|
2454 |
- x86_reg reglen = len; |
|
2455 |
- __asm__ volatile( |
|
2456 |
- "add %0 , %0 \n\t" |
|
2457 |
- "lea (%2,%0,2) , %2 \n\t" |
|
2458 |
- "add %0 , %1 \n\t" |
|
2459 |
- "neg %0 \n\t" |
|
2460 |
- "1: \n\t" |
|
2461 |
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t" |
|
2462 |
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" |
|
2463 |
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" |
|
2464 |
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" |
|
2465 |
- "packssdw %%mm1 , %%mm0 \n\t" |
|
2466 |
- "packssdw %%mm3 , %%mm2 \n\t" |
|
2467 |
- "movq %%mm0 , (%1,%0) \n\t" |
|
2468 |
- "movq %%mm2 , 8(%1,%0) \n\t" |
|
2469 |
- "add $16 , %0 \n\t" |
|
2470 |
- " js 1b \n\t" |
|
2471 |
- "emms \n\t" |
|
2472 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
2473 |
- ); |
|
2474 |
-} |
|
2475 |
- |
|
2476 |
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ |
|
2477 |
- x86_reg reglen = len; |
|
2478 |
- __asm__ volatile( |
|
2479 |
- "add %0 , %0 \n\t" |
|
2480 |
- "lea (%2,%0,2) , %2 \n\t" |
|
2481 |
- "add %0 , %1 \n\t" |
|
2482 |
- "neg %0 \n\t" |
|
2483 |
- "1: \n\t" |
|
2484 |
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" |
|
2485 |
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" |
|
2486 |
- "packssdw %%xmm1 , %%xmm0 \n\t" |
|
2487 |
- "movdqa %%xmm0 , (%1,%0) \n\t" |
|
2488 |
- "add $16 , %0 \n\t" |
|
2489 |
- " js 1b \n\t" |
|
2490 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
2491 |
- ); |
|
2492 |
-} |
|
2493 |
- |
|
2494 | 2430 |
void ff_vp3_idct_mmx(int16_t *input_data); |
2495 | 2431 |
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
2496 | 2432 |
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
... | ... |
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data); |
2504 | 2504 |
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
2505 | 2505 |
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
2506 | 2506 |
|
2507 |
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); |
|
2508 |
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); |
|
2509 |
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); |
|
2510 | 2507 |
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); |
2511 | 2508 |
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); |
2512 | 2509 |
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); |
... | ... |
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const |
2516 | 2516 |
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); |
2517 | 2517 |
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); |
2518 | 2518 |
|
2519 |
-#if !HAVE_YASM |
|
2520 |
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) |
|
2521 |
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
2522 |
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
2523 |
-#endif |
|
2524 |
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse |
|
2525 |
- |
|
2526 |
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ |
|
2527 |
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ |
|
2528 |
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ |
|
2529 |
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\ |
|
2530 |
- int i,j,c;\ |
|
2531 |
- for(c=0; c<channels; c++){\ |
|
2532 |
- float_to_int16_##cpu(tmp, src[c], len);\ |
|
2533 |
- for(i=0, j=c; i<len; i++, j+=channels)\ |
|
2534 |
- dst[j] = tmp[i];\ |
|
2535 |
- }\ |
|
2536 |
-}\ |
|
2537 |
-\ |
|
2538 |
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ |
|
2539 |
- if(channels==1)\ |
|
2540 |
- float_to_int16_##cpu(dst, src[0], len);\ |
|
2541 |
- else if(channels==2){\ |
|
2542 |
- x86_reg reglen = len; \ |
|
2543 |
- const float *src0 = src[0];\ |
|
2544 |
- const float *src1 = src[1];\ |
|
2545 |
- __asm__ volatile(\ |
|
2546 |
- "shl $2, %0 \n"\ |
|
2547 |
- "add %0, %1 \n"\ |
|
2548 |
- "add %0, %2 \n"\ |
|
2549 |
- "add %0, %3 \n"\ |
|
2550 |
- "neg %0 \n"\ |
|
2551 |
- body\ |
|
2552 |
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ |
|
2553 |
- );\ |
|
2554 |
- }else if(channels==6){\ |
|
2555 |
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\ |
|
2556 |
- }else\ |
|
2557 |
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ |
|
2558 |
-} |
|
2559 |
- |
|
2560 |
-FLOAT_TO_INT16_INTERLEAVE(3dnow, |
|
2561 |
- "1: \n" |
|
2562 |
- "pf2id (%2,%0), %%mm0 \n" |
|
2563 |
- "pf2id 8(%2,%0), %%mm1 \n" |
|
2564 |
- "pf2id (%3,%0), %%mm2 \n" |
|
2565 |
- "pf2id 8(%3,%0), %%mm3 \n" |
|
2566 |
- "packssdw %%mm1, %%mm0 \n" |
|
2567 |
- "packssdw %%mm3, %%mm2 \n" |
|
2568 |
- "movq %%mm0, %%mm1 \n" |
|
2569 |
- "punpcklwd %%mm2, %%mm0 \n" |
|
2570 |
- "punpckhwd %%mm2, %%mm1 \n" |
|
2571 |
- "movq %%mm0, (%1,%0)\n" |
|
2572 |
- "movq %%mm1, 8(%1,%0)\n" |
|
2573 |
- "add $16, %0 \n" |
|
2574 |
- "js 1b \n" |
|
2575 |
- "femms \n" |
|
2576 |
-) |
|
2577 |
- |
|
2578 |
-FLOAT_TO_INT16_INTERLEAVE(sse, |
|
2579 |
- "1: \n" |
|
2580 |
- "cvtps2pi (%2,%0), %%mm0 \n" |
|
2581 |
- "cvtps2pi 8(%2,%0), %%mm1 \n" |
|
2582 |
- "cvtps2pi (%3,%0), %%mm2 \n" |
|
2583 |
- "cvtps2pi 8(%3,%0), %%mm3 \n" |
|
2584 |
- "packssdw %%mm1, %%mm0 \n" |
|
2585 |
- "packssdw %%mm3, %%mm2 \n" |
|
2586 |
- "movq %%mm0, %%mm1 \n" |
|
2587 |
- "punpcklwd %%mm2, %%mm0 \n" |
|
2588 |
- "punpckhwd %%mm2, %%mm1 \n" |
|
2589 |
- "movq %%mm0, (%1,%0)\n" |
|
2590 |
- "movq %%mm1, 8(%1,%0)\n" |
|
2591 |
- "add $16, %0 \n" |
|
2592 |
- "js 1b \n" |
|
2593 |
- "emms \n" |
|
2594 |
-) |
|
2595 |
- |
|
2596 |
-FLOAT_TO_INT16_INTERLEAVE(sse2, |
|
2597 |
- "1: \n" |
|
2598 |
- "cvtps2dq (%2,%0), %%xmm0 \n" |
|
2599 |
- "cvtps2dq (%3,%0), %%xmm1 \n" |
|
2600 |
- "packssdw %%xmm1, %%xmm0 \n" |
|
2601 |
- "movhlps %%xmm0, %%xmm1 \n" |
|
2602 |
- "punpcklwd %%xmm1, %%xmm0 \n" |
|
2603 |
- "movdqa %%xmm0, (%1,%0) \n" |
|
2604 |
- "add $16, %0 \n" |
|
2605 |
- "js 1b \n" |
|
2606 |
-) |
|
2607 |
- |
|
2608 |
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ |
|
2609 |
- if(channels==6) |
|
2610 |
- ff_float_to_int16_interleave6_3dn2(dst, src, len); |
|
2611 |
- else |
|
2612 |
- float_to_int16_interleave_3dnow(dst, src, len, channels); |
|
2613 |
-} |
|
2614 |
- |
|
2615 | 2519 |
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); |
2616 | 2520 |
|
2617 | 2521 |
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
... | ... |
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
2968 | 2968 |
if(mm_flags & AV_CPU_FLAG_3DNOW){ |
2969 | 2969 |
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
2970 | 2970 |
c->vector_fmul = vector_fmul_3dnow; |
2971 |
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
2972 |
- c->float_to_int16 = float_to_int16_3dnow; |
|
2973 |
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
|
2974 |
- } |
|
2975 | 2971 |
} |
2976 | 2972 |
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ |
2977 | 2973 |
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
2978 | 2974 |
#if HAVE_6REGS |
2979 | 2975 |
c->vector_fmul_window = vector_fmul_window_3dnow2; |
2980 | 2976 |
#endif |
2981 |
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
2982 |
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2; |
|
2983 |
- } |
|
2984 | 2977 |
} |
2985 | 2978 |
if(mm_flags & AV_CPU_FLAG_MMX2){ |
2986 | 2979 |
#if HAVE_YASM |
... | ... |
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
2997 | 2997 |
#if HAVE_6REGS |
2998 | 2998 |
c->vector_fmul_window = vector_fmul_window_sse; |
2999 | 2999 |
#endif |
3000 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
|
3001 | 3000 |
c->vector_clipf = vector_clipf_sse; |
3002 |
- c->float_to_int16 = float_to_int16_sse; |
|
3003 |
- c->float_to_int16_interleave = float_to_int16_interleave_sse; |
|
3004 | 3001 |
#if HAVE_YASM |
3005 | 3002 |
c->scalarproduct_float = ff_scalarproduct_float_sse; |
3006 | 3003 |
#endif |
... | ... |
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
3008 | 3008 |
if(mm_flags & AV_CPU_FLAG_3DNOW) |
3009 | 3009 |
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse |
3010 | 3010 |
if(mm_flags & AV_CPU_FLAG_SSE2){ |
3011 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
|
3012 |
- c->float_to_int16 = float_to_int16_sse2; |
|
3013 |
- c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
|
3014 | 3011 |
#if HAVE_YASM |
3015 | 3012 |
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; |
3016 | 3013 |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
... | ... |
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
30 | 30 |
|
31 | 31 |
section .text align=16 |
32 | 32 |
|
33 |
-%macro PSWAPD_SSE 2 |
|
34 |
- pshufw %1, %2, 0x4e |
|
35 |
-%endmacro |
|
36 |
-%macro PSWAPD_3DN1 2 |
|
37 |
- movq %1, %2 |
|
38 |
- psrlq %1, 32 |
|
39 |
- punpckldq %1, %2 |
|
40 |
-%endmacro |
|
41 |
- |
|
42 |
-%macro FLOAT_TO_INT16_INTERLEAVE6 1 |
|
43 |
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
|
44 |
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |
|
45 |
-%ifdef ARCH_X86_64 |
|
46 |
- %define lend r10d |
|
47 |
- mov lend, r2d |
|
48 |
-%else |
|
49 |
- %define lend dword r2m |
|
50 |
-%endif |
|
51 |
- mov src1q, [srcq+1*gprsize] |
|
52 |
- mov src2q, [srcq+2*gprsize] |
|
53 |
- mov src3q, [srcq+3*gprsize] |
|
54 |
- mov src4q, [srcq+4*gprsize] |
|
55 |
- mov src5q, [srcq+5*gprsize] |
|
56 |
- mov srcq, [srcq] |
|
57 |
- sub src1q, srcq |
|
58 |
- sub src2q, srcq |
|
59 |
- sub src3q, srcq |
|
60 |
- sub src4q, srcq |
|
61 |
- sub src5q, srcq |
|
62 |
-.loop: |
|
63 |
- cvtps2pi mm0, [srcq] |
|
64 |
- cvtps2pi mm1, [srcq+src1q] |
|
65 |
- cvtps2pi mm2, [srcq+src2q] |
|
66 |
- cvtps2pi mm3, [srcq+src3q] |
|
67 |
- cvtps2pi mm4, [srcq+src4q] |
|
68 |
- cvtps2pi mm5, [srcq+src5q] |
|
69 |
- packssdw mm0, mm3 |
|
70 |
- packssdw mm1, mm4 |
|
71 |
- packssdw mm2, mm5 |
|
72 |
- pswapd mm3, mm0 |
|
73 |
- punpcklwd mm0, mm1 |
|
74 |
- punpckhwd mm1, mm2 |
|
75 |
- punpcklwd mm2, mm3 |
|
76 |
- pswapd mm3, mm0 |
|
77 |
- punpckldq mm0, mm2 |
|
78 |
- punpckhdq mm2, mm1 |
|
79 |
- punpckldq mm1, mm3 |
|
80 |
- movq [dstq ], mm0 |
|
81 |
- movq [dstq+16], mm2 |
|
82 |
- movq [dstq+ 8], mm1 |
|
83 |
- add srcq, 8 |
|
84 |
- add dstq, 24 |
|
85 |
- sub lend, 2 |
|
86 |
- jg .loop |
|
87 |
- emms |
|
88 |
- RET |
|
89 |
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
|
90 |
- |
|
91 |
-%define pswapd PSWAPD_SSE |
|
92 |
-FLOAT_TO_INT16_INTERLEAVE6 sse |
|
93 |
-%define cvtps2pi pf2id |
|
94 |
-%define pswapd PSWAPD_3DN1 |
|
95 |
-FLOAT_TO_INT16_INTERLEAVE6 3dnow |
|
96 |
-%undef pswapd |
|
97 |
-FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
|
98 |
-%undef cvtps2pi |
|
99 |
- |
|
100 |
- |
|
101 |
- |
|
102 | 33 |
%macro SCALARPRODUCT 1 |
103 | 34 |
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) |
104 | 35 |
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift |
105 | 36 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,91 @@ |
0 |
+;****************************************************************************** |
|
1 |
+;* x86 optimized Format Conversion Utils |
|
2 |
+;* Copyright (c) 2008 Loren Merritt |
|
3 |
+;* |
|
4 |
+;* This file is part of FFmpeg. |
|
5 |
+;* |
|
6 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
7 |
+;* modify it under the terms of the GNU Lesser General Public |
|
8 |
+;* License as published by the Free Software Foundation; either |
|
9 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
10 |
+;* |
|
11 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
12 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
+;* Lesser General Public License for more details. |
|
15 |
+;* |
|
16 |
+;* You should have received a copy of the GNU Lesser General Public |
|
17 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
18 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
+;****************************************************************************** |
|
20 |
+ |
|
21 |
+%include "x86inc.asm" |
|
22 |
+ |
|
23 |
+section .text align=16 |
|
24 |
+ |
|
25 |
+%macro PSWAPD_SSE 2 |
|
26 |
+ pshufw %1, %2, 0x4e |
|
27 |
+%endmacro |
|
28 |
+%macro PSWAPD_3DN1 2 |
|
29 |
+ movq %1, %2 |
|
30 |
+ psrlq %1, 32 |
|
31 |
+ punpckldq %1, %2 |
|
32 |
+%endmacro |
|
33 |
+ |
|
34 |
+%macro FLOAT_TO_INT16_INTERLEAVE6 1 |
|
35 |
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
|
36 |
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |
|
37 |
+%ifdef ARCH_X86_64 |
|
38 |
+ %define lend r10d |
|
39 |
+ mov lend, r2d |
|
40 |
+%else |
|
41 |
+ %define lend dword r2m |
|
42 |
+%endif |
|
43 |
+ mov src1q, [srcq+1*gprsize] |
|
44 |
+ mov src2q, [srcq+2*gprsize] |
|
45 |
+ mov src3q, [srcq+3*gprsize] |
|
46 |
+ mov src4q, [srcq+4*gprsize] |
|
47 |
+ mov src5q, [srcq+5*gprsize] |
|
48 |
+ mov srcq, [srcq] |
|
49 |
+ sub src1q, srcq |
|
50 |
+ sub src2q, srcq |
|
51 |
+ sub src3q, srcq |
|
52 |
+ sub src4q, srcq |
|
53 |
+ sub src5q, srcq |
|
54 |
+.loop: |
|
55 |
+ cvtps2pi mm0, [srcq] |
|
56 |
+ cvtps2pi mm1, [srcq+src1q] |
|
57 |
+ cvtps2pi mm2, [srcq+src2q] |
|
58 |
+ cvtps2pi mm3, [srcq+src3q] |
|
59 |
+ cvtps2pi mm4, [srcq+src4q] |
|
60 |
+ cvtps2pi mm5, [srcq+src5q] |
|
61 |
+ packssdw mm0, mm3 |
|
62 |
+ packssdw mm1, mm4 |
|
63 |
+ packssdw mm2, mm5 |
|
64 |
+ pswapd mm3, mm0 |
|
65 |
+ punpcklwd mm0, mm1 |
|
66 |
+ punpckhwd mm1, mm2 |
|
67 |
+ punpcklwd mm2, mm3 |
|
68 |
+ pswapd mm3, mm0 |
|
69 |
+ punpckldq mm0, mm2 |
|
70 |
+ punpckhdq mm2, mm1 |
|
71 |
+ punpckldq mm1, mm3 |
|
72 |
+ movq [dstq ], mm0 |
|
73 |
+ movq [dstq+16], mm2 |
|
74 |
+ movq [dstq+ 8], mm1 |
|
75 |
+ add srcq, 8 |
|
76 |
+ add dstq, 24 |
|
77 |
+ sub lend, 2 |
|
78 |
+ jg .loop |
|
79 |
+ emms |
|
80 |
+ RET |
|
81 |
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
|
82 |
+ |
|
83 |
+%define pswapd PSWAPD_SSE |
|
84 |
+FLOAT_TO_INT16_INTERLEAVE6 sse |
|
85 |
+%define cvtps2pi pf2id |
|
86 |
+%define pswapd PSWAPD_3DN1 |
|
87 |
+FLOAT_TO_INT16_INTERLEAVE6 3dnow |
|
88 |
+%undef pswapd |
|
89 |
+FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
|
90 |
+%undef cvtps2pi |
0 | 91 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,266 @@ |
0 |
+/* |
|
1 |
+ * Format Conversion Utils |
|
2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
4 |
+ * |
|
5 |
+ * This file is part of FFmpeg. |
|
6 |
+ * |
|
7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
9 |
+ * License as published by the Free Software Foundation; either |
|
10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
11 |
+ * |
|
12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+ * Lesser General Public License for more details. |
|
16 |
+ * |
|
17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+ * |
|
21 |
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
22 |
+ */ |
|
23 |
+ |
|
24 |
+#include "libavutil/cpu.h" |
|
25 |
+#include "libavutil/x86_cpu.h" |
|
26 |
+#include "libavcodec/fmtconvert.h" |
|
27 |
+ |
|
28 |
/**
 * Convert int32 samples to float and multiply each by a scalar (SSE1,
 * using the MMX-register cvtpi2ps form).
 * NOTE(review): the movaps stores and 32-byte loop step imply dst is
 * 16-byte aligned and len is a multiple of 8 -- confirm against callers.
 */
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;  /* negative byte offset; loop counts up to 0 */
    __asm__ volatile(
        "movss %3, %%xmm4 \n"           /* broadcast mul into all 4 lanes */
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtpi2ps (%2,%0), %%xmm0 \n"   /* two int32 -> two floats (low half) */
        "cvtpi2ps 8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        "movlhps %%xmm1, %%xmm0 \n"     /* merge low halves into full vectors */
        "movlhps %%xmm3, %%xmm2 \n"
        "mulps %%xmm4, %%xmm0 \n"
        "mulps %%xmm4, %%xmm2 \n"
        "movaps %%xmm0, (%1,%0) \n"
        "movaps %%xmm2, 16(%1,%0) \n"
        "add $32, %0 \n"                /* 8 samples per iteration */
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)  /* index from the array ends */
    );
}
|
51 |
+ |
|
52 |
/**
 * Convert int32 samples to float and multiply each by a scalar (SSE2,
 * full-width cvtdq2ps -- no MMX state touched).
 * NOTE(review): movaps and the 32-byte loop step imply 16-byte-aligned
 * dst/src and len a multiple of 8 -- confirm against callers.
 */
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;  /* negative byte offset; loop counts up to 0 */
    __asm__ volatile(
        "movss %3, %%xmm4 \n"          /* broadcast mul into all 4 lanes */
        "shufps $0, %%xmm4, %%xmm4 \n"
        "1: \n"
        "cvtdq2ps (%2,%0), %%xmm0 \n"  /* four int32 -> four floats */
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps %%xmm4, %%xmm0 \n"
        "mulps %%xmm4, %%xmm1 \n"
        "movaps %%xmm0, (%1,%0) \n"
        "movaps %%xmm1, 16(%1,%0) \n"
        "add $32, %0 \n"               /* 8 samples per iteration */
        "jl 1b \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)  /* index from the array ends */
    );
}
|
71 |
+ |
|
72 |
/**
 * Convert floats to int16 with saturation (3DNow! version).
 * NOTE(review): movq stores and the 16-byte loop step imply dst is
 * 8-byte aligned and len a multiple of 8 -- confirm against callers.
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add %0 , %0 \n\t"         /* reglen = output size in bytes (2*len) */
        "lea (%2,%0,2) , %2 \n\t"  /* point src and dst past their ends... */
        "add %0 , %1 \n\t"
        "neg %0 \n\t"              /* ...then index from -size up to 0 */
        "1: \n\t"
        "pf2id (%2,%0,2) , %%mm0 \n\t"  /* two floats -> two int32 */
        "pf2id 8(%2,%0,2) , %%mm1 \n\t"
        "pf2id 16(%2,%0,2) , %%mm2 \n\t"
        "pf2id 24(%2,%0,2) , %%mm3 \n\t"
        "packssdw %%mm1 , %%mm0 \n\t"   /* saturate dwords to int16 */
        "packssdw %%mm3 , %%mm2 \n\t"
        "movq %%mm0 , (%1,%0) \n\t"
        "movq %%mm2 , 8(%1,%0) \n\t"
        "add $16 , %0 \n\t"             /* 8 samples per iteration */
        " js 1b \n\t"
        "femms \n\t"                    /* 3DNow! fast exit from MMX state */
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}
|
95 |
+ |
|
96 |
/**
 * Convert floats to int16 with saturation (SSE1 version, MMX-register
 * cvtps2pi -- rounds with the current MXCSR rounding mode).
 * NOTE(review): movq stores and the 16-byte loop step imply dst is
 * 8-byte aligned and len a multiple of 8 -- confirm against callers.
 */
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add %0 , %0 \n\t"         /* reglen = output size in bytes (2*len) */
        "lea (%2,%0,2) , %2 \n\t"  /* point src and dst past their ends... */
        "add %0 , %1 \n\t"
        "neg %0 \n\t"              /* ...then index from -size up to 0 */
        "1: \n\t"
        "cvtps2pi (%2,%0,2) , %%mm0 \n\t"  /* two floats -> two int32 */
        "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
        "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
        "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
        "packssdw %%mm1 , %%mm0 \n\t"      /* saturate dwords to int16 */
        "packssdw %%mm3 , %%mm2 \n\t"
        "movq %%mm0 , (%1,%0) \n\t"
        "movq %%mm2 , 8(%1,%0) \n\t"
        "add $16 , %0 \n\t"                /* 8 samples per iteration */
        " js 1b \n\t"
        "emms \n\t"                        /* clear MMX state for FPU code */
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}
|
118 |
+ |
|
119 |
/**
 * Convert floats to int16 with saturation (SSE2 version, pure XMM --
 * no MMX state, hence no emms needed).
 * NOTE(review): the movdqa store and 16-byte loop step imply dst is
 * 16-byte aligned and len a multiple of 8 -- confirm against callers.
 */
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add %0 , %0 \n\t"         /* reglen = output size in bytes (2*len) */
        "lea (%2,%0,2) , %2 \n\t"  /* point src and dst past their ends... */
        "add %0 , %1 \n\t"
        "neg %0 \n\t"              /* ...then index from -size up to 0 */
        "1: \n\t"
        "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"  /* four floats -> four int32 */
        "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
        "packssdw %%xmm1 , %%xmm0 \n\t"     /* saturate dwords to int16 */
        "movdqa %%xmm0 , (%1,%0) \n\t"
        "add $16 , %0 \n\t"                 /* 8 samples per iteration */
        " js 1b \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}
|
136 |
+ |
|
137 |
/* 6-channel interleave routines implemented in yasm (see the .asm file). */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

#if !HAVE_YASM
/* Without yasm, route the 6-channel case to the generic per-channel C
 * fallback generated by FLOAT_TO_INT16_INTERLEAVE below. */
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* No dedicated SSE2 6-channel routine exists; reuse the SSE one. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
147 |
+ |
|
148 |
/**
 * Expands to two functions per CPU flavour:
 *  - float_to_int16_interleave_misc_##cpu: generic fallback that converts
 *    each channel into a stack temp buffer, then scatters it into the
 *    interleaved output (note: VLA sized by len on the stack);
 *  - float_to_int16_interleave_##cpu: dispatcher with fast paths for
 *    mono (plain convert), stereo (the inline-asm `body` argument) and
 *    6 channels (yasm routine or its !HAVE_YASM fallback above).
 * The stereo prologue converts len to a byte count, advances dst/src0/src1
 * past their ends and negates the counter so `body` indexes from -size to 0.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
|
181 |
+ |
|
182 |
/* Stereo body, 3DNow!: pf2id (not bit-exact, see note on float_to_int16_3dnow)
 * converts 2 samples per channel, packssdw saturates to int16 and the
 * punpck pair interleaves L/R words; femms clears MMX state on exit. */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1: \n"
    "pf2id (%2,%0), %%mm0 \n"
    "pf2id 8(%2,%0), %%mm1 \n"
    "pf2id (%3,%0), %%mm2 \n"
    "pf2id 8(%3,%0), %%mm3 \n"
    "packssdw %%mm1, %%mm0 \n"
    "packssdw %%mm3, %%mm2 \n"
    "movq %%mm0, %%mm1 \n"
    "punpcklwd %%mm2, %%mm0 \n"
    "punpckhwd %%mm2, %%mm1 \n"
    "movq %%mm0, (%1,%0)\n"
    "movq %%mm1, 8(%1,%0)\n"
    "add $16, %0 \n"
    "js 1b \n"
    "femms \n"
)

/* Stereo body, SSE1: same structure as the 3DNow! body but converts with
 * cvtps2pi (current rounding mode) and exits MMX state via emms. */
FLOAT_TO_INT16_INTERLEAVE(sse,
    "1: \n"
    "cvtps2pi (%2,%0), %%mm0 \n"
    "cvtps2pi 8(%2,%0), %%mm1 \n"
    "cvtps2pi (%3,%0), %%mm2 \n"
    "cvtps2pi 8(%3,%0), %%mm3 \n"
    "packssdw %%mm1, %%mm0 \n"
    "packssdw %%mm3, %%mm2 \n"
    "movq %%mm0, %%mm1 \n"
    "punpcklwd %%mm2, %%mm0 \n"
    "punpckhwd %%mm2, %%mm1 \n"
    "movq %%mm0, (%1,%0)\n"
    "movq %%mm1, 8(%1,%0)\n"
    "add $16, %0 \n"
    "js 1b \n"
    "emms \n"
)

/* Stereo body, SSE2: full-width XMM -- 4 samples per channel per
 * iteration, movhlps+punpcklwd interleaves L/R words; no MMX state. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1: \n"
    "cvtps2dq (%2,%0), %%xmm0 \n"
    "cvtps2dq (%3,%0), %%xmm1 \n"
    "packssdw %%xmm1, %%xmm0 \n"
    "movhlps %%xmm0, %%xmm1 \n"
    "punpcklwd %%xmm1, %%xmm0 \n"
    "movdqa %%xmm0, (%1,%0) \n"
    "add $16, %0 \n"
    "js 1b \n"
)
|
229 |
+ |
|
230 |
/**
 * float_to_int16_interleave, 3DNow!Ext flavour.
 * The extension only provides a dedicated 6-channel yasm routine (built
 * with the native pswapd); every other channel count is delegated to the
 * plain 3DNow! dispatcher.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
|
236 |
+ |
|
237 |
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) |
|
238 |
+{ |
|
239 |
+ int mm_flags = av_get_cpu_flags(); |
|
240 |
+ |
|
241 |
+ if (mm_flags & AV_CPU_FLAG_MMX) { |
|
242 |
+ |
|
243 |
+ if(mm_flags & AV_CPU_FLAG_3DNOW){ |
|
244 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
245 |
+ c->float_to_int16 = float_to_int16_3dnow; |
|
246 |
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
|
247 |
+ } |
|
248 |
+ } |
|
249 |
+ if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ |
|
250 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
251 |
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2; |
|
252 |
+ } |
|
253 |
+ } |
|
254 |
+ if(mm_flags & AV_CPU_FLAG_SSE){ |
|
255 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
|
256 |
+ c->float_to_int16 = float_to_int16_sse; |
|
257 |
+ c->float_to_int16_interleave = float_to_int16_interleave_sse; |
|
258 |
+ } |
|
259 |
+ if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
260 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
|
261 |
+ c->float_to_int16 = float_to_int16_sse2; |
|
262 |
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
|
263 |
+ } |
|
264 |
+ } |
|
265 |
+} |