
Separate format conversion DSP functions from DSPContext.

This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.

Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)

Justin Ruggles authored on 2011/01/31 00:06:46
Showing 32 changed files
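For context, a minimal sketch of how a decoder picks up the new API after this change. The context struct and function names below are hypothetical, for illustration only; ff_fmt_convert_init() and the FmtConvertContext function pointers are the ones introduced in the diff that follows.

#include "avcodec.h"
#include "fmtconvert.h"

/* Hypothetical decoder context, for illustration only. */
typedef struct {
    FmtConvertContext fmt_conv;  /* replaces the float/int16 conversion members of DSPContext */
} ExampleDecoderContext;

static av_cold int example_decode_init(AVCodecContext *avctx, ExampleDecoderContext *s)
{
    /* These function pointers used to be filled in by dsputil_init();
     * they now come from the dedicated format-conversion context. */
    ff_fmt_convert_init(&s->fmt_conv, avctx);
    return 0;
}

static void example_output_samples(ExampleDecoderContext *s, int16_t *out,
                                   const float **planar, int nb_samples, int channels)
{
    /* Interleave and convert planar float samples to int16_t,
     * as the aac, ac3, dca and bink decoders now do. */
    s->fmt_conv.float_to_int16_interleave(out, planar, nb_samples, channels);
}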
... ...
@@ -12,6 +12,7 @@ OBJS = allcodecs.o                                                      \
        bitstream_filter.o                                               \
        dsputil.o                                                        \
        faanidct.o                                                       \
+       fmtconvert.o                                                     \
        imgconvert.o                                                     \
        jrevdct.o                                                        \
        opt.o                                                            \
... ...
@@ -35,6 +35,7 @@
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
+#include "fmtconvert.h"
 
 #include <stdint.h>
 
... ...
@@ -268,6 +269,7 @@ typedef struct {
     FFTContext mdct;
     FFTContext mdct_small;
     DSPContext dsp;
+    FmtConvertContext fmt_conv;
     int random_state;
     /** @} */
 
... ...
@@ -85,6 +85,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 #include "lpc.h"
 
 #include "aac.h"
... ...
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
     ff_aac_sbr_init();
 
     dsputil_init(&ac->dsp, avctx);
+    ff_fmt_convert_init(&ac->fmt_conv, avctx);
 
     ac->random_state = 0x1f2e3d4c;
 
... ...
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
     *data_size = data_size_tmp;
 
     if (samples)
-        ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
+        ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
 
     if (ac->output_configured)
         ac->output_configured = OC_LOCKED;
... ...
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
     ff_kbd_window_init(s->window, 5.0, 256);
     dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
     av_lfg_init(&s->dith_state, 0);
 
     /* set scale value for float to int16 conversion */
... ...
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         } else {
             gain *= s->dynamic_range[0];
         }
-        s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+        s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
     }
 
     /* apply spectral extension to high frequency bins */
... ...
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
             av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
             err = 1;
         }
-        s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+        s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
         out_samples += 256 * s->out_channels;
     }
     *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
... ...
@@ -55,6 +55,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 
 /* override ac3.h to include coupling channel */
 #undef AC3_MAX_CHANNELS
... ...
@@ -190,6 +191,7 @@ typedef struct {
 
 ///@defgroup opt optimization
     DSPContext dsp;                         ///< for optimization
+    FmtConvertContext fmt_conv;             ///< optimized conversion functions
     float mul_bias;                         ///< scaling for float_to_int16 conversion
 ///@}
 
... ...
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS                                   += arm/dsputil_init_arm.o        \
                                           arm/dsputil_arm.o             \
                                           arm/fft_init_arm.o            \
+                                          arm/fmtconvert_init_arm.o     \
                                           arm/jrevdct_arm.o             \
                                           arm/mpegvideo_arm.o           \
                                           arm/simple_idct_arm.o         \
... ...
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                           arm/dsputil_armv6.o           \
                                           arm/simple_idct_armv6.o       \
 
+VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
+
 OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
                                           arm/dsputil_init_vfp.o        \
+                                          $(VFP-OBJS-yes)
 
 OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
                                           arm/mpegvideo_iwmmxt.o        \
... ...
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o            \
 
 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                           arm/dsputil_neon.o            \
+                                          arm/fmtconvert_neon.o         \
                                           arm/int_neon.o                \
                                           arm/mpegvideo_neon.o          \
                                           arm/simple_idct_neon.o        \
... ...
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
                               int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
-                                        float mul, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
                                  const float *src1, int len);
 void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
... ...
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
 
 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                           int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
 
... ...
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
     c->butterflies_float          = ff_butterflies_float_neon;
     c->scalarproduct_float        = ff_scalarproduct_float_neon;
-    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
     c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
     c->vector_fmul_add            = ff_vector_fmul_add_neon;
     c->vector_clipf               = ff_vector_clipf_neon;
... ...
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
     c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
 
-    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        c->float_to_int16            = ff_float_to_int16_neon;
-        c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
-    }
-
     if (CONFIG_VORBIS_DECODER)
         c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
 
... ...
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
25 25
                         const float *src1, int len);
26 26
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
27 27
                                 const float *src1, int len);
28
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
29 28
 
30 29
 void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
31 30
 {
32 31
     c->vector_fmul = ff_vector_fmul_vfp;
33 32
     c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
34
-#if HAVE_ARMV6
35
-    c->float_to_int16 = ff_float_to_int16_vfp;
36
-#endif
37 33
 }
... ...
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
400 400
         bx              lr
401 401
 endfunc
402 402
 
403
-function ff_float_to_int16_neon, export=1
404
-        subs            r2,  r2,  #8
405
-        vld1.64         {d0-d1},  [r1,:128]!
406
-        vcvt.s32.f32    q8,  q0,  #16
407
-        vld1.64         {d2-d3},  [r1,:128]!
408
-        vcvt.s32.f32    q9,  q1,  #16
409
-        beq             3f
410
-        bics            ip,  r2,  #15
411
-        beq             2f
412
-1:      subs            ip,  ip,  #16
413
-        vshrn.s32       d4,  q8,  #16
414
-        vld1.64         {d0-d1},  [r1,:128]!
415
-        vcvt.s32.f32    q0,  q0,  #16
416
-        vshrn.s32       d5,  q9,  #16
417
-        vld1.64         {d2-d3},  [r1,:128]!
418
-        vcvt.s32.f32    q1,  q1,  #16
419
-        vshrn.s32       d6,  q0,  #16
420
-        vst1.64         {d4-d5},  [r0,:128]!
421
-        vshrn.s32       d7,  q1,  #16
422
-        vld1.64         {d16-d17},[r1,:128]!
423
-        vcvt.s32.f32    q8,  q8,  #16
424
-        vld1.64         {d18-d19},[r1,:128]!
425
-        vcvt.s32.f32    q9,  q9,  #16
426
-        vst1.64         {d6-d7},  [r0,:128]!
427
-        bne             1b
428
-        ands            r2,  r2,  #15
429
-        beq             3f
430
-2:      vld1.64         {d0-d1},  [r1,:128]!
431
-        vshrn.s32       d4,  q8,  #16
432
-        vcvt.s32.f32    q0,  q0,  #16
433
-        vld1.64         {d2-d3},  [r1,:128]!
434
-        vshrn.s32       d5,  q9,  #16
435
-        vcvt.s32.f32    q1,  q1,  #16
436
-        vshrn.s32       d6,  q0,  #16
437
-        vst1.64         {d4-d5},  [r0,:128]!
438
-        vshrn.s32       d7,  q1,  #16
439
-        vst1.64         {d6-d7},  [r0,:128]!
440
-        bx              lr
441
-3:      vshrn.s32       d4,  q8,  #16
442
-        vshrn.s32       d5,  q9,  #16
443
-        vst1.64         {d4-d5},  [r0,:128]!
444
-        bx              lr
445
-endfunc
446
-
447
-function ff_float_to_int16_interleave_neon, export=1
448
-        cmp             r3, #2
449
-        ldrlt           r1, [r1]
450
-        blt             ff_float_to_int16_neon
451
-        bne             4f
452
-
453
-        ldr             r3, [r1]
454
-        ldr             r1, [r1, #4]
455
-
456
-        subs            r2,  r2,  #8
457
-        vld1.64         {d0-d1},  [r3,:128]!
458
-        vcvt.s32.f32    q8,  q0,  #16
459
-        vld1.64         {d2-d3},  [r3,:128]!
460
-        vcvt.s32.f32    q9,  q1,  #16
461
-        vld1.64         {d20-d21},[r1,:128]!
462
-        vcvt.s32.f32    q10, q10, #16
463
-        vld1.64         {d22-d23},[r1,:128]!
464
-        vcvt.s32.f32    q11, q11, #16
465
-        beq             3f
466
-        bics            ip,  r2,  #15
467
-        beq             2f
468
-1:      subs            ip,  ip,  #16
469
-        vld1.64         {d0-d1},  [r3,:128]!
470
-        vcvt.s32.f32    q0,  q0,  #16
471
-        vsri.32         q10, q8,  #16
472
-        vld1.64         {d2-d3},  [r3,:128]!
473
-        vcvt.s32.f32    q1,  q1,  #16
474
-        vld1.64         {d24-d25},[r1,:128]!
475
-        vcvt.s32.f32    q12, q12, #16
476
-        vld1.64         {d26-d27},[r1,:128]!
477
-        vsri.32         q11, q9,  #16
478
-        vst1.64         {d20-d21},[r0,:128]!
479
-        vcvt.s32.f32    q13, q13, #16
480
-        vst1.64         {d22-d23},[r0,:128]!
481
-        vsri.32         q12, q0,  #16
482
-        vld1.64         {d16-d17},[r3,:128]!
483
-        vsri.32         q13, q1,  #16
484
-        vst1.64         {d24-d25},[r0,:128]!
485
-        vcvt.s32.f32    q8,  q8,  #16
486
-        vld1.64         {d18-d19},[r3,:128]!
487
-        vcvt.s32.f32    q9,  q9,  #16
488
-        vld1.64         {d20-d21},[r1,:128]!
489
-        vcvt.s32.f32    q10, q10, #16
490
-        vld1.64         {d22-d23},[r1,:128]!
491
-        vcvt.s32.f32    q11, q11, #16
492
-        vst1.64         {d26-d27},[r0,:128]!
493
-        bne             1b
494
-        ands            r2,  r2,  #15
495
-        beq             3f
496
-2:      vsri.32         q10, q8,  #16
497
-        vld1.64         {d0-d1},  [r3,:128]!
498
-        vcvt.s32.f32    q0,  q0,  #16
499
-        vld1.64         {d2-d3},  [r3,:128]!
500
-        vcvt.s32.f32    q1,  q1,  #16
501
-        vld1.64         {d24-d25},[r1,:128]!
502
-        vcvt.s32.f32    q12, q12, #16
503
-        vsri.32         q11, q9,  #16
504
-        vld1.64         {d26-d27},[r1,:128]!
505
-        vcvt.s32.f32    q13, q13, #16
506
-        vst1.64         {d20-d21},[r0,:128]!
507
-        vsri.32         q12, q0,  #16
508
-        vst1.64         {d22-d23},[r0,:128]!
509
-        vsri.32         q13, q1,  #16
510
-        vst1.64         {d24-d27},[r0,:128]!
511
-        bx              lr
512
-3:      vsri.32         q10, q8,  #16
513
-        vsri.32         q11, q9,  #16
514
-        vst1.64         {d20-d23},[r0,:128]!
515
-        bx              lr
516
-
517
-4:      push            {r4-r8,lr}
518
-        cmp             r3,  #4
519
-        lsl             ip,  r3,  #1
520
-        blt             4f
521
-
522
-        @ 4 channels
523
-5:      ldmia           r1!, {r4-r7}
524
-        mov             lr,  r2
525
-        mov             r8,  r0
526
-        vld1.64         {d16-d17},[r4,:128]!
527
-        vcvt.s32.f32    q8,  q8,  #16
528
-        vld1.64         {d18-d19},[r5,:128]!
529
-        vcvt.s32.f32    q9,  q9,  #16
530
-        vld1.64         {d20-d21},[r6,:128]!
531
-        vcvt.s32.f32    q10, q10, #16
532
-        vld1.64         {d22-d23},[r7,:128]!
533
-        vcvt.s32.f32    q11, q11, #16
534
-6:      subs            lr,  lr,  #8
535
-        vld1.64         {d0-d1},  [r4,:128]!
536
-        vcvt.s32.f32    q0,  q0,  #16
537
-        vsri.32         q9,  q8,  #16
538
-        vld1.64         {d2-d3},  [r5,:128]!
539
-        vcvt.s32.f32    q1,  q1,  #16
540
-        vsri.32         q11, q10, #16
541
-        vld1.64         {d4-d5},  [r6,:128]!
542
-        vcvt.s32.f32    q2,  q2,  #16
543
-        vzip.32         d18, d22
544
-        vld1.64         {d6-d7},  [r7,:128]!
545
-        vcvt.s32.f32    q3,  q3,  #16
546
-        vzip.32         d19, d23
547
-        vst1.64         {d18},    [r8], ip
548
-        vsri.32         q1,  q0,  #16
549
-        vst1.64         {d22},    [r8], ip
550
-        vsri.32         q3,  q2,  #16
551
-        vst1.64         {d19},    [r8], ip
552
-        vzip.32         d2,  d6
553
-        vst1.64         {d23},    [r8], ip
554
-        vzip.32         d3,  d7
555
-        beq             7f
556
-        vld1.64         {d16-d17},[r4,:128]!
557
-        vcvt.s32.f32    q8,  q8,  #16
558
-        vst1.64         {d2},     [r8], ip
559
-        vld1.64         {d18-d19},[r5,:128]!
560
-        vcvt.s32.f32    q9,  q9,  #16
561
-        vst1.64         {d6},     [r8], ip
562
-        vld1.64         {d20-d21},[r6,:128]!
563
-        vcvt.s32.f32    q10, q10, #16
564
-        vst1.64         {d3},     [r8], ip
565
-        vld1.64         {d22-d23},[r7,:128]!
566
-        vcvt.s32.f32    q11, q11, #16
567
-        vst1.64         {d7},     [r8], ip
568
-        b               6b
569
-7:      vst1.64         {d2},     [r8], ip
570
-        vst1.64         {d6},     [r8], ip
571
-        vst1.64         {d3},     [r8], ip
572
-        vst1.64         {d7},     [r8], ip
573
-        subs            r3,  r3,  #4
574
-        popeq           {r4-r8,pc}
575
-        cmp             r3,  #4
576
-        add             r0,  r0,  #8
577
-        bge             5b
578
-
579
-        @ 2 channels
580
-4:      cmp             r3,  #2
581
-        blt             4f
582
-        ldmia           r1!, {r4-r5}
583
-        mov             lr,  r2
584
-        mov             r8,  r0
585
-        tst             lr,  #8
586
-        vld1.64         {d16-d17},[r4,:128]!
587
-        vcvt.s32.f32    q8,  q8,  #16
588
-        vld1.64         {d18-d19},[r5,:128]!
589
-        vcvt.s32.f32    q9,  q9,  #16
590
-        vld1.64         {d20-d21},[r4,:128]!
591
-        vcvt.s32.f32    q10, q10, #16
592
-        vld1.64         {d22-d23},[r5,:128]!
593
-        vcvt.s32.f32    q11, q11, #16
594
-        beq             6f
595
-        subs            lr,  lr,  #8
596
-        beq             7f
597
-        vsri.32         d18, d16, #16
598
-        vsri.32         d19, d17, #16
599
-        vld1.64         {d16-d17},[r4,:128]!
600
-        vcvt.s32.f32    q8,  q8,  #16
601
-        vst1.32         {d18[0]}, [r8], ip
602
-        vsri.32         d22, d20, #16
603
-        vst1.32         {d18[1]}, [r8], ip
604
-        vsri.32         d23, d21, #16
605
-        vst1.32         {d19[0]}, [r8], ip
606
-        vst1.32         {d19[1]}, [r8], ip
607
-        vld1.64         {d18-d19},[r5,:128]!
608
-        vcvt.s32.f32    q9,  q9,  #16
609
-        vst1.32         {d22[0]}, [r8], ip
610
-        vst1.32         {d22[1]}, [r8], ip
611
-        vld1.64         {d20-d21},[r4,:128]!
612
-        vcvt.s32.f32    q10, q10, #16
613
-        vst1.32         {d23[0]}, [r8], ip
614
-        vst1.32         {d23[1]}, [r8], ip
615
-        vld1.64         {d22-d23},[r5,:128]!
616
-        vcvt.s32.f32    q11, q11, #16
617
-6:      subs            lr,  lr,  #16
618
-        vld1.64         {d0-d1},  [r4,:128]!
619
-        vcvt.s32.f32    q0,  q0,  #16
620
-        vsri.32         d18, d16, #16
621
-        vld1.64         {d2-d3},  [r5,:128]!
622
-        vcvt.s32.f32    q1,  q1,  #16
623
-        vsri.32         d19, d17, #16
624
-        vld1.64         {d4-d5},  [r4,:128]!
625
-        vcvt.s32.f32    q2,  q2,  #16
626
-        vld1.64         {d6-d7},  [r5,:128]!
627
-        vcvt.s32.f32    q3,  q3,  #16
628
-        vst1.32         {d18[0]}, [r8], ip
629
-        vsri.32         d22, d20, #16
630
-        vst1.32         {d18[1]}, [r8], ip
631
-        vsri.32         d23, d21, #16
632
-        vst1.32         {d19[0]}, [r8], ip
633
-        vsri.32         d2,  d0,  #16
634
-        vst1.32         {d19[1]}, [r8], ip
635
-        vsri.32         d3,  d1,  #16
636
-        vst1.32         {d22[0]}, [r8], ip
637
-        vsri.32         d6,  d4,  #16
638
-        vst1.32         {d22[1]}, [r8], ip
639
-        vsri.32         d7,  d5,  #16
640
-        vst1.32         {d23[0]}, [r8], ip
641
-        vst1.32         {d23[1]}, [r8], ip
642
-        beq             6f
643
-        vld1.64         {d16-d17},[r4,:128]!
644
-        vcvt.s32.f32    q8,  q8,  #16
645
-        vst1.32         {d2[0]},  [r8], ip
646
-        vst1.32         {d2[1]},  [r8], ip
647
-        vld1.64         {d18-d19},[r5,:128]!
648
-        vcvt.s32.f32    q9,  q9,  #16
649
-        vst1.32         {d3[0]},  [r8], ip
650
-        vst1.32         {d3[1]},  [r8], ip
651
-        vld1.64         {d20-d21},[r4,:128]!
652
-        vcvt.s32.f32    q10, q10, #16
653
-        vst1.32         {d6[0]},  [r8], ip
654
-        vst1.32         {d6[1]},  [r8], ip
655
-        vld1.64         {d22-d23},[r5,:128]!
656
-        vcvt.s32.f32    q11, q11, #16
657
-        vst1.32         {d7[0]},  [r8], ip
658
-        vst1.32         {d7[1]},  [r8], ip
659
-        bgt             6b
660
-6:      vst1.32         {d2[0]},  [r8], ip
661
-        vst1.32         {d2[1]},  [r8], ip
662
-        vst1.32         {d3[0]},  [r8], ip
663
-        vst1.32         {d3[1]},  [r8], ip
664
-        vst1.32         {d6[0]},  [r8], ip
665
-        vst1.32         {d6[1]},  [r8], ip
666
-        vst1.32         {d7[0]},  [r8], ip
667
-        vst1.32         {d7[1]},  [r8], ip
668
-        b               8f
669
-7:      vsri.32         d18, d16, #16
670
-        vsri.32         d19, d17, #16
671
-        vst1.32         {d18[0]}, [r8], ip
672
-        vsri.32         d22, d20, #16
673
-        vst1.32         {d18[1]}, [r8], ip
674
-        vsri.32         d23, d21, #16
675
-        vst1.32         {d19[0]}, [r8], ip
676
-        vst1.32         {d19[1]}, [r8], ip
677
-        vst1.32         {d22[0]}, [r8], ip
678
-        vst1.32         {d22[1]}, [r8], ip
679
-        vst1.32         {d23[0]}, [r8], ip
680
-        vst1.32         {d23[1]}, [r8], ip
681
-8:      subs            r3,  r3,  #2
682
-        add             r0,  r0,  #4
683
-        popeq           {r4-r8,pc}
684
-
685
-        @ 1 channel
686
-4:      ldr             r4,  [r1],#4
687
-        tst             r2,  #8
688
-        mov             lr,  r2
689
-        mov             r5,  r0
690
-        vld1.64         {d0-d1},  [r4,:128]!
691
-        vcvt.s32.f32    q0,  q0,  #16
692
-        vld1.64         {d2-d3},  [r4,:128]!
693
-        vcvt.s32.f32    q1,  q1,  #16
694
-        bne             8f
695
-6:      subs            lr,  lr,  #16
696
-        vld1.64         {d4-d5},  [r4,:128]!
697
-        vcvt.s32.f32    q2,  q2,  #16
698
-        vld1.64         {d6-d7},  [r4,:128]!
699
-        vcvt.s32.f32    q3,  q3,  #16
700
-        vst1.16         {d0[1]},  [r5,:16], ip
701
-        vst1.16         {d0[3]},  [r5,:16], ip
702
-        vst1.16         {d1[1]},  [r5,:16], ip
703
-        vst1.16         {d1[3]},  [r5,:16], ip
704
-        vst1.16         {d2[1]},  [r5,:16], ip
705
-        vst1.16         {d2[3]},  [r5,:16], ip
706
-        vst1.16         {d3[1]},  [r5,:16], ip
707
-        vst1.16         {d3[3]},  [r5,:16], ip
708
-        beq             7f
709
-        vld1.64         {d0-d1},  [r4,:128]!
710
-        vcvt.s32.f32    q0,  q0,  #16
711
-        vld1.64         {d2-d3},  [r4,:128]!
712
-        vcvt.s32.f32    q1,  q1,  #16
713
-7:      vst1.16         {d4[1]},  [r5,:16], ip
714
-        vst1.16         {d4[3]},  [r5,:16], ip
715
-        vst1.16         {d5[1]},  [r5,:16], ip
716
-        vst1.16         {d5[3]},  [r5,:16], ip
717
-        vst1.16         {d6[1]},  [r5,:16], ip
718
-        vst1.16         {d6[3]},  [r5,:16], ip
719
-        vst1.16         {d7[1]},  [r5,:16], ip
720
-        vst1.16         {d7[3]},  [r5,:16], ip
721
-        bgt             6b
722
-        pop             {r4-r8,pc}
723
-8:      subs            lr,  lr,  #8
724
-        vst1.16         {d0[1]},  [r5,:16], ip
725
-        vst1.16         {d0[3]},  [r5,:16], ip
726
-        vst1.16         {d1[1]},  [r5,:16], ip
727
-        vst1.16         {d1[3]},  [r5,:16], ip
728
-        vst1.16         {d2[1]},  [r5,:16], ip
729
-        vst1.16         {d2[3]},  [r5,:16], ip
730
-        vst1.16         {d3[1]},  [r5,:16], ip
731
-        vst1.16         {d3[3]},  [r5,:16], ip
732
-        popeq           {r4-r8,pc}
733
-        vld1.64         {d0-d1},  [r4,:128]!
734
-        vcvt.s32.f32    q0,  q0,  #16
735
-        vld1.64         {d2-d3},  [r4,:128]!
736
-        vcvt.s32.f32    q1,  q1,  #16
737
-        b               6b
738
-endfunc
739
-
740 403
 function ff_vector_fmul_neon, export=1
741 404
         subs            r3,  r3,  #8
742 405
         vld1.64         {d0-d3},  [r1,:128]!
... ...
@@ -1050,34 +713,6 @@ NOVFP   vmov.32         r0,  d0[0]
1050 1050
         bx              lr
1051 1051
 endfunc
1052 1052
 
1053
-function ff_int32_to_float_fmul_scalar_neon, export=1
1054
-VFP     vdup.32         q0,  d0[0]
1055
-VFP     len     .req    r2
1056
-NOVFP   vdup.32         q0,  r2
1057
-NOVFP   len     .req    r3
1058
-
1059
-        vld1.32         {q1},[r1,:128]!
1060
-        vcvt.f32.s32    q3,  q1
1061
-        vld1.32         {q2},[r1,:128]!
1062
-        vcvt.f32.s32    q8,  q2
1063
-1:      subs            len, len, #8
1064
-        pld             [r1, #16]
1065
-        vmul.f32        q9,  q3,  q0
1066
-        vmul.f32        q10, q8,  q0
1067
-        beq             2f
1068
-        vld1.32         {q1},[r1,:128]!
1069
-        vcvt.f32.s32    q3,  q1
1070
-        vld1.32         {q2},[r1,:128]!
1071
-        vcvt.f32.s32    q8,  q2
1072
-        vst1.32         {q9}, [r0,:128]!
1073
-        vst1.32         {q10},[r0,:128]!
1074
-        b               1b
1075
-2:      vst1.32         {q9}, [r0,:128]!
1076
-        vst1.32         {q10},[r0,:128]!
1077
-        bx              lr
1078
-        .unreq  len
1079
-endfunc
1080
-
1081 1053
 function ff_vector_fmul_reverse_neon, export=1
1082 1054
         add             r2,  r2,  r3,  lsl #2
1083 1055
         sub             r2,  r2,  #32
... ...
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
131 131
         vpop            {d8-d15}
132 132
         bx              lr
133 133
 endfunc
134
-
135
-#if HAVE_ARMV6
136
-/**
137
- * ARM VFP optimized float to int16 conversion.
138
- * Assume that len is a positive number and is multiple of 8, destination
139
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
140
- * performance), little endian byte sex
141
- */
142
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
143
-function ff_float_to_int16_vfp, export=1
144
-        push            {r4-r8,lr}
145
-        vpush           {d8-d11}
146
-        vldmia          r1!, {s16-s23}
147
-        vcvt.s32.f32    s0,  s16
148
-        vcvt.s32.f32    s1,  s17
149
-        vcvt.s32.f32    s2,  s18
150
-        vcvt.s32.f32    s3,  s19
151
-        vcvt.s32.f32    s4,  s20
152
-        vcvt.s32.f32    s5,  s21
153
-        vcvt.s32.f32    s6,  s22
154
-        vcvt.s32.f32    s7,  s23
155
-1:
156
-        subs            r2,  r2,  #8
157
-        vmov            r3,  r4,  s0, s1
158
-        vmov            r5,  r6,  s2, s3
159
-        vmov            r7,  r8,  s4, s5
160
-        vmov            ip,  lr,  s6, s7
161
-        vldmiagt        r1!, {s16-s23}
162
-        ssat            r4,  #16, r4
163
-        ssat            r3,  #16, r3
164
-        ssat            r6,  #16, r6
165
-        ssat            r5,  #16, r5
166
-        pkhbt           r3,  r3,  r4, lsl #16
167
-        pkhbt           r4,  r5,  r6, lsl #16
168
-        vcvtgt.s32.f32  s0,  s16
169
-        vcvtgt.s32.f32  s1,  s17
170
-        vcvtgt.s32.f32  s2,  s18
171
-        vcvtgt.s32.f32  s3,  s19
172
-        vcvtgt.s32.f32  s4,  s20
173
-        vcvtgt.s32.f32  s5,  s21
174
-        vcvtgt.s32.f32  s6,  s22
175
-        vcvtgt.s32.f32  s7,  s23
176
-        ssat            r8,  #16, r8
177
-        ssat            r7,  #16, r7
178
-        ssat            lr,  #16, lr
179
-        ssat            ip,  #16, ip
180
-        pkhbt           r5,  r7,  r8, lsl #16
181
-        pkhbt           r6,  ip,  lr, lsl #16
182
-        stmia           r0!, {r3-r6}
183
-        bgt             1b
184
-
185
-        vpop            {d8-d11}
186
-        pop             {r4-r8,pc}
187
-endfunc
188
-#endif
189 134
new file mode 100644
... ...
@@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+                                        float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    if (HAVE_ARMVFP && HAVE_ARMV6) {
+        c->float_to_int16 = ff_float_to_int16_vfp;
+    }
+
+    if (HAVE_NEON) {
+        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->float_to_int16            = ff_float_to_int16_neon;
+            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+        }
+    }
+}
new file mode 100644
... ...
@@ -0,0 +1,391 @@
0
+/*
1
+ * ARM NEON optimised Format Conversion Utils
2
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with FFmpeg; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include "config.h"
22
+#include "asm.S"
23
+
24
+        preserve8
25
+        .text
26
+
27
+function ff_float_to_int16_neon, export=1
28
+        subs            r2,  r2,  #8
29
+        vld1.64         {d0-d1},  [r1,:128]!
30
+        vcvt.s32.f32    q8,  q0,  #16
31
+        vld1.64         {d2-d3},  [r1,:128]!
32
+        vcvt.s32.f32    q9,  q1,  #16
33
+        beq             3f
34
+        bics            ip,  r2,  #15
35
+        beq             2f
36
+1:      subs            ip,  ip,  #16
37
+        vshrn.s32       d4,  q8,  #16
38
+        vld1.64         {d0-d1},  [r1,:128]!
39
+        vcvt.s32.f32    q0,  q0,  #16
40
+        vshrn.s32       d5,  q9,  #16
41
+        vld1.64         {d2-d3},  [r1,:128]!
42
+        vcvt.s32.f32    q1,  q1,  #16
43
+        vshrn.s32       d6,  q0,  #16
44
+        vst1.64         {d4-d5},  [r0,:128]!
45
+        vshrn.s32       d7,  q1,  #16
46
+        vld1.64         {d16-d17},[r1,:128]!
47
+        vcvt.s32.f32    q8,  q8,  #16
48
+        vld1.64         {d18-d19},[r1,:128]!
49
+        vcvt.s32.f32    q9,  q9,  #16
50
+        vst1.64         {d6-d7},  [r0,:128]!
51
+        bne             1b
52
+        ands            r2,  r2,  #15
53
+        beq             3f
54
+2:      vld1.64         {d0-d1},  [r1,:128]!
55
+        vshrn.s32       d4,  q8,  #16
56
+        vcvt.s32.f32    q0,  q0,  #16
57
+        vld1.64         {d2-d3},  [r1,:128]!
58
+        vshrn.s32       d5,  q9,  #16
59
+        vcvt.s32.f32    q1,  q1,  #16
60
+        vshrn.s32       d6,  q0,  #16
61
+        vst1.64         {d4-d5},  [r0,:128]!
62
+        vshrn.s32       d7,  q1,  #16
63
+        vst1.64         {d6-d7},  [r0,:128]!
64
+        bx              lr
65
+3:      vshrn.s32       d4,  q8,  #16
66
+        vshrn.s32       d5,  q9,  #16
67
+        vst1.64         {d4-d5},  [r0,:128]!
68
+        bx              lr
69
+endfunc
70
+
71
+function ff_float_to_int16_interleave_neon, export=1
72
+        cmp             r3, #2
73
+        ldrlt           r1, [r1]
74
+        blt             ff_float_to_int16_neon
75
+        bne             4f
76
+
77
+        ldr             r3, [r1]
78
+        ldr             r1, [r1, #4]
79
+
80
+        subs            r2,  r2,  #8
81
+        vld1.64         {d0-d1},  [r3,:128]!
82
+        vcvt.s32.f32    q8,  q0,  #16
83
+        vld1.64         {d2-d3},  [r3,:128]!
84
+        vcvt.s32.f32    q9,  q1,  #16
85
+        vld1.64         {d20-d21},[r1,:128]!
86
+        vcvt.s32.f32    q10, q10, #16
87
+        vld1.64         {d22-d23},[r1,:128]!
88
+        vcvt.s32.f32    q11, q11, #16
89
+        beq             3f
90
+        bics            ip,  r2,  #15
91
+        beq             2f
92
+1:      subs            ip,  ip,  #16
93
+        vld1.64         {d0-d1},  [r3,:128]!
94
+        vcvt.s32.f32    q0,  q0,  #16
95
+        vsri.32         q10, q8,  #16
96
+        vld1.64         {d2-d3},  [r3,:128]!
97
+        vcvt.s32.f32    q1,  q1,  #16
98
+        vld1.64         {d24-d25},[r1,:128]!
99
+        vcvt.s32.f32    q12, q12, #16
100
+        vld1.64         {d26-d27},[r1,:128]!
101
+        vsri.32         q11, q9,  #16
102
+        vst1.64         {d20-d21},[r0,:128]!
103
+        vcvt.s32.f32    q13, q13, #16
104
+        vst1.64         {d22-d23},[r0,:128]!
105
+        vsri.32         q12, q0,  #16
106
+        vld1.64         {d16-d17},[r3,:128]!
107
+        vsri.32         q13, q1,  #16
108
+        vst1.64         {d24-d25},[r0,:128]!
109
+        vcvt.s32.f32    q8,  q8,  #16
110
+        vld1.64         {d18-d19},[r3,:128]!
111
+        vcvt.s32.f32    q9,  q9,  #16
112
+        vld1.64         {d20-d21},[r1,:128]!
113
+        vcvt.s32.f32    q10, q10, #16
114
+        vld1.64         {d22-d23},[r1,:128]!
115
+        vcvt.s32.f32    q11, q11, #16
116
+        vst1.64         {d26-d27},[r0,:128]!
117
+        bne             1b
118
+        ands            r2,  r2,  #15
119
+        beq             3f
120
+2:      vsri.32         q10, q8,  #16
121
+        vld1.64         {d0-d1},  [r3,:128]!
122
+        vcvt.s32.f32    q0,  q0,  #16
123
+        vld1.64         {d2-d3},  [r3,:128]!
124
+        vcvt.s32.f32    q1,  q1,  #16
125
+        vld1.64         {d24-d25},[r1,:128]!
126
+        vcvt.s32.f32    q12, q12, #16
127
+        vsri.32         q11, q9,  #16
128
+        vld1.64         {d26-d27},[r1,:128]!
129
+        vcvt.s32.f32    q13, q13, #16
130
+        vst1.64         {d20-d21},[r0,:128]!
131
+        vsri.32         q12, q0,  #16
132
+        vst1.64         {d22-d23},[r0,:128]!
133
+        vsri.32         q13, q1,  #16
134
+        vst1.64         {d24-d27},[r0,:128]!
135
+        bx              lr
136
+3:      vsri.32         q10, q8,  #16
137
+        vsri.32         q11, q9,  #16
138
+        vst1.64         {d20-d23},[r0,:128]!
139
+        bx              lr
140
+
141
+4:      push            {r4-r8,lr}
142
+        cmp             r3,  #4
143
+        lsl             ip,  r3,  #1
144
+        blt             4f
145
+
146
+        @ 4 channels
147
+5:      ldmia           r1!, {r4-r7}
148
+        mov             lr,  r2
149
+        mov             r8,  r0
150
+        vld1.64         {d16-d17},[r4,:128]!
151
+        vcvt.s32.f32    q8,  q8,  #16
152
+        vld1.64         {d18-d19},[r5,:128]!
153
+        vcvt.s32.f32    q9,  q9,  #16
154
+        vld1.64         {d20-d21},[r6,:128]!
155
+        vcvt.s32.f32    q10, q10, #16
156
+        vld1.64         {d22-d23},[r7,:128]!
157
+        vcvt.s32.f32    q11, q11, #16
158
+6:      subs            lr,  lr,  #8
159
+        vld1.64         {d0-d1},  [r4,:128]!
160
+        vcvt.s32.f32    q0,  q0,  #16
161
+        vsri.32         q9,  q8,  #16
162
+        vld1.64         {d2-d3},  [r5,:128]!
163
+        vcvt.s32.f32    q1,  q1,  #16
164
+        vsri.32         q11, q10, #16
165
+        vld1.64         {d4-d5},  [r6,:128]!
166
+        vcvt.s32.f32    q2,  q2,  #16
167
+        vzip.32         d18, d22
168
+        vld1.64         {d6-d7},  [r7,:128]!
169
+        vcvt.s32.f32    q3,  q3,  #16
170
+        vzip.32         d19, d23
171
+        vst1.64         {d18},    [r8], ip
172
+        vsri.32         q1,  q0,  #16
173
+        vst1.64         {d22},    [r8], ip
174
+        vsri.32         q3,  q2,  #16
175
+        vst1.64         {d19},    [r8], ip
176
+        vzip.32         d2,  d6
177
+        vst1.64         {d23},    [r8], ip
178
+        vzip.32         d3,  d7
179
+        beq             7f
180
+        vld1.64         {d16-d17},[r4,:128]!
181
+        vcvt.s32.f32    q8,  q8,  #16
182
+        vst1.64         {d2},     [r8], ip
183
+        vld1.64         {d18-d19},[r5,:128]!
184
+        vcvt.s32.f32    q9,  q9,  #16
185
+        vst1.64         {d6},     [r8], ip
186
+        vld1.64         {d20-d21},[r6,:128]!
187
+        vcvt.s32.f32    q10, q10, #16
188
+        vst1.64         {d3},     [r8], ip
189
+        vld1.64         {d22-d23},[r7,:128]!
190
+        vcvt.s32.f32    q11, q11, #16
191
+        vst1.64         {d7},     [r8], ip
192
+        b               6b
193
+7:      vst1.64         {d2},     [r8], ip
194
+        vst1.64         {d6},     [r8], ip
195
+        vst1.64         {d3},     [r8], ip
196
+        vst1.64         {d7},     [r8], ip
197
+        subs            r3,  r3,  #4
198
+        popeq           {r4-r8,pc}
199
+        cmp             r3,  #4
200
+        add             r0,  r0,  #8
201
+        bge             5b
202
+
203
+        @ 2 channels
204
+4:      cmp             r3,  #2
205
+        blt             4f
206
+        ldmia           r1!, {r4-r5}
207
+        mov             lr,  r2
208
+        mov             r8,  r0
209
+        tst             lr,  #8
210
+        vld1.64         {d16-d17},[r4,:128]!
211
+        vcvt.s32.f32    q8,  q8,  #16
212
+        vld1.64         {d18-d19},[r5,:128]!
213
+        vcvt.s32.f32    q9,  q9,  #16
214
+        vld1.64         {d20-d21},[r4,:128]!
215
+        vcvt.s32.f32    q10, q10, #16
216
+        vld1.64         {d22-d23},[r5,:128]!
217
+        vcvt.s32.f32    q11, q11, #16
218
+        beq             6f
219
+        subs            lr,  lr,  #8
220
+        beq             7f
221
+        vsri.32         d18, d16, #16
222
+        vsri.32         d19, d17, #16
223
+        vld1.64         {d16-d17},[r4,:128]!
224
+        vcvt.s32.f32    q8,  q8,  #16
225
+        vst1.32         {d18[0]}, [r8], ip
226
+        vsri.32         d22, d20, #16
227
+        vst1.32         {d18[1]}, [r8], ip
228
+        vsri.32         d23, d21, #16
229
+        vst1.32         {d19[0]}, [r8], ip
230
+        vst1.32         {d19[1]}, [r8], ip
231
+        vld1.64         {d18-d19},[r5,:128]!
232
+        vcvt.s32.f32    q9,  q9,  #16
233
+        vst1.32         {d22[0]}, [r8], ip
234
+        vst1.32         {d22[1]}, [r8], ip
235
+        vld1.64         {d20-d21},[r4,:128]!
236
+        vcvt.s32.f32    q10, q10, #16
237
+        vst1.32         {d23[0]}, [r8], ip
238
+        vst1.32         {d23[1]}, [r8], ip
239
+        vld1.64         {d22-d23},[r5,:128]!
240
+        vcvt.s32.f32    q11, q11, #16
241
+6:      subs            lr,  lr,  #16
242
+        vld1.64         {d0-d1},  [r4,:128]!
243
+        vcvt.s32.f32    q0,  q0,  #16
244
+        vsri.32         d18, d16, #16
245
+        vld1.64         {d2-d3},  [r5,:128]!
246
+        vcvt.s32.f32    q1,  q1,  #16
247
+        vsri.32         d19, d17, #16
248
+        vld1.64         {d4-d5},  [r4,:128]!
249
+        vcvt.s32.f32    q2,  q2,  #16
250
+        vld1.64         {d6-d7},  [r5,:128]!
251
+        vcvt.s32.f32    q3,  q3,  #16
252
+        vst1.32         {d18[0]}, [r8], ip
253
+        vsri.32         d22, d20, #16
254
+        vst1.32         {d18[1]}, [r8], ip
255
+        vsri.32         d23, d21, #16
256
+        vst1.32         {d19[0]}, [r8], ip
257
+        vsri.32         d2,  d0,  #16
258
+        vst1.32         {d19[1]}, [r8], ip
259
+        vsri.32         d3,  d1,  #16
260
+        vst1.32         {d22[0]}, [r8], ip
261
+        vsri.32         d6,  d4,  #16
262
+        vst1.32         {d22[1]}, [r8], ip
263
+        vsri.32         d7,  d5,  #16
264
+        vst1.32         {d23[0]}, [r8], ip
265
+        vst1.32         {d23[1]}, [r8], ip
266
+        beq             6f
267
+        vld1.64         {d16-d17},[r4,:128]!
268
+        vcvt.s32.f32    q8,  q8,  #16
269
+        vst1.32         {d2[0]},  [r8], ip
270
+        vst1.32         {d2[1]},  [r8], ip
271
+        vld1.64         {d18-d19},[r5,:128]!
272
+        vcvt.s32.f32    q9,  q9,  #16
273
+        vst1.32         {d3[0]},  [r8], ip
274
+        vst1.32         {d3[1]},  [r8], ip
275
+        vld1.64         {d20-d21},[r4,:128]!
276
+        vcvt.s32.f32    q10, q10, #16
277
+        vst1.32         {d6[0]},  [r8], ip
278
+        vst1.32         {d6[1]},  [r8], ip
279
+        vld1.64         {d22-d23},[r5,:128]!
280
+        vcvt.s32.f32    q11, q11, #16
281
+        vst1.32         {d7[0]},  [r8], ip
282
+        vst1.32         {d7[1]},  [r8], ip
283
+        bgt             6b
284
+6:      vst1.32         {d2[0]},  [r8], ip
285
+        vst1.32         {d2[1]},  [r8], ip
286
+        vst1.32         {d3[0]},  [r8], ip
287
+        vst1.32         {d3[1]},  [r8], ip
288
+        vst1.32         {d6[0]},  [r8], ip
289
+        vst1.32         {d6[1]},  [r8], ip
290
+        vst1.32         {d7[0]},  [r8], ip
291
+        vst1.32         {d7[1]},  [r8], ip
292
+        b               8f
293
+7:      vsri.32         d18, d16, #16
294
+        vsri.32         d19, d17, #16
295
+        vst1.32         {d18[0]}, [r8], ip
296
+        vsri.32         d22, d20, #16
297
+        vst1.32         {d18[1]}, [r8], ip
298
+        vsri.32         d23, d21, #16
299
+        vst1.32         {d19[0]}, [r8], ip
300
+        vst1.32         {d19[1]}, [r8], ip
301
+        vst1.32         {d22[0]}, [r8], ip
302
+        vst1.32         {d22[1]}, [r8], ip
303
+        vst1.32         {d23[0]}, [r8], ip
304
+        vst1.32         {d23[1]}, [r8], ip
305
+8:      subs            r3,  r3,  #2
306
+        add             r0,  r0,  #4
307
+        popeq           {r4-r8,pc}
308
+
309
+        @ 1 channel
310
+4:      ldr             r4,  [r1],#4
311
+        tst             r2,  #8
312
+        mov             lr,  r2
313
+        mov             r5,  r0
314
+        vld1.64         {d0-d1},  [r4,:128]!
315
+        vcvt.s32.f32    q0,  q0,  #16
316
+        vld1.64         {d2-d3},  [r4,:128]!
317
+        vcvt.s32.f32    q1,  q1,  #16
318
+        bne             8f
319
+6:      subs            lr,  lr,  #16
320
+        vld1.64         {d4-d5},  [r4,:128]!
321
+        vcvt.s32.f32    q2,  q2,  #16
322
+        vld1.64         {d6-d7},  [r4,:128]!
323
+        vcvt.s32.f32    q3,  q3,  #16
324
+        vst1.16         {d0[1]},  [r5,:16], ip
325
+        vst1.16         {d0[3]},  [r5,:16], ip
326
+        vst1.16         {d1[1]},  [r5,:16], ip
327
+        vst1.16         {d1[3]},  [r5,:16], ip
328
+        vst1.16         {d2[1]},  [r5,:16], ip
329
+        vst1.16         {d2[3]},  [r5,:16], ip
330
+        vst1.16         {d3[1]},  [r5,:16], ip
331
+        vst1.16         {d3[3]},  [r5,:16], ip
332
+        beq             7f
333
+        vld1.64         {d0-d1},  [r4,:128]!
334
+        vcvt.s32.f32    q0,  q0,  #16
335
+        vld1.64         {d2-d3},  [r4,:128]!
336
+        vcvt.s32.f32    q1,  q1,  #16
337
+7:      vst1.16         {d4[1]},  [r5,:16], ip
338
+        vst1.16         {d4[3]},  [r5,:16], ip
339
+        vst1.16         {d5[1]},  [r5,:16], ip
340
+        vst1.16         {d5[3]},  [r5,:16], ip
341
+        vst1.16         {d6[1]},  [r5,:16], ip
342
+        vst1.16         {d6[3]},  [r5,:16], ip
343
+        vst1.16         {d7[1]},  [r5,:16], ip
344
+        vst1.16         {d7[3]},  [r5,:16], ip
345
+        bgt             6b
346
+        pop             {r4-r8,pc}
347
+8:      subs            lr,  lr,  #8
348
+        vst1.16         {d0[1]},  [r5,:16], ip
349
+        vst1.16         {d0[3]},  [r5,:16], ip
350
+        vst1.16         {d1[1]},  [r5,:16], ip
351
+        vst1.16         {d1[3]},  [r5,:16], ip
352
+        vst1.16         {d2[1]},  [r5,:16], ip
353
+        vst1.16         {d2[3]},  [r5,:16], ip
354
+        vst1.16         {d3[1]},  [r5,:16], ip
355
+        vst1.16         {d3[3]},  [r5,:16], ip
356
+        popeq           {r4-r8,pc}
357
+        vld1.64         {d0-d1},  [r4,:128]!
358
+        vcvt.s32.f32    q0,  q0,  #16
359
+        vld1.64         {d2-d3},  [r4,:128]!
360
+        vcvt.s32.f32    q1,  q1,  #16
361
+        b               6b
362
+endfunc
363
+
364
+function ff_int32_to_float_fmul_scalar_neon, export=1
365
+VFP     vdup.32         q0,  d0[0]
366
+VFP     len     .req    r2
367
+NOVFP   vdup.32         q0,  r2
368
+NOVFP   len     .req    r3
369
+
370
+        vld1.32         {q1},[r1,:128]!
371
+        vcvt.f32.s32    q3,  q1
372
+        vld1.32         {q2},[r1,:128]!
373
+        vcvt.f32.s32    q8,  q2
374
+1:      subs            len, len, #8
375
+        pld             [r1, #16]
376
+        vmul.f32        q9,  q3,  q0
377
+        vmul.f32        q10, q8,  q0
378
+        beq             2f
379
+        vld1.32         {q1},[r1,:128]!
380
+        vcvt.f32.s32    q3,  q1
381
+        vld1.32         {q2},[r1,:128]!
382
+        vcvt.f32.s32    q8,  q2
383
+        vst1.32         {q9}, [r0,:128]!
384
+        vst1.32         {q10},[r0,:128]!
385
+        b               1b
386
+2:      vst1.32         {q9}, [r0,:128]!
387
+        vst1.32         {q10},[r0,:128]!
388
+        bx              lr
389
+        .unreq  len
390
+endfunc
0 391
new file mode 100644
... ...
@@ -0,0 +1,77 @@
0
+/*
1
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "config.h"
21
+#include "asm.S"
22
+
23
+        .syntax unified
24
+
25
+/**
26
+ * ARM VFP optimized float to int16 conversion.
27
+ * Assume that len is a positive number and is multiple of 8, destination
28
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
29
+ * performance), little endian byte sex
30
+ */
31
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
32
+function ff_float_to_int16_vfp, export=1
33
+        push            {r4-r8,lr}
34
+        vpush           {d8-d11}
35
+        vldmia          r1!, {s16-s23}
36
+        vcvt.s32.f32    s0,  s16
37
+        vcvt.s32.f32    s1,  s17
38
+        vcvt.s32.f32    s2,  s18
39
+        vcvt.s32.f32    s3,  s19
40
+        vcvt.s32.f32    s4,  s20
41
+        vcvt.s32.f32    s5,  s21
42
+        vcvt.s32.f32    s6,  s22
43
+        vcvt.s32.f32    s7,  s23
44
+1:
45
+        subs            r2,  r2,  #8
46
+        vmov            r3,  r4,  s0, s1
47
+        vmov            r5,  r6,  s2, s3
48
+        vmov            r7,  r8,  s4, s5
49
+        vmov            ip,  lr,  s6, s7
50
+        vldmiagt        r1!, {s16-s23}
51
+        ssat            r4,  #16, r4
52
+        ssat            r3,  #16, r3
53
+        ssat            r6,  #16, r6
54
+        ssat            r5,  #16, r5
55
+        pkhbt           r3,  r3,  r4, lsl #16
56
+        pkhbt           r4,  r5,  r6, lsl #16
57
+        vcvtgt.s32.f32  s0,  s16
58
+        vcvtgt.s32.f32  s1,  s17
59
+        vcvtgt.s32.f32  s2,  s18
60
+        vcvtgt.s32.f32  s3,  s19
61
+        vcvtgt.s32.f32  s4,  s20
62
+        vcvtgt.s32.f32  s5,  s21
63
+        vcvtgt.s32.f32  s6,  s22
64
+        vcvtgt.s32.f32  s7,  s23
65
+        ssat            r8,  #16, r8
66
+        ssat            r7,  #16, r7
67
+        ssat            lr,  #16, lr
68
+        ssat            ip,  #16, ip
69
+        pkhbt           r5,  r7,  r8, lsl #16
70
+        pkhbt           r6,  ip,  lr, lsl #16
71
+        stmia           r0!, {r3-r6}
72
+        bgt             1b
73
+
74
+        vpop            {d8-d11}
75
+        pop             {r4-r8,pc}
76
+endfunc
... ...
@@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 
 extern const uint16_t ff_wma_critical_freqs[25];
 
... ...
@@ -43,6 +44,7 @@ typedef struct {
     AVCodecContext *avctx;
     GetBitContext gb;
     DSPContext dsp;
+    FmtConvertContext fmt_conv;
     int first;
     int channels;
     int frame_len;          ///< transform size (samples)
... ...
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
     dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
 
     /* determine frame length */
     if (avctx->sample_rate < 22050) {
... ...
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
             ff_rdft_calc(&s->trans.rdft, coeffs);
     }
 
-    s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
+    s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
+                                          s->frame_len, s->channels);
 
     if (!s->first) {
         int count = s->overlap_len * s->channels;
... ...
@@ -40,6 +40,7 @@
 #include "dca.h"
 #include "synth_filter.h"
 #include "dcadsp.h"
+#include "fmtconvert.h"
 
 //#define TRACE
 
... ...
@@ -347,6 +348,7 @@ typedef struct {
     FFTContext imdct;
     SynthFilterContext synth;
     DCADSPContext dcadsp;
+    FmtConvertContext fmt_conv;
 } DCAContext;
 
 static const uint16_t dca_vlc_offs[] = {
... ...
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
                         block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
                 }
 
-                s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
+                s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
                                                   block, rscale, 8);
             }
 
... ...
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
             }
         }
 
-        s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
+        s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
         samples += 256 * channels;
     }
 
... ...
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
     ff_mdct_init(&s->imdct, 6, 1, 1.0);
     ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
 
     for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
         s->samples_chanptr[i] = s->samples + i * 256;
... ...
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
     return p;
 }
 
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = src[i] * mul;
-}
-
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                    uint32_t maxi, uint32_t maxisign)
 {
... ...
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
     }
 }
 
-static av_always_inline int float_to_int16_one(const float *src){
-    return av_clip_int16(lrintf(*src));
-}
-
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = float_to_int16_one(src+i);
-}
-
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
-    int i,j,c;
-    if(channels==2){
-        for(i=0; i<len; i++){
-            dst[2*i]   = float_to_int16_one(src[0]+i);
-            dst[2*i+1] = float_to_int16_one(src[1]+i);
-        }
-    }else{
-        for(c=0; c<channels; c++)
-            for(i=0, j=c; i<len; i++, j+=channels)
-                dst[j] = float_to_int16_one(src[c]+i);
-    }
-}
-
 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
 {
     int res = 0;
... ...
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_fmul_reverse = vector_fmul_reverse_c;
     c->vector_fmul_add = vector_fmul_add_c;
     c->vector_fmul_window = vector_fmul_window_c;
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->vector_clipf = vector_clipf_c;
-    c->float_to_int16 = ff_float_to_int16_c;
-    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
... ...
@@ -392,7 +392,6 @@ typedef struct DSPContext {
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
-    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
     void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
     /**
      * Multiply a vector of floats by a scalar float.  Source and
... ...
@@ -445,10 +444,6 @@ typedef struct DSPContext {
      */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
 
-    /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
-    void (*float_to_int16)(int16_t *dst, const float *src, long len);
-    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
     /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
new file mode 100644
... ...
@@ -0,0 +1,68 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#include "avcodec.h"
23
+#include "fmtconvert.h"
24
+
25
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
26
+    int i;
27
+    for(i=0; i<len; i++)
28
+        dst[i] = src[i] * mul;
29
+}
30
+
31
+static av_always_inline int float_to_int16_one(const float *src){
32
+    return av_clip_int16(lrintf(*src));
33
+}
34
+
35
+static void float_to_int16_c(int16_t *dst, const float *src, long len)
36
+{
37
+    int i;
38
+    for(i=0; i<len; i++)
39
+        dst[i] = float_to_int16_one(src+i);
40
+}
41
+
42
+static void float_to_int16_interleave_c(int16_t *dst, const float **src,
43
+                                        long len, int channels)
44
+{
45
+    int i,j,c;
46
+    if(channels==2){
47
+        for(i=0; i<len; i++){
48
+            dst[2*i]   = float_to_int16_one(src[0]+i);
49
+            dst[2*i+1] = float_to_int16_one(src[1]+i);
50
+        }
51
+    }else{
52
+        for(c=0; c<channels; c++)
53
+            for(i=0, j=c; i<len; i++, j+=channels)
54
+                dst[j] = float_to_int16_one(src[c]+i);
55
+    }
56
+}
57
+
58
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
59
+{
60
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
61
+    c->float_to_int16             = float_to_int16_c;
62
+    c->float_to_int16_interleave  = float_to_int16_interleave_c;
63
+
64
+    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
65
+    if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx);
66
+    if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
67
+}
0 68
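For reference, the per-sample rule implemented by float_to_int16_one() above (round with lrintf(), then saturate to the int16_t range) can be reproduced outside libavcodec. The small standalone program below is a re-implementation for illustration only, with clip_int16() standing in for av_clip_int16(); it is not the library function itself.

#include <math.h>      /* lrintf() */
#include <stdint.h>
#include <stdio.h>

/* local stand-in for av_clip_int16(): saturate to [-32768,32767] */
static int16_t clip_int16(long v)
{
    if (v < -32768) return -32768;
    if (v >  32767) return  32767;
    return (int16_t)v;
}

int main(void)
{
    const float src[4] = { 0.4f, 0.6f, 40000.0f, -40000.0f };
    int16_t dst[4];
    int i;

    /* same element-wise rule as float_to_int16_c() */
    for (i = 0; i < 4; i++)
        dst[i] = clip_int16(lrintf(src[i]));

    for (i = 0; i < 4; i++)
        printf("%d\n", dst[i]);   /* prints 0, 1, 32767, -32768 */
    return 0;
}

Build with e.g. cc example.c -lm; out-of-range inputs saturate rather than wrap, which is the behaviour the decoders rely on.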
new file mode 100644
... ...
@@ -0,0 +1,79 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#ifndef AVCODEC_FMTCONVERT_H
23
+#define AVCODEC_FMTCONVERT_H
24
+
25
+#include "avcodec.h"
26
+
27
+typedef struct FmtConvertContext {
28
+    /**
29
+     * Convert an array of int32_t to float and multiply by a float value.
30
+     * @param dst destination array of float.
31
+     *            constraints: 16-byte aligned
32
+     * @param src source array of int32_t.
33
+     *            constraints: 16-byte aligned
34
+     * @param len number of elements to convert.
35
+     *            constraints: multiple of 8
36
+     */
37
+    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
38
+
39
+    /**
40
+     * Convert an array of float to an array of int16_t.
41
+     *
42
+     * Convert floats in the range [-32768.0,32767.0] to ints
43
+     * without rescaling.
44
+     *
45
+     * @param dst destination array of int16_t.
46
+     *            constraints: 16-byte aligned
47
+     * @param src source array of float.
48
+     *            constraints: 16-byte aligned
49
+     * @param len number of elements to convert.
50
+     *            constraints: multiple of 8
51
+     */
52
+    void (*float_to_int16)(int16_t *dst, const float *src, long len);
53
+
54
+    /**
55
+     * Convert multiple arrays of float to an interleaved array of int16_t.
56
+     *
57
+     * Convert floats in the range [-32768.0,32767.0] to ints
58
+     * without rescaling.
59
+     *
60
+     * @param dst destination array of interleaved int16_t.
61
+     *            constraints: 16-byte aligned
62
+     * @param src source array of float arrays, one for each channel.
63
+     *            constraints: 16-byte aligned
64
+     * @param len number of elements to convert.
65
+     *            constraints: multiple of 8
66
+     * @param channels number of channels
67
+     */
68
+    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
69
+                                      long len, int channels);
70
+} FmtConvertContext;
71
+
72
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
73
+
74
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
75
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
76
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
77
+
78
+#endif /* AVCODEC_FMTCONVERT_H */
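To show how this context is meant to be used, here is a minimal sketch following the same pattern this commit applies to the AAC, Vorbis, WMA and Nellymoser decoders: embed a FmtConvertContext in the codec's private context, initialize it once, and call the function pointers when writing output. MyDecodeContext, my_decode_init() and my_output_samples() are hypothetical names for this sketch, not part of the patch.

#include "avcodec.h"
#include "fmtconvert.h"

typedef struct MyDecodeContext {
    FmtConvertContext fmt_conv;  /* replaces the conversion pointers formerly in DSPContext */
    /* ... other decoder state ... */
} MyDecodeContext;

static av_cold int my_decode_init(AVCodecContext *avctx, MyDecodeContext *s)
{
    /* installs the C versions, then lets the ARM/PPC/x86 init override them */
    ff_fmt_convert_init(&s->fmt_conv, avctx);
    return 0;
}

static void my_output_samples(MyDecodeContext *s, int16_t *out,
                              const float **planar, int nb_samples, int channels)
{
    /* interleave per-channel float buffers into packed int16_t samples */
    s->fmt_conv.float_to_int16_interleave(out, planar, nb_samples, channels);
}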
... ...
@@ -38,6 +38,7 @@
38 38
 #include "avcodec.h"
39 39
 #include "dsputil.h"
40 40
 #include "fft.h"
41
+#include "fmtconvert.h"
41 42
 
42 43
 #define ALT_BITSTREAM_READER_LE
43 44
 #include "get_bits.h"
... ...
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
52 52
     float           scale_bias;
53 53
     DSPContext      dsp;
54 54
     FFTContext      imdct_ctx;
55
+    FmtConvertContext fmt_conv;
55 56
     DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
56 57
 } NellyMoserDecodeContext;
57 58
 
... ...
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
134 134
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
135 135
 
136 136
     dsputil_init(&s->dsp, avctx);
137
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
137 138
 
138 139
     s->scale_bias = 1.0/(1*8);
139 140
 
... ...
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
175 175
 
176 176
     for (i=0 ; i<blocks ; i++) {
177 177
         nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
178
-        s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
178
+        s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
179 179
         *data_size += NELLY_SAMPLES*sizeof(int16_t);
180 180
     }
181 181
 
... ...
@@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT)             += ppc/fft_altivec.o             \
21 21
 OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o         \
22 22
                                           ppc/fdct_altivec.o            \
23 23
                                           ppc/float_altivec.o           \
24
+                                          ppc/fmtconvert_altivec.o      \
24 25
                                           ppc/gmc_altivec.o             \
25 26
                                           ppc/idct_altivec.o            \
26 27
                                           ppc/int_altivec.o             \
... ...
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
122 122
     }
123 123
 }
124 124
 
125
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
126
-{
127
-    union {
128
-        vector float v;
129
-        float s[4];
130
-    } mul_u;
131
-    int i;
132
-    vector float src1, src2, dst1, dst2, mul_v, zero;
133
-
134
-    zero = (vector float)vec_splat_u32(0);
135
-    mul_u.s[0] = mul;
136
-    mul_v = vec_splat(mul_u.v, 0);
137
-
138
-    for(i=0; i<len; i+=8) {
139
-        src1 = vec_ctf(vec_ld(0,  src+i), 0);
140
-        src2 = vec_ctf(vec_ld(16, src+i), 0);
141
-        dst1 = vec_madd(src1, mul_v, zero);
142
-        dst2 = vec_madd(src2, mul_v, zero);
143
-        vec_st(dst1,  0, dst+i);
144
-        vec_st(dst2, 16, dst+i);
145
-    }
146
-}
147
-
148
-
149
-static vector signed short
150
-float_to_int16_one_altivec(const float *src)
151
-{
152
-    vector float s0 = vec_ld(0, src);
153
-    vector float s1 = vec_ld(16, src);
154
-    vector signed int t0 = vec_cts(s0, 0);
155
-    vector signed int t1 = vec_cts(s1, 0);
156
-    return vec_packs(t0,t1);
157
-}
158
-
159
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
160
-{
161
-    int i;
162
-    vector signed short d0, d1, d;
163
-    vector unsigned char align;
164
-    if(((long)dst)&15) //FIXME
165
-    for(i=0; i<len-7; i+=8) {
166
-        d0 = vec_ld(0, dst+i);
167
-        d = float_to_int16_one_altivec(src+i);
168
-        d1 = vec_ld(15, dst+i);
169
-        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
170
-        align = vec_lvsr(0, dst+i);
171
-        d0 = vec_perm(d1, d, align);
172
-        d1 = vec_perm(d, d1, align);
173
-        vec_st(d0, 0, dst+i);
174
-        vec_st(d1,15, dst+i);
175
-    }
176
-    else
177
-    for(i=0; i<len-7; i+=8) {
178
-        d = float_to_int16_one_altivec(src+i);
179
-        vec_st(d, 0, dst+i);
180
-    }
181
-}
182
-
183
-static void
184
-float_to_int16_interleave_altivec(int16_t *dst, const float **src,
185
-                                  long len, int channels)
186
-{
187
-    int i;
188
-    vector signed short d0, d1, d2, c0, c1, t0, t1;
189
-    vector unsigned char align;
190
-    if(channels == 1)
191
-        float_to_int16_altivec(dst, src[0], len);
192
-    else
193
-        if (channels == 2) {
194
-        if(((long)dst)&15)
195
-        for(i=0; i<len-7; i+=8) {
196
-            d0 = vec_ld(0, dst + i);
197
-            t0 = float_to_int16_one_altivec(src[0] + i);
198
-            d1 = vec_ld(31, dst + i);
199
-            t1 = float_to_int16_one_altivec(src[1] + i);
200
-            c0 = vec_mergeh(t0, t1);
201
-            c1 = vec_mergel(t0, t1);
202
-            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
203
-            align = vec_lvsr(0, dst + i);
204
-            d0 = vec_perm(d2, c0, align);
205
-            d1 = vec_perm(c0, c1, align);
206
-            vec_st(d0,  0, dst + i);
207
-            d0 = vec_perm(c1, d2, align);
208
-            vec_st(d1, 15, dst + i);
209
-            vec_st(d0, 31, dst + i);
210
-            dst+=8;
211
-        }
212
-        else
213
-        for(i=0; i<len-7; i+=8) {
214
-            t0 = float_to_int16_one_altivec(src[0] + i);
215
-            t1 = float_to_int16_one_altivec(src[1] + i);
216
-            d0 = vec_mergeh(t0, t1);
217
-            d1 = vec_mergel(t0, t1);
218
-            vec_st(d0,  0, dst + i);
219
-            vec_st(d1, 16, dst + i);
220
-            dst+=8;
221
-        }
222
-    } else {
223
-        DECLARE_ALIGNED(16, int16_t, tmp)[len];
224
-        int c, j;
225
-        for (c = 0; c < channels; c++) {
226
-            float_to_int16_altivec(tmp, src[c], len);
227
-            for (i = 0, j = c; i < len; i++, j+=channels) {
228
-                dst[j] = tmp[i];
229
-            }
230
-        }
231
-   }
232
-}
233
-
234 125
 void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
235 126
 {
236 127
     c->vector_fmul = vector_fmul_altivec;
237 128
     c->vector_fmul_reverse = vector_fmul_reverse_altivec;
238 129
     c->vector_fmul_add = vector_fmul_add_altivec;
239
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
240 130
     if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
241 131
         c->vector_fmul_window = vector_fmul_window_altivec;
242
-        c->float_to_int16 = float_to_int16_altivec;
243
-        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
244 132
     }
245 133
 }
246 134
new file mode 100644
... ...
@@ -0,0 +1,142 @@
0
+/*
1
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavcodec/fmtconvert.h"
21
+
22
+#include "dsputil_altivec.h"
23
+#include "util_altivec.h"
24
+
25
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
26
+{
27
+    union {
28
+        vector float v;
29
+        float s[4];
30
+    } mul_u;
31
+    int i;
32
+    vector float src1, src2, dst1, dst2, mul_v, zero;
33
+
34
+    zero = (vector float)vec_splat_u32(0);
35
+    mul_u.s[0] = mul;
36
+    mul_v = vec_splat(mul_u.v, 0);
37
+
38
+    for(i=0; i<len; i+=8) {
39
+        src1 = vec_ctf(vec_ld(0,  src+i), 0);
40
+        src2 = vec_ctf(vec_ld(16, src+i), 0);
41
+        dst1 = vec_madd(src1, mul_v, zero);
42
+        dst2 = vec_madd(src2, mul_v, zero);
43
+        vec_st(dst1,  0, dst+i);
44
+        vec_st(dst2, 16, dst+i);
45
+    }
46
+}
47
+
48
+
49
+static vector signed short
50
+float_to_int16_one_altivec(const float *src)
51
+{
52
+    vector float s0 = vec_ld(0, src);
53
+    vector float s1 = vec_ld(16, src);
54
+    vector signed int t0 = vec_cts(s0, 0);
55
+    vector signed int t1 = vec_cts(s1, 0);
56
+    return vec_packs(t0,t1);
57
+}
58
+
59
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
60
+{
61
+    int i;
62
+    vector signed short d0, d1, d;
63
+    vector unsigned char align;
64
+    if(((long)dst)&15) //FIXME
65
+    for(i=0; i<len-7; i+=8) {
66
+        d0 = vec_ld(0, dst+i);
67
+        d = float_to_int16_one_altivec(src+i);
68
+        d1 = vec_ld(15, dst+i);
69
+        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
70
+        align = vec_lvsr(0, dst+i);
71
+        d0 = vec_perm(d1, d, align);
72
+        d1 = vec_perm(d, d1, align);
73
+        vec_st(d0, 0, dst+i);
74
+        vec_st(d1,15, dst+i);
75
+    }
76
+    else
77
+    for(i=0; i<len-7; i+=8) {
78
+        d = float_to_int16_one_altivec(src+i);
79
+        vec_st(d, 0, dst+i);
80
+    }
81
+}
82
+
83
+static void
84
+float_to_int16_interleave_altivec(int16_t *dst, const float **src,
85
+                                  long len, int channels)
86
+{
87
+    int i;
88
+    vector signed short d0, d1, d2, c0, c1, t0, t1;
89
+    vector unsigned char align;
90
+    if(channels == 1)
91
+        float_to_int16_altivec(dst, src[0], len);
92
+    else
93
+        if (channels == 2) {
94
+        if(((long)dst)&15)
95
+        for(i=0; i<len-7; i+=8) {
96
+            d0 = vec_ld(0, dst + i);
97
+            t0 = float_to_int16_one_altivec(src[0] + i);
98
+            d1 = vec_ld(31, dst + i);
99
+            t1 = float_to_int16_one_altivec(src[1] + i);
100
+            c0 = vec_mergeh(t0, t1);
101
+            c1 = vec_mergel(t0, t1);
102
+            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
103
+            align = vec_lvsr(0, dst + i);
104
+            d0 = vec_perm(d2, c0, align);
105
+            d1 = vec_perm(c0, c1, align);
106
+            vec_st(d0,  0, dst + i);
107
+            d0 = vec_perm(c1, d2, align);
108
+            vec_st(d1, 15, dst + i);
109
+            vec_st(d0, 31, dst + i);
110
+            dst+=8;
111
+        }
112
+        else
113
+        for(i=0; i<len-7; i+=8) {
114
+            t0 = float_to_int16_one_altivec(src[0] + i);
115
+            t1 = float_to_int16_one_altivec(src[1] + i);
116
+            d0 = vec_mergeh(t0, t1);
117
+            d1 = vec_mergel(t0, t1);
118
+            vec_st(d0,  0, dst + i);
119
+            vec_st(d1, 16, dst + i);
120
+            dst+=8;
121
+        }
122
+    } else {
123
+        DECLARE_ALIGNED(16, int16_t, tmp)[len];
124
+        int c, j;
125
+        for (c = 0; c < channels; c++) {
126
+            float_to_int16_altivec(tmp, src[c], len);
127
+            for (i = 0, j = c; i < len; i++, j+=channels) {
128
+                dst[j] = tmp[i];
129
+            }
130
+        }
131
+   }
132
+}
133
+
134
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
135
+{
136
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
137
+    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
138
+        c->float_to_int16 = float_to_int16_altivec;
139
+        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
140
+    }
141
+}
... ...
@@ -31,6 +31,7 @@
31 31
 #include "get_bits.h"
32 32
 #include "dsputil.h"
33 33
 #include "fft.h"
34
+#include "fmtconvert.h"
34 35
 
35 36
 #include "vorbis.h"
36 37
 #include "xiph.h"
... ...
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
127 127
     AVCodecContext *avccontext;
128 128
     GetBitContext gb;
129 129
     DSPContext dsp;
130
+    FmtConvertContext fmt_conv;
130 131
 
131 132
     FFTContext mdct[2];
132 133
     uint_fast8_t  first_frame;
... ...
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
961 961
 
962 962
     vc->avccontext = avccontext;
963 963
     dsputil_init(&vc->dsp, avccontext);
964
+    ff_fmt_convert_init(&vc->fmt_conv, avccontext);
964 965
 
965 966
     vc->scale_bias = 32768.0f;
966 967
 
... ...
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
1636 1636
                               len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
1637 1637
     }
1638 1638
 
1639
-    vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
1639
+    vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
1640
+                                           vc->audio_channels);
1640 1641
     *data_size = len * 2 * vc->audio_channels;
1641 1642
 
1642 1643
     return buf_size ;
... ...
@@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
126 126
     s->block_align = avctx->block_align;
127 127
 
128 128
     dsputil_init(&s->dsp, avctx);
129
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
129 130
 
130 131
     if (avctx->codec->id == CODEC_ID_WMAV1) {
131 132
         s->version = 1;
... ...
@@ -26,6 +26,7 @@
26 26
 #include "put_bits.h"
27 27
 #include "dsputil.h"
28 28
 #include "fft.h"
29
+#include "fmtconvert.h"
29 30
 
30 31
 /* size of blocks */
31 32
 #define BLOCK_MIN_BITS 7
... ...
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
134 134
     float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
135 135
     float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
136 136
     DSPContext dsp;
137
+    FmtConvertContext fmt_conv;
137 138
 
138 139
 #ifdef TRACE
139 140
     int frame_count;
... ...
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
791 791
     incr = s->nb_channels;
792 792
     for (ch = 0; ch < MAX_CHANNELS; ch++)
793 793
         output[ch] = s->frame_out[ch];
794
-    s->dsp.float_to_int16_interleave(samples, output, n, incr);
794
+    s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
795 795
     for (ch = 0; ch < incr; ch++) {
796 796
         /* prepare for next block */
797 797
         memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
... ...
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
39 39
 MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
40 40
 MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
41 41
                                           x86/deinterlace.o             \
42
+                                          x86/fmtconvert.o              \
42 43
                                           x86/h264_chromamc.o           \
43 44
                                           $(YASM-OBJS-yes)
44 45
 
... ...
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
47 47
 OBJS-$(HAVE_MMX)                       += x86/dnxhd_mmx.o               \
48 48
                                           x86/dsputil_mmx.o             \
49 49
                                           x86/fdct_mmx.o                \
50
+                                          x86/fmtconvert_mmx.o          \
50 51
                                           x86/idct_mmx_xvid.o           \
51 52
                                           x86/idct_sse2_xvid.o          \
52 53
                                           x86/motion_est_mmx.o          \
... ...
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
2349 2349
 }
2350 2350
 #endif /* HAVE_6REGS */
2351 2351
 
2352
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2353
-{
2354
-    x86_reg i = -4*len;
2355
-    __asm__ volatile(
2356
-        "movss  %3, %%xmm4 \n"
2357
-        "shufps $0, %%xmm4, %%xmm4 \n"
2358
-        "1: \n"
2359
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
2360
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
2361
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
2362
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
2363
-        "movlhps  %%xmm1,    %%xmm0 \n"
2364
-        "movlhps  %%xmm3,    %%xmm2 \n"
2365
-        "mulps    %%xmm4,    %%xmm0 \n"
2366
-        "mulps    %%xmm4,    %%xmm2 \n"
2367
-        "movaps   %%xmm0,   (%1,%0) \n"
2368
-        "movaps   %%xmm2, 16(%1,%0) \n"
2369
-        "add $32, %0 \n"
2370
-        "jl 1b \n"
2371
-        :"+r"(i)
2372
-        :"r"(dst+len), "r"(src+len), "m"(mul)
2373
-    );
2374
-}
2375
-
2376
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
2377
-{
2378
-    x86_reg i = -4*len;
2379
-    __asm__ volatile(
2380
-        "movss  %3, %%xmm4 \n"
2381
-        "shufps $0, %%xmm4, %%xmm4 \n"
2382
-        "1: \n"
2383
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
2384
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
2385
-        "mulps    %%xmm4,    %%xmm0 \n"
2386
-        "mulps    %%xmm4,    %%xmm1 \n"
2387
-        "movaps   %%xmm0,   (%1,%0) \n"
2388
-        "movaps   %%xmm1, 16(%1,%0) \n"
2389
-        "add $32, %0 \n"
2390
-        "jl 1b \n"
2391
-        :"+r"(i)
2392
-        :"r"(dst+len), "r"(src+len), "m"(mul)
2393
-    );
2394
-}
2395
-
2396 2352
 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2397 2353
                              int len)
2398 2354
 {
... ...
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2427 2427
     );
2428 2428
 }
2429 2429
 
2430
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2431
-    x86_reg reglen = len;
2432
-    // not bit-exact: pf2id uses different rounding than C and SSE
2433
-    __asm__ volatile(
2434
-        "add        %0          , %0        \n\t"
2435
-        "lea         (%2,%0,2)  , %2        \n\t"
2436
-        "add        %0          , %1        \n\t"
2437
-        "neg        %0                      \n\t"
2438
-        "1:                                 \n\t"
2439
-        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
2440
-        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
2441
-        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
2442
-        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
2443
-        "packssdw   %%mm1       , %%mm0     \n\t"
2444
-        "packssdw   %%mm3       , %%mm2     \n\t"
2445
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
2446
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
2447
-        "add        $16         , %0        \n\t"
2448
-        " js 1b                             \n\t"
2449
-        "femms                              \n\t"
2450
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2451
-    );
2452
-}
2453
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
2454
-    x86_reg reglen = len;
2455
-    __asm__ volatile(
2456
-        "add        %0          , %0        \n\t"
2457
-        "lea         (%2,%0,2)  , %2        \n\t"
2458
-        "add        %0          , %1        \n\t"
2459
-        "neg        %0                      \n\t"
2460
-        "1:                                 \n\t"
2461
-        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
2462
-        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
2463
-        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
2464
-        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
2465
-        "packssdw   %%mm1       , %%mm0     \n\t"
2466
-        "packssdw   %%mm3       , %%mm2     \n\t"
2467
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
2468
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
2469
-        "add        $16         , %0        \n\t"
2470
-        " js 1b                             \n\t"
2471
-        "emms                               \n\t"
2472
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2473
-    );
2474
-}
2475
-
2476
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
2477
-    x86_reg reglen = len;
2478
-    __asm__ volatile(
2479
-        "add        %0          , %0        \n\t"
2480
-        "lea         (%2,%0,2)  , %2        \n\t"
2481
-        "add        %0          , %1        \n\t"
2482
-        "neg        %0                      \n\t"
2483
-        "1:                                 \n\t"
2484
-        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
2485
-        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
2486
-        "packssdw   %%xmm1      , %%xmm0    \n\t"
2487
-        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
2488
-        "add        $16         , %0        \n\t"
2489
-        " js 1b                             \n\t"
2490
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2491
-    );
2492
-}
2493
-
2494 2430
 void ff_vp3_idct_mmx(int16_t *input_data);
2495 2431
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2496 2432
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
... ...
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
2504 2504
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2505 2505
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2506 2506
 
2507
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
2508
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
2509
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
2510 2507
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2511 2508
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2512 2509
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
... ...
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
2516 2516
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2517 2517
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2518 2518
 
2519
-#if !HAVE_YASM
2520
-#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
2521
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
2522
-#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
2523
-#endif
2524
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
2525
-
2526
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
2527
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
2528
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
2529
-    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
2530
-    int i,j,c;\
2531
-    for(c=0; c<channels; c++){\
2532
-        float_to_int16_##cpu(tmp, src[c], len);\
2533
-        for(i=0, j=c; i<len; i++, j+=channels)\
2534
-            dst[j] = tmp[i];\
2535
-    }\
2536
-}\
2537
-\
2538
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
2539
-    if(channels==1)\
2540
-        float_to_int16_##cpu(dst, src[0], len);\
2541
-    else if(channels==2){\
2542
-        x86_reg reglen = len; \
2543
-        const float *src0 = src[0];\
2544
-        const float *src1 = src[1];\
2545
-        __asm__ volatile(\
2546
-            "shl $2, %0 \n"\
2547
-            "add %0, %1 \n"\
2548
-            "add %0, %2 \n"\
2549
-            "add %0, %3 \n"\
2550
-            "neg %0 \n"\
2551
-            body\
2552
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
2553
-        );\
2554
-    }else if(channels==6){\
2555
-        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
2556
-    }else\
2557
-        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
2558
-}
2559
-
2560
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
2561
-    "1:                         \n"
2562
-    "pf2id     (%2,%0), %%mm0   \n"
2563
-    "pf2id    8(%2,%0), %%mm1   \n"
2564
-    "pf2id     (%3,%0), %%mm2   \n"
2565
-    "pf2id    8(%3,%0), %%mm3   \n"
2566
-    "packssdw    %%mm1, %%mm0   \n"
2567
-    "packssdw    %%mm3, %%mm2   \n"
2568
-    "movq        %%mm0, %%mm1   \n"
2569
-    "punpcklwd   %%mm2, %%mm0   \n"
2570
-    "punpckhwd   %%mm2, %%mm1   \n"
2571
-    "movq        %%mm0,  (%1,%0)\n"
2572
-    "movq        %%mm1, 8(%1,%0)\n"
2573
-    "add $16, %0                \n"
2574
-    "js 1b                      \n"
2575
-    "femms                      \n"
2576
-)
2577
-
2578
-FLOAT_TO_INT16_INTERLEAVE(sse,
2579
-    "1:                         \n"
2580
-    "cvtps2pi  (%2,%0), %%mm0   \n"
2581
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
2582
-    "cvtps2pi  (%3,%0), %%mm2   \n"
2583
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
2584
-    "packssdw    %%mm1, %%mm0   \n"
2585
-    "packssdw    %%mm3, %%mm2   \n"
2586
-    "movq        %%mm0, %%mm1   \n"
2587
-    "punpcklwd   %%mm2, %%mm0   \n"
2588
-    "punpckhwd   %%mm2, %%mm1   \n"
2589
-    "movq        %%mm0,  (%1,%0)\n"
2590
-    "movq        %%mm1, 8(%1,%0)\n"
2591
-    "add $16, %0                \n"
2592
-    "js 1b                      \n"
2593
-    "emms                       \n"
2594
-)
2595
-
2596
-FLOAT_TO_INT16_INTERLEAVE(sse2,
2597
-    "1:                         \n"
2598
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
2599
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
2600
-    "packssdw   %%xmm1, %%xmm0  \n"
2601
-    "movhlps    %%xmm0, %%xmm1  \n"
2602
-    "punpcklwd  %%xmm1, %%xmm0  \n"
2603
-    "movdqa     %%xmm0, (%1,%0) \n"
2604
-    "add $16, %0                \n"
2605
-    "js 1b                      \n"
2606
-)
2607
-
2608
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
2609
-    if(channels==6)
2610
-        ff_float_to_int16_interleave6_3dn2(dst, src, len);
2611
-    else
2612
-        float_to_int16_interleave_3dnow(dst, src, len, channels);
2613
-}
2614
-
2615 2519
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2616 2520
 
2617 2521
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
... ...
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2968 2968
         if(mm_flags & AV_CPU_FLAG_3DNOW){
2969 2969
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2970 2970
             c->vector_fmul = vector_fmul_3dnow;
2971
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2972
-                c->float_to_int16 = float_to_int16_3dnow;
2973
-                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2974
-            }
2975 2971
         }
2976 2972
         if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2977 2973
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2978 2974
 #if HAVE_6REGS
2979 2975
             c->vector_fmul_window = vector_fmul_window_3dnow2;
2980 2976
 #endif
2981
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2982
-                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
2983
-            }
2984 2977
         }
2985 2978
         if(mm_flags & AV_CPU_FLAG_MMX2){
2986 2979
 #if HAVE_YASM
... ...
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2997 2997
 #if HAVE_6REGS
2998 2998
             c->vector_fmul_window = vector_fmul_window_sse;
2999 2999
 #endif
3000
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3001 3000
             c->vector_clipf = vector_clipf_sse;
3002
-            c->float_to_int16 = float_to_int16_sse;
3003
-            c->float_to_int16_interleave = float_to_int16_interleave_sse;
3004 3001
 #if HAVE_YASM
3005 3002
             c->scalarproduct_float = ff_scalarproduct_float_sse;
3006 3003
 #endif
... ...
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3008 3008
         if(mm_flags & AV_CPU_FLAG_3DNOW)
3009 3009
             c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
3010 3010
         if(mm_flags & AV_CPU_FLAG_SSE2){
3011
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3012
-            c->float_to_int16 = float_to_int16_sse2;
3013
-            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3014 3011
 #if HAVE_YASM
3015 3012
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3016 3013
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
... ...
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30 30
 
31 31
 section .text align=16
32 32
 
33
-%macro PSWAPD_SSE 2
34
-    pshufw %1, %2, 0x4e
35
-%endmacro
36
-%macro PSWAPD_3DN1 2
37
-    movq  %1, %2
38
-    psrlq %1, 32
39
-    punpckldq %1, %2
40
-%endmacro
41
-
42
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
43
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
44
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
45
-%ifdef ARCH_X86_64
46
-    %define lend r10d
47
-    mov     lend, r2d
48
-%else
49
-    %define lend dword r2m
50
-%endif
51
-    mov src1q, [srcq+1*gprsize]
52
-    mov src2q, [srcq+2*gprsize]
53
-    mov src3q, [srcq+3*gprsize]
54
-    mov src4q, [srcq+4*gprsize]
55
-    mov src5q, [srcq+5*gprsize]
56
-    mov srcq,  [srcq]
57
-    sub src1q, srcq
58
-    sub src2q, srcq
59
-    sub src3q, srcq
60
-    sub src4q, srcq
61
-    sub src5q, srcq
62
-.loop:
63
-    cvtps2pi   mm0, [srcq]
64
-    cvtps2pi   mm1, [srcq+src1q]
65
-    cvtps2pi   mm2, [srcq+src2q]
66
-    cvtps2pi   mm3, [srcq+src3q]
67
-    cvtps2pi   mm4, [srcq+src4q]
68
-    cvtps2pi   mm5, [srcq+src5q]
69
-    packssdw   mm0, mm3
70
-    packssdw   mm1, mm4
71
-    packssdw   mm2, mm5
72
-    pswapd     mm3, mm0
73
-    punpcklwd  mm0, mm1
74
-    punpckhwd  mm1, mm2
75
-    punpcklwd  mm2, mm3
76
-    pswapd     mm3, mm0
77
-    punpckldq  mm0, mm2
78
-    punpckhdq  mm2, mm1
79
-    punpckldq  mm1, mm3
80
-    movq [dstq   ], mm0
81
-    movq [dstq+16], mm2
82
-    movq [dstq+ 8], mm1
83
-    add srcq, 8
84
-    add dstq, 24
85
-    sub lend, 2
86
-    jg .loop
87
-    emms
88
-    RET
89
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
90
-
91
-%define pswapd PSWAPD_SSE
92
-FLOAT_TO_INT16_INTERLEAVE6 sse
93
-%define cvtps2pi pf2id
94
-%define pswapd PSWAPD_3DN1
95
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
96
-%undef pswapd
97
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
98
-%undef cvtps2pi
99
-
100
-
101
-
102 33
 %macro SCALARPRODUCT 1
103 34
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
104 35
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
105 36
new file mode 100644
... ...
@@ -0,0 +1,91 @@
0
+;******************************************************************************
1
+;* x86 optimized Format Conversion Utils
2
+;* Copyright (c) 2008 Loren Merritt
3
+;*
4
+;* This file is part of FFmpeg.
5
+;*
6
+;* FFmpeg is free software; you can redistribute it and/or
7
+;* modify it under the terms of the GNU Lesser General Public
8
+;* License as published by the Free Software Foundation; either
9
+;* version 2.1 of the License, or (at your option) any later version.
10
+;*
11
+;* FFmpeg is distributed in the hope that it will be useful,
12
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+;* Lesser General Public License for more details.
15
+;*
16
+;* You should have received a copy of the GNU Lesser General Public
17
+;* License along with FFmpeg; if not, write to the Free Software
18
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+;******************************************************************************
20
+
21
+%include "x86inc.asm"
22
+
23
+section .text align=16
24
+
25
+%macro PSWAPD_SSE 2
26
+    pshufw %1, %2, 0x4e
27
+%endmacro
28
+%macro PSWAPD_3DN1 2
29
+    movq  %1, %2
30
+    psrlq %1, 32
31
+    punpckldq %1, %2
32
+%endmacro
33
+
34
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
35
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
36
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
37
+%ifdef ARCH_X86_64
38
+    %define lend r10d
39
+    mov     lend, r2d
40
+%else
41
+    %define lend dword r2m
42
+%endif
43
+    mov src1q, [srcq+1*gprsize]
44
+    mov src2q, [srcq+2*gprsize]
45
+    mov src3q, [srcq+3*gprsize]
46
+    mov src4q, [srcq+4*gprsize]
47
+    mov src5q, [srcq+5*gprsize]
48
+    mov srcq,  [srcq]
49
+    sub src1q, srcq
50
+    sub src2q, srcq
51
+    sub src3q, srcq
52
+    sub src4q, srcq
53
+    sub src5q, srcq
54
+.loop:
55
+    cvtps2pi   mm0, [srcq]
56
+    cvtps2pi   mm1, [srcq+src1q]
57
+    cvtps2pi   mm2, [srcq+src2q]
58
+    cvtps2pi   mm3, [srcq+src3q]
59
+    cvtps2pi   mm4, [srcq+src4q]
60
+    cvtps2pi   mm5, [srcq+src5q]
61
+    packssdw   mm0, mm3
62
+    packssdw   mm1, mm4
63
+    packssdw   mm2, mm5
64
+    pswapd     mm3, mm0
65
+    punpcklwd  mm0, mm1
66
+    punpckhwd  mm1, mm2
67
+    punpcklwd  mm2, mm3
68
+    pswapd     mm3, mm0
69
+    punpckldq  mm0, mm2
70
+    punpckhdq  mm2, mm1
71
+    punpckldq  mm1, mm3
72
+    movq [dstq   ], mm0
73
+    movq [dstq+16], mm2
74
+    movq [dstq+ 8], mm1
75
+    add srcq, 8
76
+    add dstq, 24
77
+    sub lend, 2
78
+    jg .loop
79
+    emms
80
+    RET
81
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
82
+
83
+%define pswapd PSWAPD_SSE
84
+FLOAT_TO_INT16_INTERLEAVE6 sse
85
+%define cvtps2pi pf2id
86
+%define pswapd PSWAPD_3DN1
87
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
88
+%undef pswapd
89
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
90
+%undef cvtps2pi
0 91
new file mode 100644
... ...
@@ -0,0 +1,266 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ *
21
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
22
+ */
23
+
24
+#include "libavutil/cpu.h"
25
+#include "libavutil/x86_cpu.h"
26
+#include "libavcodec/fmtconvert.h"
27
+
28
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
29
+{
30
+    x86_reg i = -4*len;
31
+    __asm__ volatile(
32
+        "movss  %3, %%xmm4 \n"
33
+        "shufps $0, %%xmm4, %%xmm4 \n"
34
+        "1: \n"
35
+        "cvtpi2ps   (%2,%0), %%xmm0 \n"
36
+        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
37
+        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
38
+        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
39
+        "movlhps  %%xmm1,    %%xmm0 \n"
40
+        "movlhps  %%xmm3,    %%xmm2 \n"
41
+        "mulps    %%xmm4,    %%xmm0 \n"
42
+        "mulps    %%xmm4,    %%xmm2 \n"
43
+        "movaps   %%xmm0,   (%1,%0) \n"
44
+        "movaps   %%xmm2, 16(%1,%0) \n"
45
+        "add $32, %0 \n"
46
+        "jl 1b \n"
47
+        :"+r"(i)
48
+        :"r"(dst+len), "r"(src+len), "m"(mul)
49
+    );
50
+}
51
+
52
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
53
+{
54
+    x86_reg i = -4*len;
55
+    __asm__ volatile(
56
+        "movss  %3, %%xmm4 \n"
57
+        "shufps $0, %%xmm4, %%xmm4 \n"
58
+        "1: \n"
59
+        "cvtdq2ps   (%2,%0), %%xmm0 \n"
60
+        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
61
+        "mulps    %%xmm4,    %%xmm0 \n"
62
+        "mulps    %%xmm4,    %%xmm1 \n"
63
+        "movaps   %%xmm0,   (%1,%0) \n"
64
+        "movaps   %%xmm1, 16(%1,%0) \n"
65
+        "add $32, %0 \n"
66
+        "jl 1b \n"
67
+        :"+r"(i)
68
+        :"r"(dst+len), "r"(src+len), "m"(mul)
69
+    );
70
+}
71
+
72
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
73
+    x86_reg reglen = len;
74
+    // not bit-exact: pf2id uses different rounding than C and SSE
75
+    __asm__ volatile(
76
+        "add        %0          , %0        \n\t"
77
+        "lea         (%2,%0,2)  , %2        \n\t"
78
+        "add        %0          , %1        \n\t"
79
+        "neg        %0                      \n\t"
80
+        "1:                                 \n\t"
81
+        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
82
+        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
83
+        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
84
+        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
85
+        "packssdw   %%mm1       , %%mm0     \n\t"
86
+        "packssdw   %%mm3       , %%mm2     \n\t"
87
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
88
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
89
+        "add        $16         , %0        \n\t"
90
+        " js 1b                             \n\t"
91
+        "femms                              \n\t"
92
+        :"+r"(reglen), "+r"(dst), "+r"(src)
93
+    );
94
+}
95
+
96
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
97
+    x86_reg reglen = len;
98
+    __asm__ volatile(
99
+        "add        %0          , %0        \n\t"
100
+        "lea         (%2,%0,2)  , %2        \n\t"
101
+        "add        %0          , %1        \n\t"
102
+        "neg        %0                      \n\t"
103
+        "1:                                 \n\t"
104
+        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
105
+        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
106
+        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
107
+        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
108
+        "packssdw   %%mm1       , %%mm0     \n\t"
109
+        "packssdw   %%mm3       , %%mm2     \n\t"
110
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
111
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
112
+        "add        $16         , %0        \n\t"
113
+        " js 1b                             \n\t"
114
+        "emms                               \n\t"
115
+        :"+r"(reglen), "+r"(dst), "+r"(src)
116
+    );
117
+}
118
+
119
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
120
+    x86_reg reglen = len;
121
+    __asm__ volatile(
122
+        "add        %0          , %0        \n\t"
123
+        "lea         (%2,%0,2)  , %2        \n\t"
124
+        "add        %0          , %1        \n\t"
125
+        "neg        %0                      \n\t"
126
+        "1:                                 \n\t"
127
+        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
128
+        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
129
+        "packssdw   %%xmm1      , %%xmm0    \n\t"
130
+        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
131
+        "add        $16         , %0        \n\t"
132
+        " js 1b                             \n\t"
133
+        :"+r"(reglen), "+r"(dst), "+r"(src)
134
+    );
135
+}
136
+
137
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
138
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
139
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
140
+
141
+#if !HAVE_YASM
142
+#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
143
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
144
+#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
145
+#endif
146
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
147
+
148
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
149
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
150
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
151
+    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
152
+    int i,j,c;\
153
+    for(c=0; c<channels; c++){\
154
+        float_to_int16_##cpu(tmp, src[c], len);\
155
+        for(i=0, j=c; i<len; i++, j+=channels)\
156
+            dst[j] = tmp[i];\
157
+    }\
158
+}\
159
+\
160
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
161
+    if(channels==1)\
162
+        float_to_int16_##cpu(dst, src[0], len);\
163
+    else if(channels==2){\
164
+        x86_reg reglen = len; \
165
+        const float *src0 = src[0];\
166
+        const float *src1 = src[1];\
167
+        __asm__ volatile(\
168
+            "shl $2, %0 \n"\
169
+            "add %0, %1 \n"\
170
+            "add %0, %2 \n"\
171
+            "add %0, %3 \n"\
172
+            "neg %0 \n"\
173
+            body\
174
+            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
175
+        );\
176
+    }else if(channels==6){\
177
+        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
178
+    }else\
179
+        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
180
+}
181
+
182
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
183
+    "1:                         \n"
184
+    "pf2id     (%2,%0), %%mm0   \n"
185
+    "pf2id    8(%2,%0), %%mm1   \n"
186
+    "pf2id     (%3,%0), %%mm2   \n"
187
+    "pf2id    8(%3,%0), %%mm3   \n"
188
+    "packssdw    %%mm1, %%mm0   \n"
189
+    "packssdw    %%mm3, %%mm2   \n"
190
+    "movq        %%mm0, %%mm1   \n"
191
+    "punpcklwd   %%mm2, %%mm0   \n"
192
+    "punpckhwd   %%mm2, %%mm1   \n"
193
+    "movq        %%mm0,  (%1,%0)\n"
194
+    "movq        %%mm1, 8(%1,%0)\n"
195
+    "add $16, %0                \n"
196
+    "js 1b                      \n"
197
+    "femms                      \n"
198
+)
199
+
200
+FLOAT_TO_INT16_INTERLEAVE(sse,
201
+    "1:                         \n"
202
+    "cvtps2pi  (%2,%0), %%mm0   \n"
203
+    "cvtps2pi 8(%2,%0), %%mm1   \n"
204
+    "cvtps2pi  (%3,%0), %%mm2   \n"
205
+    "cvtps2pi 8(%3,%0), %%mm3   \n"
206
+    "packssdw    %%mm1, %%mm0   \n"
207
+    "packssdw    %%mm3, %%mm2   \n"
208
+    "movq        %%mm0, %%mm1   \n"
209
+    "punpcklwd   %%mm2, %%mm0   \n"
210
+    "punpckhwd   %%mm2, %%mm1   \n"
211
+    "movq        %%mm0,  (%1,%0)\n"
212
+    "movq        %%mm1, 8(%1,%0)\n"
213
+    "add $16, %0                \n"
214
+    "js 1b                      \n"
215
+    "emms                       \n"
216
+)
217
+
218
+FLOAT_TO_INT16_INTERLEAVE(sse2,
219
+    "1:                         \n"
220
+    "cvtps2dq  (%2,%0), %%xmm0  \n"
221
+    "cvtps2dq  (%3,%0), %%xmm1  \n"
222
+    "packssdw   %%xmm1, %%xmm0  \n"
223
+    "movhlps    %%xmm0, %%xmm1  \n"
224
+    "punpcklwd  %%xmm1, %%xmm0  \n"
225
+    "movdqa     %%xmm0, (%1,%0) \n"
226
+    "add $16, %0                \n"
227
+    "js 1b                      \n"
228
+)
229
+
230
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
231
+    if(channels==6)
232
+        ff_float_to_int16_interleave6_3dn2(dst, src, len);
233
+    else
234
+        float_to_int16_interleave_3dnow(dst, src, len, channels);
235
+}
236
+
237
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
238
+{
239
+    int mm_flags = av_get_cpu_flags();
240
+
241
+    if (mm_flags & AV_CPU_FLAG_MMX) {
242
+
243
+        if(mm_flags & AV_CPU_FLAG_3DNOW){
244
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
245
+                c->float_to_int16 = float_to_int16_3dnow;
246
+                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
247
+            }
248
+        }
249
+        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
250
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
251
+                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
252
+            }
253
+        }
254
+        if(mm_flags & AV_CPU_FLAG_SSE){
255
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
256
+            c->float_to_int16 = float_to_int16_sse;
257
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
258
+        }
259
+        if(mm_flags & AV_CPU_FLAG_SSE2){
260
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
261
+            c->float_to_int16 = float_to_int16_sse2;
262
+            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
263
+        }
264
+    }
265
+}
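Of the paths installed above, only the 3DNow ones are guarded by CODEC_FLAG_BITEXACT (the in-source comment notes that pf2id rounds differently from the C and SSE code), so the SSE/SSE2 conversions are expected to match the C reference bit for bit under the default rounding mode. A caller who wants to sanity-check whichever path was selected could do so roughly as in the sketch below; check_float_to_int16() is a hypothetical helper written for this illustration, not part of the patch.

#include <math.h>
#include <stdint.h>
#include <string.h>
#include "libavutil/mem.h"          /* DECLARE_ALIGNED */
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"

/* Compare the dispatched float_to_int16() against the scalar rule
 * (lrintf + av_clip_int16) on one aligned block of 8 samples.
 * Returns 1 on a bit-exact match; a mismatch is expected only for the
 * 3DNow path when CODEC_FLAG_BITEXACT is not set. */
static int check_float_to_int16(FmtConvertContext *c)
{
    DECLARE_ALIGNED(16, float,   src)[8] = { -1.25f, -0.5f, 0.25f, 0.75f,
                                             1000.5f, -40000.0f, 40000.0f, 0.0f };
    DECLARE_ALIGNED(16, int16_t, dst)[8];
    DECLARE_ALIGNED(16, int16_t, ref)[8];
    int i;

    for (i = 0; i < 8; i++)
        ref[i] = av_clip_int16(lrintf(src[i]));

    c->float_to_int16(dst, src, 8);

    return !memcmp(dst, ref, sizeof(ref));
}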