
Separate format conversion DSP functions from DSPContext.

This will be beneficial for use with the audio conversion API without
requiring it to depend on all of dsputil.

Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)

Justin Ruggles authored on 2011/01/31 00:06:46
Showing 32 changed files
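For context, a minimal sketch of how a decoder picks up the new API after this change. The context struct and function names below are hypothetical, for illustration only; ff_fmt_convert_init() and the FmtConvertContext function pointers are the ones introduced in the diff that follows.

#include "avcodec.h"
#include "fmtconvert.h"

/* Hypothetical decoder context, for illustration only. */
typedef struct {
    FmtConvertContext fmt_conv;  /* replaces the float/int16 conversion members of DSPContext */
} ExampleDecoderContext;

static av_cold int example_decode_init(AVCodecContext *avctx, ExampleDecoderContext *s)
{
    /* These function pointers used to be filled in by dsputil_init();
     * they now come from the dedicated format-conversion context. */
    ff_fmt_convert_init(&s->fmt_conv, avctx);
    return 0;
}

static void example_output_samples(ExampleDecoderContext *s, int16_t *out,
                                   const float **planar, int nb_samples, int channels)
{
    /* Interleave and convert planar float samples to int16_t,
     * as the aac, ac3, dca and bink decoders now do. */
    s->fmt_conv.float_to_int16_interleave(out, planar, nb_samples, channels);
}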
... ...
@@ -12,6 +12,7 @@ OBJS = allcodecs.o                                                      \
        bitstream_filter.o                                               \
        dsputil.o                                                        \
        faanidct.o                                                       \
+       fmtconvert.o                                                     \
        imgconvert.o                                                     \
        jrevdct.o                                                        \
        opt.o                                                            \
... ...
@@ -35,6 +35,7 @@
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
+#include "fmtconvert.h"
 
 #include <stdint.h>
 
... ...
@@ -268,6 +269,7 @@ typedef struct {
     FFTContext mdct;
     FFTContext mdct_small;
     DSPContext dsp;
+    FmtConvertContext fmt_conv;
     int random_state;
     /** @} */
 
... ...
@@ -85,6 +85,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 #include "lpc.h"
 
 #include "aac.h"
... ...
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
     ff_aac_sbr_init();
 
     dsputil_init(&ac->dsp, avctx);
+    ff_fmt_convert_init(&ac->fmt_conv, avctx);
 
     ac->random_state = 0x1f2e3d4c;
 
... ...
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
     *data_size = data_size_tmp;
 
     if (samples)
-        ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
+        ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
 
     if (ac->output_configured)
         ac->output_configured = OC_LOCKED;
... ...
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
     ff_kbd_window_init(s->window, 5.0, 256);
     dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
     av_lfg_init(&s->dith_state, 0);
 
     /* set scale value for float to int16 conversion */
... ...
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         } else {
             gain *= s->dynamic_range[0];
         }
-        s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+        s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
     }
 
     /* apply spectral extension to high frequency bins */
... ...
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
             av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
             err = 1;
         }
-        s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+        s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
         out_samples += 256 * s->out_channels;
     }
     *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
... ...
@@ -55,6 +55,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 
 /* override ac3.h to include coupling channel */
 #undef AC3_MAX_CHANNELS
... ...
@@ -190,6 +191,7 @@ typedef struct {
 
 ///@defgroup opt optimization
     DSPContext dsp;                         ///< for optimization
+    FmtConvertContext fmt_conv;             ///< optimized conversion functions
     float mul_bias;                         ///< scaling for float_to_int16 conversion
 ///@}
 
... ...
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS                                   += arm/dsputil_init_arm.o        \
                                           arm/dsputil_arm.o             \
                                           arm/fft_init_arm.o            \
+                                          arm/fmtconvert_init_arm.o     \
                                           arm/jrevdct_arm.o             \
                                           arm/mpegvideo_arm.o           \
                                           arm/simple_idct_arm.o         \
... ...
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                           arm/dsputil_armv6.o           \
                                           arm/simple_idct_armv6.o       \
 
+VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
+
 OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
                                           arm/dsputil_init_vfp.o        \
+                                          $(VFP-OBJS-yes)
 
 OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
                                           arm/mpegvideo_iwmmxt.o        \
... ...
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o            \
 
 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                           arm/dsputil_neon.o            \
+                                          arm/fmtconvert_neon.o         \
                                           arm/int_neon.o                \
                                           arm/mpegvideo_neon.o          \
                                           arm/simple_idct_neon.o        \
... ...
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
                               int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
-                                        float mul, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
                                  const float *src1, int len);
 void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
... ...
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
 
 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                           int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
 
... ...
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
     c->butterflies_float          = ff_butterflies_float_neon;
     c->scalarproduct_float        = ff_scalarproduct_float_neon;
-    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
     c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
     c->vector_fmul_add            = ff_vector_fmul_add_neon;
     c->vector_clipf               = ff_vector_clipf_neon;
... ...
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
     c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
 
-    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        c->float_to_int16            = ff_float_to_int16_neon;
-        c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
-    }
-
     if (CONFIG_VORBIS_DECODER)
         c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
 
... ...
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
25 25
                         const float *src1, int len);
26 26
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
27 27
                                 const float *src1, int len);
28
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
29 28
 
30 29
 void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
31 30
 {
32 31
     c->vector_fmul = ff_vector_fmul_vfp;
33 32
     c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
34
-#if HAVE_ARMV6
35
-    c->float_to_int16 = ff_float_to_int16_vfp;
36
-#endif
37 33
 }
... ...
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
400 400
         bx              lr
401 401
 endfunc
402 402
 
403
-function ff_float_to_int16_neon, export=1
404
-        subs            r2,  r2,  #8
405
-        vld1.64         {d0-d1},  [r1,:128]!
406
-        vcvt.s32.f32    q8,  q0,  #16
407
-        vld1.64         {d2-d3},  [r1,:128]!
408
-        vcvt.s32.f32    q9,  q1,  #16
409
-        beq             3f
410
-        bics            ip,  r2,  #15
411
-        beq             2f
412
-1:      subs            ip,  ip,  #16
413
-        vshrn.s32       d4,  q8,  #16
414
-        vld1.64         {d0-d1},  [r1,:128]!
415
-        vcvt.s32.f32    q0,  q0,  #16
416
-        vshrn.s32       d5,  q9,  #16
417
-        vld1.64         {d2-d3},  [r1,:128]!
418
-        vcvt.s32.f32    q1,  q1,  #16
419
-        vshrn.s32       d6,  q0,  #16
420
-        vst1.64         {d4-d5},  [r0,:128]!
421
-        vshrn.s32       d7,  q1,  #16
422
-        vld1.64         {d16-d17},[r1,:128]!
423
-        vcvt.s32.f32    q8,  q8,  #16
424
-        vld1.64         {d18-d19},[r1,:128]!
425
-        vcvt.s32.f32    q9,  q9,  #16
426
-        vst1.64         {d6-d7},  [r0,:128]!
427
-        bne             1b
428
-        ands            r2,  r2,  #15
429
-        beq             3f
430
-2:      vld1.64         {d0-d1},  [r1,:128]!
431
-        vshrn.s32       d4,  q8,  #16
432
-        vcvt.s32.f32    q0,  q0,  #16
433
-        vld1.64         {d2-d3},  [r1,:128]!
434
-        vshrn.s32       d5,  q9,  #16
435
-        vcvt.s32.f32    q1,  q1,  #16
436
-        vshrn.s32       d6,  q0,  #16
437
-        vst1.64         {d4-d5},  [r0,:128]!
438
-        vshrn.s32       d7,  q1,  #16
439
-        vst1.64         {d6-d7},  [r0,:128]!
440
-        bx              lr
441
-3:      vshrn.s32       d4,  q8,  #16
442
-        vshrn.s32       d5,  q9,  #16
443
-        vst1.64         {d4-d5},  [r0,:128]!
444
-        bx              lr
445
-endfunc
446
-
447
-function ff_float_to_int16_interleave_neon, export=1
448
-        cmp             r3, #2
449
-        ldrlt           r1, [r1]
450
-        blt             ff_float_to_int16_neon
451
-        bne             4f
452
-
453
-        ldr             r3, [r1]
454
-        ldr             r1, [r1, #4]
455
-
456
-        subs            r2,  r2,  #8
457
-        vld1.64         {d0-d1},  [r3,:128]!
458
-        vcvt.s32.f32    q8,  q0,  #16
459
-        vld1.64         {d2-d3},  [r3,:128]!
460
-        vcvt.s32.f32    q9,  q1,  #16
461
-        vld1.64         {d20-d21},[r1,:128]!
462
-        vcvt.s32.f32    q10, q10, #16
463
-        vld1.64         {d22-d23},[r1,:128]!
464
-        vcvt.s32.f32    q11, q11, #16
465
-        beq             3f
466
-        bics            ip,  r2,  #15
467
-        beq             2f
468
-1:      subs            ip,  ip,  #16
469
-        vld1.64         {d0-d1},  [r3,:128]!
470
-        vcvt.s32.f32    q0,  q0,  #16
471
-        vsri.32         q10, q8,  #16
472
-        vld1.64         {d2-d3},  [r3,:128]!
473
-        vcvt.s32.f32    q1,  q1,  #16
474
-        vld1.64         {d24-d25},[r1,:128]!
475
-        vcvt.s32.f32    q12, q12, #16
476
-        vld1.64         {d26-d27},[r1,:128]!
477
-        vsri.32         q11, q9,  #16
478
-        vst1.64         {d20-d21},[r0,:128]!
479
-        vcvt.s32.f32    q13, q13, #16
480
-        vst1.64         {d22-d23},[r0,:128]!
481
-        vsri.32         q12, q0,  #16
482
-        vld1.64         {d16-d17},[r3,:128]!
483
-        vsri.32         q13, q1,  #16
484
-        vst1.64         {d24-d25},[r0,:128]!
485
-        vcvt.s32.f32    q8,  q8,  #16
486
-        vld1.64         {d18-d19},[r3,:128]!
487
-        vcvt.s32.f32    q9,  q9,  #16
488
-        vld1.64         {d20-d21},[r1,:128]!
489
-        vcvt.s32.f32    q10, q10, #16
490
-        vld1.64         {d22-d23},[r1,:128]!
491
-        vcvt.s32.f32    q11, q11, #16
492
-        vst1.64         {d26-d27},[r0,:128]!
493
-        bne             1b
494
-        ands            r2,  r2,  #15
495
-        beq             3f
496
-2:      vsri.32         q10, q8,  #16
497
-        vld1.64         {d0-d1},  [r3,:128]!
498
-        vcvt.s32.f32    q0,  q0,  #16
499
-        vld1.64         {d2-d3},  [r3,:128]!
500
-        vcvt.s32.f32    q1,  q1,  #16
501
-        vld1.64         {d24-d25},[r1,:128]!
502
-        vcvt.s32.f32    q12, q12, #16
503
-        vsri.32         q11, q9,  #16
504
-        vld1.64         {d26-d27},[r1,:128]!
505
-        vcvt.s32.f32    q13, q13, #16
506
-        vst1.64         {d20-d21},[r0,:128]!
507
-        vsri.32         q12, q0,  #16
508
-        vst1.64         {d22-d23},[r0,:128]!
509
-        vsri.32         q13, q1,  #16
510
-        vst1.64         {d24-d27},[r0,:128]!
511
-        bx              lr
512
-3:      vsri.32         q10, q8,  #16
513
-        vsri.32         q11, q9,  #16
514
-        vst1.64         {d20-d23},[r0,:128]!
515
-        bx              lr
516
-
517
-4:      push            {r4-r8,lr}
518
-        cmp             r3,  #4
519
-        lsl             ip,  r3,  #1
520
-        blt             4f
521
-
522
-        @ 4 channels
523
-5:      ldmia           r1!, {r4-r7}
524
-        mov             lr,  r2
525
-        mov             r8,  r0
526
-        vld1.64         {d16-d17},[r4,:128]!
527
-        vcvt.s32.f32    q8,  q8,  #16
528
-        vld1.64         {d18-d19},[r5,:128]!
529
-        vcvt.s32.f32    q9,  q9,  #16
530
-        vld1.64         {d20-d21},[r6,:128]!
531
-        vcvt.s32.f32    q10, q10, #16
532
-        vld1.64         {d22-d23},[r7,:128]!
533
-        vcvt.s32.f32    q11, q11, #16
534
-6:      subs            lr,  lr,  #8
535
-        vld1.64         {d0-d1},  [r4,:128]!
536
-        vcvt.s32.f32    q0,  q0,  #16
537
-        vsri.32         q9,  q8,  #16
538
-        vld1.64         {d2-d3},  [r5,:128]!
539
-        vcvt.s32.f32    q1,  q1,  #16
540
-        vsri.32         q11, q10, #16
541
-        vld1.64         {d4-d5},  [r6,:128]!
542
-        vcvt.s32.f32    q2,  q2,  #16
543
-        vzip.32         d18, d22
544
-        vld1.64         {d6-d7},  [r7,:128]!
545
-        vcvt.s32.f32    q3,  q3,  #16
546
-        vzip.32         d19, d23
547
-        vst1.64         {d18},    [r8], ip
548
-        vsri.32         q1,  q0,  #16
549
-        vst1.64         {d22},    [r8], ip
550
-        vsri.32         q3,  q2,  #16
551
-        vst1.64         {d19},    [r8], ip
552
-        vzip.32         d2,  d6
553
-        vst1.64         {d23},    [r8], ip
554
-        vzip.32         d3,  d7
555
-        beq             7f
556
-        vld1.64         {d16-d17},[r4,:128]!
557
-        vcvt.s32.f32    q8,  q8,  #16
558
-        vst1.64         {d2},     [r8], ip
559
-        vld1.64         {d18-d19},[r5,:128]!
560
-        vcvt.s32.f32    q9,  q9,  #16
561
-        vst1.64         {d6},     [r8], ip
562
-        vld1.64         {d20-d21},[r6,:128]!
563
-        vcvt.s32.f32    q10, q10, #16
564
-        vst1.64         {d3},     [r8], ip
565
-        vld1.64         {d22-d23},[r7,:128]!
566
-        vcvt.s32.f32    q11, q11, #16
567
-        vst1.64         {d7},     [r8], ip
568
-        b               6b
569
-7:      vst1.64         {d2},     [r8], ip
570
-        vst1.64         {d6},     [r8], ip
571
-        vst1.64         {d3},     [r8], ip
572
-        vst1.64         {d7},     [r8], ip
573
-        subs            r3,  r3,  #4
574
-        popeq           {r4-r8,pc}
575
-        cmp             r3,  #4
576
-        add             r0,  r0,  #8
577
-        bge             5b
578
-
579
-        @ 2 channels
580
-4:      cmp             r3,  #2
581
-        blt             4f
582
-        ldmia           r1!, {r4-r5}
583
-        mov             lr,  r2
584
-        mov             r8,  r0
585
-        tst             lr,  #8
586
-        vld1.64         {d16-d17},[r4,:128]!
587
-        vcvt.s32.f32    q8,  q8,  #16
588
-        vld1.64         {d18-d19},[r5,:128]!
589
-        vcvt.s32.f32    q9,  q9,  #16
590
-        vld1.64         {d20-d21},[r4,:128]!
591
-        vcvt.s32.f32    q10, q10, #16
592
-        vld1.64         {d22-d23},[r5,:128]!
593
-        vcvt.s32.f32    q11, q11, #16
594
-        beq             6f
595
-        subs            lr,  lr,  #8
596
-        beq             7f
597
-        vsri.32         d18, d16, #16
598
-        vsri.32         d19, d17, #16
599
-        vld1.64         {d16-d17},[r4,:128]!
600
-        vcvt.s32.f32    q8,  q8,  #16
601
-        vst1.32         {d18[0]}, [r8], ip
602
-        vsri.32         d22, d20, #16
603
-        vst1.32         {d18[1]}, [r8], ip
604
-        vsri.32         d23, d21, #16
605
-        vst1.32         {d19[0]}, [r8], ip
606
-        vst1.32         {d19[1]}, [r8], ip
607
-        vld1.64         {d18-d19},[r5,:128]!
608
-        vcvt.s32.f32    q9,  q9,  #16
609
-        vst1.32         {d22[0]}, [r8], ip
610
-        vst1.32         {d22[1]}, [r8], ip
611
-        vld1.64         {d20-d21},[r4,:128]!
612
-        vcvt.s32.f32    q10, q10, #16
613
-        vst1.32         {d23[0]}, [r8], ip
614
-        vst1.32         {d23[1]}, [r8], ip
615
-        vld1.64         {d22-d23},[r5,:128]!
616
-        vcvt.s32.f32    q11, q11, #16
617
-6:      subs            lr,  lr,  #16
618
-        vld1.64         {d0-d1},  [r4,:128]!
619
-        vcvt.s32.f32    q0,  q0,  #16
620
-        vsri.32         d18, d16, #16
621
-        vld1.64         {d2-d3},  [r5,:128]!
622
-        vcvt.s32.f32    q1,  q1,  #16
623
-        vsri.32         d19, d17, #16
624
-        vld1.64         {d4-d5},  [r4,:128]!
625
-        vcvt.s32.f32    q2,  q2,  #16
626
-        vld1.64         {d6-d7},  [r5,:128]!
627
-        vcvt.s32.f32    q3,  q3,  #16
628
-        vst1.32         {d18[0]}, [r8], ip
629
-        vsri.32         d22, d20, #16
630
-        vst1.32         {d18[1]}, [r8], ip
631
-        vsri.32         d23, d21, #16
632
-        vst1.32         {d19[0]}, [r8], ip
633
-        vsri.32         d2,  d0,  #16
634
-        vst1.32         {d19[1]}, [r8], ip
635
-        vsri.32         d3,  d1,  #16
636
-        vst1.32         {d22[0]}, [r8], ip
637
-        vsri.32         d6,  d4,  #16
638
-        vst1.32         {d22[1]}, [r8], ip
639
-        vsri.32         d7,  d5,  #16
640
-        vst1.32         {d23[0]}, [r8], ip
641
-        vst1.32         {d23[1]}, [r8], ip
642
-        beq             6f
643
-        vld1.64         {d16-d17},[r4,:128]!
644
-        vcvt.s32.f32    q8,  q8,  #16
645
-        vst1.32         {d2[0]},  [r8], ip
646
-        vst1.32         {d2[1]},  [r8], ip
647
-        vld1.64         {d18-d19},[r5,:128]!
648
-        vcvt.s32.f32    q9,  q9,  #16
649
-        vst1.32         {d3[0]},  [r8], ip
650
-        vst1.32         {d3[1]},  [r8], ip
651
-        vld1.64         {d20-d21},[r4,:128]!
652
-        vcvt.s32.f32    q10, q10, #16
653
-        vst1.32         {d6[0]},  [r8], ip
654
-        vst1.32         {d6[1]},  [r8], ip
655
-        vld1.64         {d22-d23},[r5,:128]!
656
-        vcvt.s32.f32    q11, q11, #16
657
-        vst1.32         {d7[0]},  [r8], ip
658
-        vst1.32         {d7[1]},  [r8], ip
659
-        bgt             6b
660
-6:      vst1.32         {d2[0]},  [r8], ip
661
-        vst1.32         {d2[1]},  [r8], ip
662
-        vst1.32         {d3[0]},  [r8], ip
663
-        vst1.32         {d3[1]},  [r8], ip
664
-        vst1.32         {d6[0]},  [r8], ip
665
-        vst1.32         {d6[1]},  [r8], ip
666
-        vst1.32         {d7[0]},  [r8], ip
667
-        vst1.32         {d7[1]},  [r8], ip
668
-        b               8f
669
-7:      vsri.32         d18, d16, #16
670
-        vsri.32         d19, d17, #16
671
-        vst1.32         {d18[0]}, [r8], ip
672
-        vsri.32         d22, d20, #16
673
-        vst1.32         {d18[1]}, [r8], ip
674
-        vsri.32         d23, d21, #16
675
-        vst1.32         {d19[0]}, [r8], ip
676
-        vst1.32         {d19[1]}, [r8], ip
677
-        vst1.32         {d22[0]}, [r8], ip
678
-        vst1.32         {d22[1]}, [r8], ip
679
-        vst1.32         {d23[0]}, [r8], ip
680
-        vst1.32         {d23[1]}, [r8], ip
681
-8:      subs            r3,  r3,  #2
682
-        add             r0,  r0,  #4
683
-        popeq           {r4-r8,pc}
684
-
685
-        @ 1 channel
686
-4:      ldr             r4,  [r1],#4
687
-        tst             r2,  #8
688
-        mov             lr,  r2
689
-        mov             r5,  r0
690
-        vld1.64         {d0-d1},  [r4,:128]!
691
-        vcvt.s32.f32    q0,  q0,  #16
692
-        vld1.64         {d2-d3},  [r4,:128]!
693
-        vcvt.s32.f32    q1,  q1,  #16
694
-        bne             8f
695
-6:      subs            lr,  lr,  #16
696
-        vld1.64         {d4-d5},  [r4,:128]!
697
-        vcvt.s32.f32    q2,  q2,  #16
698
-        vld1.64         {d6-d7},  [r4,:128]!
699
-        vcvt.s32.f32    q3,  q3,  #16
700
-        vst1.16         {d0[1]},  [r5,:16], ip
701
-        vst1.16         {d0[3]},  [r5,:16], ip
702
-        vst1.16         {d1[1]},  [r5,:16], ip
703
-        vst1.16         {d1[3]},  [r5,:16], ip
704
-        vst1.16         {d2[1]},  [r5,:16], ip
705
-        vst1.16         {d2[3]},  [r5,:16], ip
706
-        vst1.16         {d3[1]},  [r5,:16], ip
707
-        vst1.16         {d3[3]},  [r5,:16], ip
708
-        beq             7f
709
-        vld1.64         {d0-d1},  [r4,:128]!
710
-        vcvt.s32.f32    q0,  q0,  #16
711
-        vld1.64         {d2-d3},  [r4,:128]!
712
-        vcvt.s32.f32    q1,  q1,  #16
713
-7:      vst1.16         {d4[1]},  [r5,:16], ip
714
-        vst1.16         {d4[3]},  [r5,:16], ip
715
-        vst1.16         {d5[1]},  [r5,:16], ip
716
-        vst1.16         {d5[3]},  [r5,:16], ip
717
-        vst1.16         {d6[1]},  [r5,:16], ip
718
-        vst1.16         {d6[3]},  [r5,:16], ip
719
-        vst1.16         {d7[1]},  [r5,:16], ip
720
-        vst1.16         {d7[3]},  [r5,:16], ip
721
-        bgt             6b
722
-        pop             {r4-r8,pc}
723
-8:      subs            lr,  lr,  #8
724
-        vst1.16         {d0[1]},  [r5,:16], ip
725
-        vst1.16         {d0[3]},  [r5,:16], ip
726
-        vst1.16         {d1[1]},  [r5,:16], ip
727
-        vst1.16         {d1[3]},  [r5,:16], ip
728
-        vst1.16         {d2[1]},  [r5,:16], ip
729
-        vst1.16         {d2[3]},  [r5,:16], ip
730
-        vst1.16         {d3[1]},  [r5,:16], ip
731
-        vst1.16         {d3[3]},  [r5,:16], ip
732
-        popeq           {r4-r8,pc}
733
-        vld1.64         {d0-d1},  [r4,:128]!
734
-        vcvt.s32.f32    q0,  q0,  #16
735
-        vld1.64         {d2-d3},  [r4,:128]!
736
-        vcvt.s32.f32    q1,  q1,  #16
737
-        b               6b
738
-endfunc
739
-
740 403
 function ff_vector_fmul_neon, export=1
741 404
         subs            r3,  r3,  #8
742 405
         vld1.64         {d0-d3},  [r1,:128]!
... ...
@@ -1050,34 +713,6 @@ NOVFP   vmov.32         r0,  d0[0]
1050 1050
         bx              lr
1051 1051
 endfunc
1052 1052
 
1053
-function ff_int32_to_float_fmul_scalar_neon, export=1
1054
-VFP     vdup.32         q0,  d0[0]
1055
-VFP     len     .req    r2
1056
-NOVFP   vdup.32         q0,  r2
1057
-NOVFP   len     .req    r3
1058
-
1059
-        vld1.32         {q1},[r1,:128]!
1060
-        vcvt.f32.s32    q3,  q1
1061
-        vld1.32         {q2},[r1,:128]!
1062
-        vcvt.f32.s32    q8,  q2
1063
-1:      subs            len, len, #8
1064
-        pld             [r1, #16]
1065
-        vmul.f32        q9,  q3,  q0
1066
-        vmul.f32        q10, q8,  q0
1067
-        beq             2f
1068
-        vld1.32         {q1},[r1,:128]!
1069
-        vcvt.f32.s32    q3,  q1
1070
-        vld1.32         {q2},[r1,:128]!
1071
-        vcvt.f32.s32    q8,  q2
1072
-        vst1.32         {q9}, [r0,:128]!
1073
-        vst1.32         {q10},[r0,:128]!
1074
-        b               1b
1075
-2:      vst1.32         {q9}, [r0,:128]!
1076
-        vst1.32         {q10},[r0,:128]!
1077
-        bx              lr
1078
-        .unreq  len
1079
-endfunc
1080
-
1081 1053
 function ff_vector_fmul_reverse_neon, export=1
1082 1054
         add             r2,  r2,  r3,  lsl #2
1083 1055
         sub             r2,  r2,  #32
... ...
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
131 131
         vpop            {d8-d15}
132 132
         bx              lr
133 133
 endfunc
134
-
135
-#if HAVE_ARMV6
136
-/**
137
- * ARM VFP optimized float to int16 conversion.
138
- * Assume that len is a positive number and is multiple of 8, destination
139
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
140
- * performance), little endian byte sex
141
- */
142
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
143
-function ff_float_to_int16_vfp, export=1
144
-        push            {r4-r8,lr}
145
-        vpush           {d8-d11}
146
-        vldmia          r1!, {s16-s23}
147
-        vcvt.s32.f32    s0,  s16
148
-        vcvt.s32.f32    s1,  s17
149
-        vcvt.s32.f32    s2,  s18
150
-        vcvt.s32.f32    s3,  s19
151
-        vcvt.s32.f32    s4,  s20
152
-        vcvt.s32.f32    s5,  s21
153
-        vcvt.s32.f32    s6,  s22
154
-        vcvt.s32.f32    s7,  s23
155
-1:
156
-        subs            r2,  r2,  #8
157
-        vmov            r3,  r4,  s0, s1
158
-        vmov            r5,  r6,  s2, s3
159
-        vmov            r7,  r8,  s4, s5
160
-        vmov            ip,  lr,  s6, s7
161
-        vldmiagt        r1!, {s16-s23}
162
-        ssat            r4,  #16, r4
163
-        ssat            r3,  #16, r3
164
-        ssat            r6,  #16, r6
165
-        ssat            r5,  #16, r5
166
-        pkhbt           r3,  r3,  r4, lsl #16
167
-        pkhbt           r4,  r5,  r6, lsl #16
168
-        vcvtgt.s32.f32  s0,  s16
169
-        vcvtgt.s32.f32  s1,  s17
170
-        vcvtgt.s32.f32  s2,  s18
171
-        vcvtgt.s32.f32  s3,  s19
172
-        vcvtgt.s32.f32  s4,  s20
173
-        vcvtgt.s32.f32  s5,  s21
174
-        vcvtgt.s32.f32  s6,  s22
175
-        vcvtgt.s32.f32  s7,  s23
176
-        ssat            r8,  #16, r8
177
-        ssat            r7,  #16, r7
178
-        ssat            lr,  #16, lr
179
-        ssat            ip,  #16, ip
180
-        pkhbt           r5,  r7,  r8, lsl #16
181
-        pkhbt           r6,  ip,  lr, lsl #16
182
-        stmia           r0!, {r3-r6}
183
-        bgt             1b
184
-
185
-        vpop            {d8-d11}
186
-        pop             {r4-r8,pc}
187
-endfunc
188
-#endif
189 134
new file mode 100644
... ...
@@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+                                        float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    if (HAVE_ARMVFP && HAVE_ARMV6) {
+        c->float_to_int16 = ff_float_to_int16_vfp;
+    }
+
+    if (HAVE_NEON) {
+        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->float_to_int16            = ff_float_to_int16_neon;
+            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+        }
+    }
+}
new file mode 100644
... ...
@@ -0,0 +1,391 @@
0
+/*
1
+ * ARM NEON optimised Format Conversion Utils
2
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with FFmpeg; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include "config.h"
22
+#include "asm.S"
23
+
24
+        preserve8
25
+        .text
26
+
27
+function ff_float_to_int16_neon, export=1
28
+        subs            r2,  r2,  #8
29
+        vld1.64         {d0-d1},  [r1,:128]!
30
+        vcvt.s32.f32    q8,  q0,  #16
31
+        vld1.64         {d2-d3},  [r1,:128]!
32
+        vcvt.s32.f32    q9,  q1,  #16
33
+        beq             3f
34
+        bics            ip,  r2,  #15
35
+        beq             2f
36
+1:      subs            ip,  ip,  #16
37
+        vshrn.s32       d4,  q8,  #16
38
+        vld1.64         {d0-d1},  [r1,:128]!
39
+        vcvt.s32.f32    q0,  q0,  #16
40
+        vshrn.s32       d5,  q9,  #16
41
+        vld1.64         {d2-d3},  [r1,:128]!
42
+        vcvt.s32.f32    q1,  q1,  #16
43
+        vshrn.s32       d6,  q0,  #16
44
+        vst1.64         {d4-d5},  [r0,:128]!
45
+        vshrn.s32       d7,  q1,  #16
46
+        vld1.64         {d16-d17},[r1,:128]!
47
+        vcvt.s32.f32    q8,  q8,  #16
48
+        vld1.64         {d18-d19},[r1,:128]!
49
+        vcvt.s32.f32    q9,  q9,  #16
50
+        vst1.64         {d6-d7},  [r0,:128]!
51
+        bne             1b
52
+        ands            r2,  r2,  #15
53
+        beq             3f
54
+2:      vld1.64         {d0-d1},  [r1,:128]!
55
+        vshrn.s32       d4,  q8,  #16
56
+        vcvt.s32.f32    q0,  q0,  #16
57
+        vld1.64         {d2-d3},  [r1,:128]!
58
+        vshrn.s32       d5,  q9,  #16
59
+        vcvt.s32.f32    q1,  q1,  #16
60
+        vshrn.s32       d6,  q0,  #16
61
+        vst1.64         {d4-d5},  [r0,:128]!
62
+        vshrn.s32       d7,  q1,  #16
63
+        vst1.64         {d6-d7},  [r0,:128]!
64
+        bx              lr
65
+3:      vshrn.s32       d4,  q8,  #16
66
+        vshrn.s32       d5,  q9,  #16
67
+        vst1.64         {d4-d5},  [r0,:128]!
68
+        bx              lr
69
+endfunc
70
+
71
+function ff_float_to_int16_interleave_neon, export=1
72
+        cmp             r3, #2
73
+        ldrlt           r1, [r1]
74
+        blt             ff_float_to_int16_neon
75
+        bne             4f
76
+
77
+        ldr             r3, [r1]
78
+        ldr             r1, [r1, #4]
79
+
80
+        subs            r2,  r2,  #8
81
+        vld1.64         {d0-d1},  [r3,:128]!
82
+        vcvt.s32.f32    q8,  q0,  #16
83
+        vld1.64         {d2-d3},  [r3,:128]!
84
+        vcvt.s32.f32    q9,  q1,  #16
85
+        vld1.64         {d20-d21},[r1,:128]!
86
+        vcvt.s32.f32    q10, q10, #16
87
+        vld1.64         {d22-d23},[r1,:128]!
88
+        vcvt.s32.f32    q11, q11, #16
89
+        beq             3f
90
+        bics            ip,  r2,  #15
91
+        beq             2f
92
+1:      subs            ip,  ip,  #16
93
+        vld1.64         {d0-d1},  [r3,:128]!
94
+        vcvt.s32.f32    q0,  q0,  #16
95
+        vsri.32         q10, q8,  #16
96
+        vld1.64         {d2-d3},  [r3,:128]!
97
+        vcvt.s32.f32    q1,  q1,  #16
98
+        vld1.64         {d24-d25},[r1,:128]!
99
+        vcvt.s32.f32    q12, q12, #16
100
+        vld1.64         {d26-d27},[r1,:128]!
101
+        vsri.32         q11, q9,  #16
102
+        vst1.64         {d20-d21},[r0,:128]!
103
+        vcvt.s32.f32    q13, q13, #16
104
+        vst1.64         {d22-d23},[r0,:128]!
105
+        vsri.32         q12, q0,  #16
106
+        vld1.64         {d16-d17},[r3,:128]!
107
+        vsri.32         q13, q1,  #16
108
+        vst1.64         {d24-d25},[r0,:128]!
109
+        vcvt.s32.f32    q8,  q8,  #16
110
+        vld1.64         {d18-d19},[r3,:128]!
111
+        vcvt.s32.f32    q9,  q9,  #16
112
+        vld1.64         {d20-d21},[r1,:128]!
113
+        vcvt.s32.f32    q10, q10, #16
114
+        vld1.64         {d22-d23},[r1,:128]!
115
+        vcvt.s32.f32    q11, q11, #16
116
+        vst1.64         {d26-d27},[r0,:128]!
117
+        bne             1b
118
+        ands            r2,  r2,  #15
119
+        beq             3f
120
+2:      vsri.32         q10, q8,  #16
121
+        vld1.64         {d0-d1},  [r3,:128]!
122
+        vcvt.s32.f32    q0,  q0,  #16
123
+        vld1.64         {d2-d3},  [r3,:128]!
124
+        vcvt.s32.f32    q1,  q1,  #16
125
+        vld1.64         {d24-d25},[r1,:128]!
126
+        vcvt.s32.f32    q12, q12, #16
127
+        vsri.32         q11, q9,  #16
128
+        vld1.64         {d26-d27},[r1,:128]!
129
+        vcvt.s32.f32    q13, q13, #16
130
+        vst1.64         {d20-d21},[r0,:128]!
131
+        vsri.32         q12, q0,  #16
132
+        vst1.64         {d22-d23},[r0,:128]!
133
+        vsri.32         q13, q1,  #16
134
+        vst1.64         {d24-d27},[r0,:128]!
135
+        bx              lr
136
+3:      vsri.32         q10, q8,  #16
137
+        vsri.32         q11, q9,  #16
138
+        vst1.64         {d20-d23},[r0,:128]!
139
+        bx              lr
140
+
141
+4:      push            {r4-r8,lr}
142
+        cmp             r3,  #4
143
+        lsl             ip,  r3,  #1
144
+        blt             4f
145
+
146
+        @ 4 channels
147
+5:      ldmia           r1!, {r4-r7}
148
+        mov             lr,  r2
149
+        mov             r8,  r0
150
+        vld1.64         {d16-d17},[r4,:128]!
151
+        vcvt.s32.f32    q8,  q8,  #16
152
+        vld1.64         {d18-d19},[r5,:128]!
153
+        vcvt.s32.f32    q9,  q9,  #16
154
+        vld1.64         {d20-d21},[r6,:128]!
155
+        vcvt.s32.f32    q10, q10, #16
156
+        vld1.64         {d22-d23},[r7,:128]!
157
+        vcvt.s32.f32    q11, q11, #16
158
+6:      subs            lr,  lr,  #8
159
+        vld1.64         {d0-d1},  [r4,:128]!
160
+        vcvt.s32.f32    q0,  q0,  #16
161
+        vsri.32         q9,  q8,  #16
162
+        vld1.64         {d2-d3},  [r5,:128]!
163
+        vcvt.s32.f32    q1,  q1,  #16
164
+        vsri.32         q11, q10, #16
165
+        vld1.64         {d4-d5},  [r6,:128]!
166
+        vcvt.s32.f32    q2,  q2,  #16
167
+        vzip.32         d18, d22
168
+        vld1.64         {d6-d7},  [r7,:128]!
169
+        vcvt.s32.f32    q3,  q3,  #16
170
+        vzip.32         d19, d23
171
+        vst1.64         {d18},    [r8], ip
172
+        vsri.32         q1,  q0,  #16
173
+        vst1.64         {d22},    [r8], ip
174
+        vsri.32         q3,  q2,  #16
175
+        vst1.64         {d19},    [r8], ip
176
+        vzip.32         d2,  d6
177
+        vst1.64         {d23},    [r8], ip
178
+        vzip.32         d3,  d7
179
+        beq             7f
180
+        vld1.64         {d16-d17},[r4,:128]!
181
+        vcvt.s32.f32    q8,  q8,  #16
182
+        vst1.64         {d2},     [r8], ip
183
+        vld1.64         {d18-d19},[r5,:128]!
184
+        vcvt.s32.f32    q9,  q9,  #16
185
+        vst1.64         {d6},     [r8], ip
186
+        vld1.64         {d20-d21},[r6,:128]!
187
+        vcvt.s32.f32    q10, q10, #16
188
+        vst1.64         {d3},     [r8], ip
189
+        vld1.64         {d22-d23},[r7,:128]!
190
+        vcvt.s32.f32    q11, q11, #16
191
+        vst1.64         {d7},     [r8], ip
192
+        b               6b
193
+7:      vst1.64         {d2},     [r8], ip
194
+        vst1.64         {d6},     [r8], ip
195
+        vst1.64         {d3},     [r8], ip
196
+        vst1.64         {d7},     [r8], ip
197
+        subs            r3,  r3,  #4
198
+        popeq           {r4-r8,pc}
199
+        cmp             r3,  #4
200
+        add             r0,  r0,  #8
201
+        bge             5b
202
+
203
+        @ 2 channels
204
+4:      cmp             r3,  #2
205
+        blt             4f
206
+        ldmia           r1!, {r4-r5}
207
+        mov             lr,  r2
208
+        mov             r8,  r0
209
+        tst             lr,  #8
210
+        vld1.64         {d16-d17},[r4,:128]!
211
+        vcvt.s32.f32    q8,  q8,  #16
212
+        vld1.64         {d18-d19},[r5,:128]!
213
+        vcvt.s32.f32    q9,  q9,  #16
214
+        vld1.64         {d20-d21},[r4,:128]!
215
+        vcvt.s32.f32    q10, q10, #16
216
+        vld1.64         {d22-d23},[r5,:128]!
217
+        vcvt.s32.f32    q11, q11, #16
218
+        beq             6f
219
+        subs            lr,  lr,  #8
220
+        beq             7f
221
+        vsri.32         d18, d16, #16
222
+        vsri.32         d19, d17, #16
223
+        vld1.64         {d16-d17},[r4,:128]!
224
+        vcvt.s32.f32    q8,  q8,  #16
225
+        vst1.32         {d18[0]}, [r8], ip
226
+        vsri.32         d22, d20, #16
227
+        vst1.32         {d18[1]}, [r8], ip
228
+        vsri.32         d23, d21, #16
229
+        vst1.32         {d19[0]}, [r8], ip
230
+        vst1.32         {d19[1]}, [r8], ip
231
+        vld1.64         {d18-d19},[r5,:128]!
232
+        vcvt.s32.f32    q9,  q9,  #16
233
+        vst1.32         {d22[0]}, [r8], ip
234
+        vst1.32         {d22[1]}, [r8], ip
235
+        vld1.64         {d20-d21},[r4,:128]!
236
+        vcvt.s32.f32    q10, q10, #16
237
+        vst1.32         {d23[0]}, [r8], ip
238
+        vst1.32         {d23[1]}, [r8], ip
239
+        vld1.64         {d22-d23},[r5,:128]!
240
+        vcvt.s32.f32    q11, q11, #16
241
+6:      subs            lr,  lr,  #16
242
+        vld1.64         {d0-d1},  [r4,:128]!
243
+        vcvt.s32.f32    q0,  q0,  #16
244
+        vsri.32         d18, d16, #16
245
+        vld1.64         {d2-d3},  [r5,:128]!
246
+        vcvt.s32.f32    q1,  q1,  #16
247
+        vsri.32         d19, d17, #16
248
+        vld1.64         {d4-d5},  [r4,:128]!
249
+        vcvt.s32.f32    q2,  q2,  #16
250
+        vld1.64         {d6-d7},  [r5,:128]!
251
+        vcvt.s32.f32    q3,  q3,  #16
252
+        vst1.32         {d18[0]}, [r8], ip
253
+        vsri.32         d22, d20, #16
254
+        vst1.32         {d18[1]}, [r8], ip
255
+        vsri.32         d23, d21, #16
256
+        vst1.32         {d19[0]}, [r8], ip
257
+        vsri.32         d2,  d0,  #16
258
+        vst1.32         {d19[1]}, [r8], ip
259
+        vsri.32         d3,  d1,  #16
260
+        vst1.32         {d22[0]}, [r8], ip
261
+        vsri.32         d6,  d4,  #16
262
+        vst1.32         {d22[1]}, [r8], ip
263
+        vsri.32         d7,  d5,  #16
264
+        vst1.32         {d23[0]}, [r8], ip
265
+        vst1.32         {d23[1]}, [r8], ip
266
+        beq             6f
267
+        vld1.64         {d16-d17},[r4,:128]!
268
+        vcvt.s32.f32    q8,  q8,  #16
269
+        vst1.32         {d2[0]},  [r8], ip
270
+        vst1.32         {d2[1]},  [r8], ip
271
+        vld1.64         {d18-d19},[r5,:128]!
272
+        vcvt.s32.f32    q9,  q9,  #16
273
+        vst1.32         {d3[0]},  [r8], ip
274
+        vst1.32         {d3[1]},  [r8], ip
275
+        vld1.64         {d20-d21},[r4,:128]!
276
+        vcvt.s32.f32    q10, q10, #16
277
+        vst1.32         {d6[0]},  [r8], ip
278
+        vst1.32         {d6[1]},  [r8], ip
279
+        vld1.64         {d22-d23},[r5,:128]!
280
+        vcvt.s32.f32    q11, q11, #16
281
+        vst1.32         {d7[0]},  [r8], ip
282
+        vst1.32         {d7[1]},  [r8], ip
283
+        bgt             6b
284
+6:      vst1.32         {d2[0]},  [r8], ip
285
+        vst1.32         {d2[1]},  [r8], ip
286
+        vst1.32         {d3[0]},  [r8], ip
287
+        vst1.32         {d3[1]},  [r8], ip
288
+        vst1.32         {d6[0]},  [r8], ip
289
+        vst1.32         {d6[1]},  [r8], ip
290
+        vst1.32         {d7[0]},  [r8], ip
291
+        vst1.32         {d7[1]},  [r8], ip
292
+        b               8f
293
+7:      vsri.32         d18, d16, #16
294
+        vsri.32         d19, d17, #16
295
+        vst1.32         {d18[0]}, [r8], ip
296
+        vsri.32         d22, d20, #16
297
+        vst1.32         {d18[1]}, [r8], ip
298
+        vsri.32         d23, d21, #16
299
+        vst1.32         {d19[0]}, [r8], ip
300
+        vst1.32         {d19[1]}, [r8], ip
301
+        vst1.32         {d22[0]}, [r8], ip
302
+        vst1.32         {d22[1]}, [r8], ip
303
+        vst1.32         {d23[0]}, [r8], ip
304
+        vst1.32         {d23[1]}, [r8], ip
305
+8:      subs            r3,  r3,  #2
306
+        add             r0,  r0,  #4
307
+        popeq           {r4-r8,pc}
308
+
309
+        @ 1 channel
310
+4:      ldr             r4,  [r1],#4
311
+        tst             r2,  #8
312
+        mov             lr,  r2
313
+        mov             r5,  r0
314
+        vld1.64         {d0-d1},  [r4,:128]!
315
+        vcvt.s32.f32    q0,  q0,  #16
316
+        vld1.64         {d2-d3},  [r4,:128]!
317
+        vcvt.s32.f32    q1,  q1,  #16
318
+        bne             8f
319
+6:      subs            lr,  lr,  #16
320
+        vld1.64         {d4-d5},  [r4,:128]!
321
+        vcvt.s32.f32    q2,  q2,  #16
322
+        vld1.64         {d6-d7},  [r4,:128]!
323
+        vcvt.s32.f32    q3,  q3,  #16
324
+        vst1.16         {d0[1]},  [r5,:16], ip
325
+        vst1.16         {d0[3]},  [r5,:16], ip
326
+        vst1.16         {d1[1]},  [r5,:16], ip
327
+        vst1.16         {d1[3]},  [r5,:16], ip
328
+        vst1.16         {d2[1]},  [r5,:16], ip
329
+        vst1.16         {d2[3]},  [r5,:16], ip
330
+        vst1.16         {d3[1]},  [r5,:16], ip
331
+        vst1.16         {d3[3]},  [r5,:16], ip
332
+        beq             7f
333
+        vld1.64         {d0-d1},  [r4,:128]!
334
+        vcvt.s32.f32    q0,  q0,  #16
335
+        vld1.64         {d2-d3},  [r4,:128]!
336
+        vcvt.s32.f32    q1,  q1,  #16
337
+7:      vst1.16         {d4[1]},  [r5,:16], ip
338
+        vst1.16         {d4[3]},  [r5,:16], ip
339
+        vst1.16         {d5[1]},  [r5,:16], ip
340
+        vst1.16         {d5[3]},  [r5,:16], ip
341
+        vst1.16         {d6[1]},  [r5,:16], ip
342
+        vst1.16         {d6[3]},  [r5,:16], ip
343
+        vst1.16         {d7[1]},  [r5,:16], ip
344
+        vst1.16         {d7[3]},  [r5,:16], ip
345
+        bgt             6b
346
+        pop             {r4-r8,pc}
347
+8:      subs            lr,  lr,  #8
348
+        vst1.16         {d0[1]},  [r5,:16], ip
349
+        vst1.16         {d0[3]},  [r5,:16], ip
350
+        vst1.16         {d1[1]},  [r5,:16], ip
351
+        vst1.16         {d1[3]},  [r5,:16], ip
352
+        vst1.16         {d2[1]},  [r5,:16], ip
353
+        vst1.16         {d2[3]},  [r5,:16], ip
354
+        vst1.16         {d3[1]},  [r5,:16], ip
355
+        vst1.16         {d3[3]},  [r5,:16], ip
356
+        popeq           {r4-r8,pc}
357
+        vld1.64         {d0-d1},  [r4,:128]!
358
+        vcvt.s32.f32    q0,  q0,  #16
359
+        vld1.64         {d2-d3},  [r4,:128]!
360
+        vcvt.s32.f32    q1,  q1,  #16
361
+        b               6b
362
+endfunc
363
+
364
+function ff_int32_to_float_fmul_scalar_neon, export=1
365
+VFP     vdup.32         q0,  d0[0]
366
+VFP     len     .req    r2
367
+NOVFP   vdup.32         q0,  r2
368
+NOVFP   len     .req    r3
369
+
370
+        vld1.32         {q1},[r1,:128]!
371
+        vcvt.f32.s32    q3,  q1
372
+        vld1.32         {q2},[r1,:128]!
373
+        vcvt.f32.s32    q8,  q2
374
+1:      subs            len, len, #8
375
+        pld             [r1, #16]
376
+        vmul.f32        q9,  q3,  q0
377
+        vmul.f32        q10, q8,  q0
378
+        beq             2f
379
+        vld1.32         {q1},[r1,:128]!
380
+        vcvt.f32.s32    q3,  q1
381
+        vld1.32         {q2},[r1,:128]!
382
+        vcvt.f32.s32    q8,  q2
383
+        vst1.32         {q9}, [r0,:128]!
384
+        vst1.32         {q10},[r0,:128]!
385
+        b               1b
386
+2:      vst1.32         {q9}, [r0,:128]!
387
+        vst1.32         {q10},[r0,:128]!
388
+        bx              lr
389
+        .unreq  len
390
+endfunc
0 391
new file mode 100644
... ...
@@ -0,0 +1,77 @@
0
+/*
1
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "config.h"
21
+#include "asm.S"
22
+
23
+        .syntax unified
24
+
25
+/**
26
+ * ARM VFP optimized float to int16 conversion.
27
+ * Assume that len is a positive number and is multiple of 8, destination
28
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
29
+ * performance), little endian byte sex
30
+ */
31
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
32
+function ff_float_to_int16_vfp, export=1
33
+        push            {r4-r8,lr}
34
+        vpush           {d8-d11}
35
+        vldmia          r1!, {s16-s23}
36
+        vcvt.s32.f32    s0,  s16
37
+        vcvt.s32.f32    s1,  s17
38
+        vcvt.s32.f32    s2,  s18
39
+        vcvt.s32.f32    s3,  s19
40
+        vcvt.s32.f32    s4,  s20
41
+        vcvt.s32.f32    s5,  s21
42
+        vcvt.s32.f32    s6,  s22
43
+        vcvt.s32.f32    s7,  s23
44
+1:
45
+        subs            r2,  r2,  #8
46
+        vmov            r3,  r4,  s0, s1
47
+        vmov            r5,  r6,  s2, s3
48
+        vmov            r7,  r8,  s4, s5
49
+        vmov            ip,  lr,  s6, s7
50
+        vldmiagt        r1!, {s16-s23}
51
+        ssat            r4,  #16, r4
52
+        ssat            r3,  #16, r3
53
+        ssat            r6,  #16, r6
54
+        ssat            r5,  #16, r5
55
+        pkhbt           r3,  r3,  r4, lsl #16
56
+        pkhbt           r4,  r5,  r6, lsl #16
57
+        vcvtgt.s32.f32  s0,  s16
58
+        vcvtgt.s32.f32  s1,  s17
59
+        vcvtgt.s32.f32  s2,  s18
60
+        vcvtgt.s32.f32  s3,  s19
61
+        vcvtgt.s32.f32  s4,  s20
62
+        vcvtgt.s32.f32  s5,  s21
63
+        vcvtgt.s32.f32  s6,  s22
64
+        vcvtgt.s32.f32  s7,  s23
65
+        ssat            r8,  #16, r8
66
+        ssat            r7,  #16, r7
67
+        ssat            lr,  #16, lr
68
+        ssat            ip,  #16, ip
69
+        pkhbt           r5,  r7,  r8, lsl #16
70
+        pkhbt           r6,  ip,  lr, lsl #16
71
+        stmia           r0!, {r3-r6}
72
+        bgt             1b
73
+
74
+        vpop            {d8-d11}
75
+        pop             {r4-r8,pc}
76
+endfunc
... ...
@@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 
 extern const uint16_t ff_wma_critical_freqs[25];
 
... ...
@@ -43,6 +44,7 @@ typedef struct {
     AVCodecContext *avctx;
     GetBitContext gb;
     DSPContext dsp;
+    FmtConvertContext fmt_conv;
     int first;
     int channels;
     int frame_len;          ///< transform size (samples)
... ...
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
     dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
 
     /* determine frame length */
     if (avctx->sample_rate < 22050) {
... ...
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
             ff_rdft_calc(&s->trans.rdft, coeffs);
     }
 
-    s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
+    s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
+                                          s->frame_len, s->channels);
 
     if (!s->first) {
         int count = s->overlap_len * s->channels;
... ...
@@ -40,6 +40,7 @@
 #include "dca.h"
 #include "synth_filter.h"
 #include "dcadsp.h"
+#include "fmtconvert.h"
 
 //#define TRACE
 
... ...
@@ -347,6 +348,7 @@ typedef struct {
     FFTContext imdct;
     SynthFilterContext synth;
     DCADSPContext dcadsp;
+    FmtConvertContext fmt_conv;
 } DCAContext;
 
 static const uint16_t dca_vlc_offs[] = {
... ...
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
                         block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
                 }
 
-                s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
+                s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
                                                   block, rscale, 8);
             }
 
... ...
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
             }
         }
 
-        s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
+        s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
         samples += 256 * channels;
     }
 
... ...
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
     ff_mdct_init(&s->imdct, 6, 1, 1.0);
     ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
 
     for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
         s->samples_chanptr[i] = s->samples + i * 256;
... ...
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
     return p;
 }
 
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = src[i] * mul;
-}
-
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                    uint32_t maxi, uint32_t maxisign)
 {
... ...
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
     }
 }
 
-static av_always_inline int float_to_int16_one(const float *src){
-    return av_clip_int16(lrintf(*src));
-}
-
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = float_to_int16_one(src+i);
-}
-
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
-    int i,j,c;
-    if(channels==2){
-        for(i=0; i<len; i++){
-            dst[2*i]   = float_to_int16_one(src[0]+i);
-            dst[2*i+1] = float_to_int16_one(src[1]+i);
-        }
-    }else{
-        for(c=0; c<channels; c++)
-            for(i=0, j=c; i<len; i++, j+=channels)
-                dst[j] = float_to_int16_one(src[c]+i);
-    }
-}
-
 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
 {
     int res = 0;
... ...
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_fmul_reverse = vector_fmul_reverse_c;
     c->vector_fmul_add = vector_fmul_add_c;
     c->vector_fmul_window = vector_fmul_window_c;
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->vector_clipf = vector_clipf_c;
-    c->float_to_int16 = ff_float_to_int16_c;
-    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
... ...
@@ -392,7 +392,6 @@ typedef struct DSPContext {
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
-    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
     void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
     /**
      * Multiply a vector of floats by a scalar float.  Source and
... ...
@@ -445,10 +444,6 @@ typedef struct DSPContext {
      */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
 
-    /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
-    void (*float_to_int16)(int16_t *dst, const float *src, long len);
-    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
     /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
new file mode 100644
... ...
@@ -0,0 +1,68 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#include "avcodec.h"
23
+#include "fmtconvert.h"
24
+
25
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
26
+    int i;
27
+    for(i=0; i<len; i++)
28
+        dst[i] = src[i] * mul;
29
+}
30
+
31
+static av_always_inline int float_to_int16_one(const float *src){
32
+    return av_clip_int16(lrintf(*src));
33
+}
34
+
35
+static void float_to_int16_c(int16_t *dst, const float *src, long len)
36
+{
37
+    int i;
38
+    for(i=0; i<len; i++)
39
+        dst[i] = float_to_int16_one(src+i);
40
+}
41
+
42
+static void float_to_int16_interleave_c(int16_t *dst, const float **src,
43
+                                        long len, int channels)
44
+{
45
+    int i,j,c;
46
+    if(channels==2){
47
+        for(i=0; i<len; i++){
48
+            dst[2*i]   = float_to_int16_one(src[0]+i);
49
+            dst[2*i+1] = float_to_int16_one(src[1]+i);
50
+        }
51
+    }else{
52
+        for(c=0; c<channels; c++)
53
+            for(i=0, j=c; i<len; i++, j+=channels)
54
+                dst[j] = float_to_int16_one(src[c]+i);
55
+    }
56
+}
57
+
58
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
59
+{
60
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
61
+    c->float_to_int16             = float_to_int16_c;
62
+    c->float_to_int16_interleave  = float_to_int16_interleave_c;
63
+
64
+    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
65
+    if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx);
66
+    if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
67
+}
0 68
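For reference, the per-sample rule implemented by float_to_int16_one() above (round with lrintf(), then saturate to the int16_t range) can be reproduced outside libavcodec. The small standalone program below is a re-implementation for illustration only, with clip_int16() standing in for av_clip_int16(); it is not the library function itself.

#include <math.h>      /* lrintf() */
#include <stdint.h>
#include <stdio.h>

/* local stand-in for av_clip_int16(): saturate to [-32768,32767] */
static int16_t clip_int16(long v)
{
    if (v < -32768) return -32768;
    if (v >  32767) return  32767;
    return (int16_t)v;
}

int main(void)
{
    const float src[4] = { 0.4f, 0.6f, 40000.0f, -40000.0f };
    int16_t dst[4];
    int i;

    /* same element-wise rule as float_to_int16_c() */
    for (i = 0; i < 4; i++)
        dst[i] = clip_int16(lrintf(src[i]));

    for (i = 0; i < 4; i++)
        printf("%d\n", dst[i]);   /* prints 0, 1, 32767, -32768 */
    return 0;
}

Build with e.g. cc example.c -lm; out-of-range inputs saturate rather than wrap, which is the behaviour the decoders rely on.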
new file mode 100644
... ...
@@ -0,0 +1,79 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#ifndef AVCODEC_FMTCONVERT_H
23
+#define AVCODEC_FMTCONVERT_H
24
+
25
+#include "avcodec.h"
26
+
27
+typedef struct FmtConvertContext {
28
+    /**
29
+     * Convert an array of int32_t to float and multiply by a float value.
30
+     * @param dst destination array of float.
31
+     *            constraints: 16-byte aligned
32
+     * @param src source array of int32_t.
33
+     *            constraints: 16-byte aligned
34
+     * @param len number of elements to convert.
35
+     *            constraints: multiple of 8
36
+     */
37
+    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
38
+
39
+    /**
40
+     * Convert an array of float to an array of int16_t.
41
+     *
42
+     * Convert floats in the range [-32768.0,32767.0] to ints
43
+     * without rescaling.
44
+     *
45
+     * @param dst destination array of int16_t.
46
+     *            constraints: 16-byte aligned
47
+     * @param src source array of float.
48
+     *            constraints: 16-byte aligned
49
+     * @param len number of elements to convert.
50
+     *            constraints: multiple of 8
51
+     */
52
+    void (*float_to_int16)(int16_t *dst, const float *src, long len);
53
+
54
+    /**
55
+     * Convert multiple arrays of float to an interleaved array of int16_t.
56
+     *
57
+     * Convert floats in the range [-32768.0,32767.0] to ints
58
+     * without rescaling.
59
+     *
60
+     * @param dst destination array of interleaved int16_t.
61
+     *            constraints: 16-byte aligned
62
+     * @param src source array of float arrays, one for each channel.
63
+     *            constraints: 16-byte aligned
64
+     * @param len number of elements to convert.
65
+     *            constraints: multiple of 8
66
+     * @param channels number of channels
67
+     */
68
+    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
69
+                                      long len, int channels);
70
+} FmtConvertContext;
71
+
72
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
73
+
74
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
75
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
76
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
77
+
78
+#endif /* AVCODEC_FMTCONVERT_H */
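To show how this context is meant to be used, here is a minimal sketch following the same pattern this commit applies to the AAC, Vorbis, WMA and Nellymoser decoders: embed a FmtConvertContext in the codec's private context, initialize it once, and call the function pointers when writing output. MyDecodeContext, my_decode_init() and my_output_samples() are hypothetical names for this sketch, not part of the patch.

#include "avcodec.h"
#include "fmtconvert.h"

typedef struct MyDecodeContext {
    FmtConvertContext fmt_conv;  /* replaces the conversion pointers formerly in DSPContext */
    /* ... other decoder state ... */
} MyDecodeContext;

static av_cold int my_decode_init(AVCodecContext *avctx, MyDecodeContext *s)
{
    /* installs the C versions, then lets the ARM/PPC/x86 init override them */
    ff_fmt_convert_init(&s->fmt_conv, avctx);
    return 0;
}

static void my_output_samples(MyDecodeContext *s, int16_t *out,
                              const float **planar, int nb_samples, int channels)
{
    /* interleave per-channel float buffers into packed int16_t samples */
    s->fmt_conv.float_to_int16_interleave(out, planar, nb_samples, channels);
}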
... ...
@@ -38,6 +38,7 @@
38 38
 #include "avcodec.h"
39 39
 #include "dsputil.h"
40 40
 #include "fft.h"
41
+#include "fmtconvert.h"
41 42
 
42 43
 #define ALT_BITSTREAM_READER_LE
43 44
 #include "get_bits.h"
... ...
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
52 52
     float           scale_bias;
53 53
     DSPContext      dsp;
54 54
     FFTContext      imdct_ctx;
55
+    FmtConvertContext fmt_conv;
55 56
     DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
56 57
 } NellyMoserDecodeContext;
57 58
 
... ...
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
134 134
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
135 135
 
136 136
     dsputil_init(&s->dsp, avctx);
137
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
137 138
 
138 139
     s->scale_bias = 1.0/(1*8);
139 140
 
... ...
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
175 175
 
176 176
     for (i=0 ; i<blocks ; i++) {
177 177
         nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
178
-        s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
178
+        s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
179 179
         *data_size += NELLY_SAMPLES*sizeof(int16_t);
180 180
     }
181 181
 
... ...
@@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT)             += ppc/fft_altivec.o             \
21 21
 OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o         \
22 22
                                           ppc/fdct_altivec.o            \
23 23
                                           ppc/float_altivec.o           \
24
+                                          ppc/fmtconvert_altivec.o      \
24 25
                                           ppc/gmc_altivec.o             \
25 26
                                           ppc/idct_altivec.o            \
26 27
                                           ppc/int_altivec.o             \
... ...
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
122 122
     }
123 123
 }
124 124
 
125
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
126
-{
127
-    union {
128
-        vector float v;
129
-        float s[4];
130
-    } mul_u;
131
-    int i;
132
-    vector float src1, src2, dst1, dst2, mul_v, zero;
133
-
134
-    zero = (vector float)vec_splat_u32(0);
135
-    mul_u.s[0] = mul;
136
-    mul_v = vec_splat(mul_u.v, 0);
137
-
138
-    for(i=0; i<len; i+=8) {
139
-        src1 = vec_ctf(vec_ld(0,  src+i), 0);
140
-        src2 = vec_ctf(vec_ld(16, src+i), 0);
141
-        dst1 = vec_madd(src1, mul_v, zero);
142
-        dst2 = vec_madd(src2, mul_v, zero);
143
-        vec_st(dst1,  0, dst+i);
144
-        vec_st(dst2, 16, dst+i);
145
-    }
146
-}
147
-
148
-
149
-static vector signed short
150
-float_to_int16_one_altivec(const float *src)
151
-{
152
-    vector float s0 = vec_ld(0, src);
153
-    vector float s1 = vec_ld(16, src);
154
-    vector signed int t0 = vec_cts(s0, 0);
155
-    vector signed int t1 = vec_cts(s1, 0);
156
-    return vec_packs(t0,t1);
157
-}
158
-
159
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
160
-{
161
-    int i;
162
-    vector signed short d0, d1, d;
163
-    vector unsigned char align;
164
-    if(((long)dst)&15) //FIXME
165
-    for(i=0; i<len-7; i+=8) {
166
-        d0 = vec_ld(0, dst+i);
167
-        d = float_to_int16_one_altivec(src+i);
168
-        d1 = vec_ld(15, dst+i);
169
-        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
170
-        align = vec_lvsr(0, dst+i);
171
-        d0 = vec_perm(d1, d, align);
172
-        d1 = vec_perm(d, d1, align);
173
-        vec_st(d0, 0, dst+i);
174
-        vec_st(d1,15, dst+i);
175
-    }
176
-    else
177
-    for(i=0; i<len-7; i+=8) {
178
-        d = float_to_int16_one_altivec(src+i);
179
-        vec_st(d, 0, dst+i);
180
-    }
181
-}
182
-
183
-static void
184
-float_to_int16_interleave_altivec(int16_t *dst, const float **src,
185
-                                  long len, int channels)
186
-{
187
-    int i;
188
-    vector signed short d0, d1, d2, c0, c1, t0, t1;
189
-    vector unsigned char align;
190
-    if(channels == 1)
191
-        float_to_int16_altivec(dst, src[0], len);
192
-    else
193
-        if (channels == 2) {
194
-        if(((long)dst)&15)
195
-        for(i=0; i<len-7; i+=8) {
196
-            d0 = vec_ld(0, dst + i);
197
-            t0 = float_to_int16_one_altivec(src[0] + i);
198
-            d1 = vec_ld(31, dst + i);
199
-            t1 = float_to_int16_one_altivec(src[1] + i);
200
-            c0 = vec_mergeh(t0, t1);
201
-            c1 = vec_mergel(t0, t1);
202
-            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
203
-            align = vec_lvsr(0, dst + i);
204
-            d0 = vec_perm(d2, c0, align);
205
-            d1 = vec_perm(c0, c1, align);
206
-            vec_st(d0,  0, dst + i);
207
-            d0 = vec_perm(c1, d2, align);
208
-            vec_st(d1, 15, dst + i);
209
-            vec_st(d0, 31, dst + i);
210
-            dst+=8;
211
-        }
212
-        else
213
-        for(i=0; i<len-7; i+=8) {
214
-            t0 = float_to_int16_one_altivec(src[0] + i);
215
-            t1 = float_to_int16_one_altivec(src[1] + i);
216
-            d0 = vec_mergeh(t0, t1);
217
-            d1 = vec_mergel(t0, t1);
218
-            vec_st(d0,  0, dst + i);
219
-            vec_st(d1, 16, dst + i);
220
-            dst+=8;
221
-        }
222
-    } else {
223
-        DECLARE_ALIGNED(16, int16_t, tmp)[len];
224
-        int c, j;
225
-        for (c = 0; c < channels; c++) {
226
-            float_to_int16_altivec(tmp, src[c], len);
227
-            for (i = 0, j = c; i < len; i++, j+=channels) {
228
-                dst[j] = tmp[i];
229
-            }
230
-        }
231
-   }
232
-}
233
-
234 125
 void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
235 126
 {
236 127
     c->vector_fmul = vector_fmul_altivec;
237 128
     c->vector_fmul_reverse = vector_fmul_reverse_altivec;
238 129
     c->vector_fmul_add = vector_fmul_add_altivec;
239
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
240 130
     if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
241 131
         c->vector_fmul_window = vector_fmul_window_altivec;
242
-        c->float_to_int16 = float_to_int16_altivec;
243
-        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
244 132
     }
245 133
 }
246 134
new file mode 100644
... ...
@@ -0,0 +1,142 @@
0
+/*
1
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavcodec/fmtconvert.h"
21
+
22
+#include "dsputil_altivec.h"
23
+#include "util_altivec.h"
24
+
25
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
26
+{
27
+    union {
28
+        vector float v;
29
+        float s[4];
30
+    } mul_u;
31
+    int i;
32
+    vector float src1, src2, dst1, dst2, mul_v, zero;
33
+
34
+    zero = (vector float)vec_splat_u32(0);
35
+    mul_u.s[0] = mul;
36
+    mul_v = vec_splat(mul_u.v, 0);
37
+
38
+    for(i=0; i<len; i+=8) {
39
+        src1 = vec_ctf(vec_ld(0,  src+i), 0);
40
+        src2 = vec_ctf(vec_ld(16, src+i), 0);
41
+        dst1 = vec_madd(src1, mul_v, zero);
42
+        dst2 = vec_madd(src2, mul_v, zero);
43
+        vec_st(dst1,  0, dst+i);
44
+        vec_st(dst2, 16, dst+i);
45
+    }
46
+}
47
+
48
+
49
+static vector signed short
50
+float_to_int16_one_altivec(const float *src)
51
+{
52
+    vector float s0 = vec_ld(0, src);
53
+    vector float s1 = vec_ld(16, src);
54
+    vector signed int t0 = vec_cts(s0, 0);
55
+    vector signed int t1 = vec_cts(s1, 0);
56
+    return vec_packs(t0,t1);
57
+}
58
+
59
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
60
+{
61
+    int i;
62
+    vector signed short d0, d1, d;
63
+    vector unsigned char align;
64
+    if(((long)dst)&15) //FIXME
65
+    for(i=0; i<len-7; i+=8) {
66
+        d0 = vec_ld(0, dst+i);
67
+        d = float_to_int16_one_altivec(src+i);
68
+        d1 = vec_ld(15, dst+i);
69
+        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
70
+        align = vec_lvsr(0, dst+i);
71
+        d0 = vec_perm(d1, d, align);
72
+        d1 = vec_perm(d, d1, align);
73
+        vec_st(d0, 0, dst+i);
74
+        vec_st(d1,15, dst+i);
75
+    }
76
+    else
77
+    for(i=0; i<len-7; i+=8) {
78
+        d = float_to_int16_one_altivec(src+i);
79
+        vec_st(d, 0, dst+i);
80
+    }
81
+}
82
+
83
+static void
84
+float_to_int16_interleave_altivec(int16_t *dst, const float **src,
85
+                                  long len, int channels)
86
+{
87
+    int i;
88
+    vector signed short d0, d1, d2, c0, c1, t0, t1;
89
+    vector unsigned char align;
90
+    if(channels == 1)
91
+        float_to_int16_altivec(dst, src[0], len);
92
+    else
93
+        if (channels == 2) {
94
+        if(((long)dst)&15)
95
+        for(i=0; i<len-7; i+=8) {
96
+            d0 = vec_ld(0, dst + i);
97
+            t0 = float_to_int16_one_altivec(src[0] + i);
98
+            d1 = vec_ld(31, dst + i);
99
+            t1 = float_to_int16_one_altivec(src[1] + i);
100
+            c0 = vec_mergeh(t0, t1);
101
+            c1 = vec_mergel(t0, t1);
102
+            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
103
+            align = vec_lvsr(0, dst + i);
104
+            d0 = vec_perm(d2, c0, align);
105
+            d1 = vec_perm(c0, c1, align);
106
+            vec_st(d0,  0, dst + i);
107
+            d0 = vec_perm(c1, d2, align);
108
+            vec_st(d1, 15, dst + i);
109
+            vec_st(d0, 31, dst + i);
110
+            dst+=8;
111
+        }
112
+        else
113
+        for(i=0; i<len-7; i+=8) {
114
+            t0 = float_to_int16_one_altivec(src[0] + i);
115
+            t1 = float_to_int16_one_altivec(src[1] + i);
116
+            d0 = vec_mergeh(t0, t1);
117
+            d1 = vec_mergel(t0, t1);
118
+            vec_st(d0,  0, dst + i);
119
+            vec_st(d1, 16, dst + i);
120
+            dst+=8;
121
+        }
122
+    } else {
123
+        DECLARE_ALIGNED(16, int16_t, tmp)[len];
124
+        int c, j;
125
+        for (c = 0; c < channels; c++) {
126
+            float_to_int16_altivec(tmp, src[c], len);
127
+            for (i = 0, j = c; i < len; i++, j+=channels) {
128
+                dst[j] = tmp[i];
129
+            }
130
+        }
131
+   }
132
+}
133
+
134
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
135
+{
136
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
137
+    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
138
+        c->float_to_int16 = float_to_int16_altivec;
139
+        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
140
+    }
141
+}
... ...
@@ -31,6 +31,7 @@
31 31
 #include "get_bits.h"
32 32
 #include "dsputil.h"
33 33
 #include "fft.h"
34
+#include "fmtconvert.h"
34 35
 
35 36
 #include "vorbis.h"
36 37
 #include "xiph.h"
... ...
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
127 127
     AVCodecContext *avccontext;
128 128
     GetBitContext gb;
129 129
     DSPContext dsp;
130
+    FmtConvertContext fmt_conv;
130 131
 
131 132
     FFTContext mdct[2];
132 133
     uint_fast8_t  first_frame;
... ...
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
961 961
 
962 962
     vc->avccontext = avccontext;
963 963
     dsputil_init(&vc->dsp, avccontext);
964
+    ff_fmt_convert_init(&vc->fmt_conv, avccontext);
964 965
 
965 966
     vc->scale_bias = 32768.0f;
966 967
 
... ...
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
1636 1636
                               len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
1637 1637
     }
1638 1638
 
1639
-    vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
1639
+    vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
1640
+                                           vc->audio_channels);
1640 1641
     *data_size = len * 2 * vc->audio_channels;
1641 1642
 
1642 1643
     return buf_size ;
... ...
@@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
126 126
     s->block_align = avctx->block_align;
127 127
 
128 128
     dsputil_init(&s->dsp, avctx);
129
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
129 130
 
130 131
     if (avctx->codec->id == CODEC_ID_WMAV1) {
131 132
         s->version = 1;
... ...
@@ -26,6 +26,7 @@
26 26
 #include "put_bits.h"
27 27
 #include "dsputil.h"
28 28
 #include "fft.h"
29
+#include "fmtconvert.h"
29 30
 
30 31
 /* size of blocks */
31 32
 #define BLOCK_MIN_BITS 7
... ...
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
134 134
     float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
135 135
     float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
136 136
     DSPContext dsp;
137
+    FmtConvertContext fmt_conv;
137 138
 
138 139
 #ifdef TRACE
139 140
     int frame_count;
... ...
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
791 791
     incr = s->nb_channels;
792 792
     for (ch = 0; ch < MAX_CHANNELS; ch++)
793 793
         output[ch] = s->frame_out[ch];
794
-    s->dsp.float_to_int16_interleave(samples, output, n, incr);
794
+    s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
795 795
     for (ch = 0; ch < incr; ch++) {
796 796
         /* prepare for next block */
797 797
         memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
... ...
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
39 39
 MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
40 40
 MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
41 41
                                           x86/deinterlace.o             \
42
+                                          x86/fmtconvert.o              \
42 43
                                           x86/h264_chromamc.o           \
43 44
                                           $(YASM-OBJS-yes)
44 45
 
... ...
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
47 47
 OBJS-$(HAVE_MMX)                       += x86/dnxhd_mmx.o               \
48 48
                                           x86/dsputil_mmx.o             \
49 49
                                           x86/fdct_mmx.o                \
50
+                                          x86/fmtconvert_mmx.o          \
50 51
                                           x86/idct_mmx_xvid.o           \
51 52
                                           x86/idct_sse2_xvid.o          \
52 53
                                           x86/motion_est_mmx.o          \
... ...
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
2349 2349
 }
2350 2350
 #endif /* HAVE_6REGS */
2351 2351
 
2352
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2353
-{
2354
-    x86_reg i = -4*len;
2355
-    __asm__ volatile(
2356
-        "movss  %3, %%xmm4 \n"
2357
-        "shufps $0, %%xmm4, %%xmm4 \n"
2358
-        "1: \n"
2359
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
2360
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
2361
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
2362
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
2363
-        "movlhps  %%xmm1,    %%xmm0 \n"
2364
-        "movlhps  %%xmm3,    %%xmm2 \n"
2365
-        "mulps    %%xmm4,    %%xmm0 \n"
2366
-        "mulps    %%xmm4,    %%xmm2 \n"
2367
-        "movaps   %%xmm0,   (%1,%0) \n"
2368
-        "movaps   %%xmm2, 16(%1,%0) \n"
2369
-        "add $32, %0 \n"
2370
-        "jl 1b \n"
2371
-        :"+r"(i)
2372
-        :"r"(dst+len), "r"(src+len), "m"(mul)
2373
-    );
2374
-}
2375
-
2376
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
2377
-{
2378
-    x86_reg i = -4*len;
2379
-    __asm__ volatile(
2380
-        "movss  %3, %%xmm4 \n"
2381
-        "shufps $0, %%xmm4, %%xmm4 \n"
2382
-        "1: \n"
2383
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
2384
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
2385
-        "mulps    %%xmm4,    %%xmm0 \n"
2386
-        "mulps    %%xmm4,    %%xmm1 \n"
2387
-        "movaps   %%xmm0,   (%1,%0) \n"
2388
-        "movaps   %%xmm1, 16(%1,%0) \n"
2389
-        "add $32, %0 \n"
2390
-        "jl 1b \n"
2391
-        :"+r"(i)
2392
-        :"r"(dst+len), "r"(src+len), "m"(mul)
2393
-    );
2394
-}
2395
-
2396 2352
 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2397 2353
                              int len)
2398 2354
 {
... ...
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2427 2427
     );
2428 2428
 }
2429 2429
 
2430
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2431
-    x86_reg reglen = len;
2432
-    // not bit-exact: pf2id uses different rounding than C and SSE
2433
-    __asm__ volatile(
2434
-        "add        %0          , %0        \n\t"
2435
-        "lea         (%2,%0,2)  , %2        \n\t"
2436
-        "add        %0          , %1        \n\t"
2437
-        "neg        %0                      \n\t"
2438
-        "1:                                 \n\t"
2439
-        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
2440
-        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
2441
-        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
2442
-        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
2443
-        "packssdw   %%mm1       , %%mm0     \n\t"
2444
-        "packssdw   %%mm3       , %%mm2     \n\t"
2445
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
2446
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
2447
-        "add        $16         , %0        \n\t"
2448
-        " js 1b                             \n\t"
2449
-        "femms                              \n\t"
2450
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2451
-    );
2452
-}
2453
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
2454
-    x86_reg reglen = len;
2455
-    __asm__ volatile(
2456
-        "add        %0          , %0        \n\t"
2457
-        "lea         (%2,%0,2)  , %2        \n\t"
2458
-        "add        %0          , %1        \n\t"
2459
-        "neg        %0                      \n\t"
2460
-        "1:                                 \n\t"
2461
-        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
2462
-        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
2463
-        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
2464
-        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
2465
-        "packssdw   %%mm1       , %%mm0     \n\t"
2466
-        "packssdw   %%mm3       , %%mm2     \n\t"
2467
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
2468
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
2469
-        "add        $16         , %0        \n\t"
2470
-        " js 1b                             \n\t"
2471
-        "emms                               \n\t"
2472
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2473
-    );
2474
-}
2475
-
2476
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
2477
-    x86_reg reglen = len;
2478
-    __asm__ volatile(
2479
-        "add        %0          , %0        \n\t"
2480
-        "lea         (%2,%0,2)  , %2        \n\t"
2481
-        "add        %0          , %1        \n\t"
2482
-        "neg        %0                      \n\t"
2483
-        "1:                                 \n\t"
2484
-        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
2485
-        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
2486
-        "packssdw   %%xmm1      , %%xmm0    \n\t"
2487
-        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
2488
-        "add        $16         , %0        \n\t"
2489
-        " js 1b                             \n\t"
2490
-        :"+r"(reglen), "+r"(dst), "+r"(src)
2491
-    );
2492
-}
2493
-
2494 2430
 void ff_vp3_idct_mmx(int16_t *input_data);
2495 2431
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2496 2432
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
... ...
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
2504 2504
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2505 2505
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2506 2506
 
2507
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
2508
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
2509
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
2510 2507
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2511 2508
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2512 2509
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
... ...
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
2516 2516
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2517 2517
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2518 2518
 
2519
-#if !HAVE_YASM
2520
-#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
2521
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
2522
-#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
2523
-#endif
2524
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
2525
-
2526
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
2527
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
2528
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
2529
-    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
2530
-    int i,j,c;\
2531
-    for(c=0; c<channels; c++){\
2532
-        float_to_int16_##cpu(tmp, src[c], len);\
2533
-        for(i=0, j=c; i<len; i++, j+=channels)\
2534
-            dst[j] = tmp[i];\
2535
-    }\
2536
-}\
2537
-\
2538
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
2539
-    if(channels==1)\
2540
-        float_to_int16_##cpu(dst, src[0], len);\
2541
-    else if(channels==2){\
2542
-        x86_reg reglen = len; \
2543
-        const float *src0 = src[0];\
2544
-        const float *src1 = src[1];\
2545
-        __asm__ volatile(\
2546
-            "shl $2, %0 \n"\
2547
-            "add %0, %1 \n"\
2548
-            "add %0, %2 \n"\
2549
-            "add %0, %3 \n"\
2550
-            "neg %0 \n"\
2551
-            body\
2552
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
2553
-        );\
2554
-    }else if(channels==6){\
2555
-        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
2556
-    }else\
2557
-        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
2558
-}
2559
-
2560
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
2561
-    "1:                         \n"
2562
-    "pf2id     (%2,%0), %%mm0   \n"
2563
-    "pf2id    8(%2,%0), %%mm1   \n"
2564
-    "pf2id     (%3,%0), %%mm2   \n"
2565
-    "pf2id    8(%3,%0), %%mm3   \n"
2566
-    "packssdw    %%mm1, %%mm0   \n"
2567
-    "packssdw    %%mm3, %%mm2   \n"
2568
-    "movq        %%mm0, %%mm1   \n"
2569
-    "punpcklwd   %%mm2, %%mm0   \n"
2570
-    "punpckhwd   %%mm2, %%mm1   \n"
2571
-    "movq        %%mm0,  (%1,%0)\n"
2572
-    "movq        %%mm1, 8(%1,%0)\n"
2573
-    "add $16, %0                \n"
2574
-    "js 1b                      \n"
2575
-    "femms                      \n"
2576
-)
2577
-
2578
-FLOAT_TO_INT16_INTERLEAVE(sse,
2579
-    "1:                         \n"
2580
-    "cvtps2pi  (%2,%0), %%mm0   \n"
2581
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
2582
-    "cvtps2pi  (%3,%0), %%mm2   \n"
2583
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
2584
-    "packssdw    %%mm1, %%mm0   \n"
2585
-    "packssdw    %%mm3, %%mm2   \n"
2586
-    "movq        %%mm0, %%mm1   \n"
2587
-    "punpcklwd   %%mm2, %%mm0   \n"
2588
-    "punpckhwd   %%mm2, %%mm1   \n"
2589
-    "movq        %%mm0,  (%1,%0)\n"
2590
-    "movq        %%mm1, 8(%1,%0)\n"
2591
-    "add $16, %0                \n"
2592
-    "js 1b                      \n"
2593
-    "emms                       \n"
2594
-)
2595
-
2596
-FLOAT_TO_INT16_INTERLEAVE(sse2,
2597
-    "1:                         \n"
2598
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
2599
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
2600
-    "packssdw   %%xmm1, %%xmm0  \n"
2601
-    "movhlps    %%xmm0, %%xmm1  \n"
2602
-    "punpcklwd  %%xmm1, %%xmm0  \n"
2603
-    "movdqa     %%xmm0, (%1,%0) \n"
2604
-    "add $16, %0                \n"
2605
-    "js 1b                      \n"
2606
-)
2607
-
2608
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
2609
-    if(channels==6)
2610
-        ff_float_to_int16_interleave6_3dn2(dst, src, len);
2611
-    else
2612
-        float_to_int16_interleave_3dnow(dst, src, len, channels);
2613
-}
2614
-
2615 2519
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2616 2520
 
2617 2521
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
... ...
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2968 2968
         if(mm_flags & AV_CPU_FLAG_3DNOW){
2969 2969
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2970 2970
             c->vector_fmul = vector_fmul_3dnow;
2971
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2972
-                c->float_to_int16 = float_to_int16_3dnow;
2973
-                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2974
-            }
2975 2971
         }
2976 2972
         if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2977 2973
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2978 2974
 #if HAVE_6REGS
2979 2975
             c->vector_fmul_window = vector_fmul_window_3dnow2;
2980 2976
 #endif
2981
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2982
-                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
2983
-            }
2984 2977
         }
2985 2978
         if(mm_flags & AV_CPU_FLAG_MMX2){
2986 2979
 #if HAVE_YASM
... ...
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2997 2997
 #if HAVE_6REGS
2998 2998
             c->vector_fmul_window = vector_fmul_window_sse;
2999 2999
 #endif
3000
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3001 3000
             c->vector_clipf = vector_clipf_sse;
3002
-            c->float_to_int16 = float_to_int16_sse;
3003
-            c->float_to_int16_interleave = float_to_int16_interleave_sse;
3004 3001
 #if HAVE_YASM
3005 3002
             c->scalarproduct_float = ff_scalarproduct_float_sse;
3006 3003
 #endif
... ...
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
3008 3008
         if(mm_flags & AV_CPU_FLAG_3DNOW)
3009 3009
             c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
3010 3010
         if(mm_flags & AV_CPU_FLAG_SSE2){
3011
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3012
-            c->float_to_int16 = float_to_int16_sse2;
3013
-            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3014 3011
 #if HAVE_YASM
3015 3012
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3016 3013
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
... ...
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30 30
 
31 31
 section .text align=16
32 32
 
33
-%macro PSWAPD_SSE 2
34
-    pshufw %1, %2, 0x4e
35
-%endmacro
36
-%macro PSWAPD_3DN1 2
37
-    movq  %1, %2
38
-    psrlq %1, 32
39
-    punpckldq %1, %2
40
-%endmacro
41
-
42
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
43
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
44
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
45
-%ifdef ARCH_X86_64
46
-    %define lend r10d
47
-    mov     lend, r2d
48
-%else
49
-    %define lend dword r2m
50
-%endif
51
-    mov src1q, [srcq+1*gprsize]
52
-    mov src2q, [srcq+2*gprsize]
53
-    mov src3q, [srcq+3*gprsize]
54
-    mov src4q, [srcq+4*gprsize]
55
-    mov src5q, [srcq+5*gprsize]
56
-    mov srcq,  [srcq]
57
-    sub src1q, srcq
58
-    sub src2q, srcq
59
-    sub src3q, srcq
60
-    sub src4q, srcq
61
-    sub src5q, srcq
62
-.loop:
63
-    cvtps2pi   mm0, [srcq]
64
-    cvtps2pi   mm1, [srcq+src1q]
65
-    cvtps2pi   mm2, [srcq+src2q]
66
-    cvtps2pi   mm3, [srcq+src3q]
67
-    cvtps2pi   mm4, [srcq+src4q]
68
-    cvtps2pi   mm5, [srcq+src5q]
69
-    packssdw   mm0, mm3
70
-    packssdw   mm1, mm4
71
-    packssdw   mm2, mm5
72
-    pswapd     mm3, mm0
73
-    punpcklwd  mm0, mm1
74
-    punpckhwd  mm1, mm2
75
-    punpcklwd  mm2, mm3
76
-    pswapd     mm3, mm0
77
-    punpckldq  mm0, mm2
78
-    punpckhdq  mm2, mm1
79
-    punpckldq  mm1, mm3
80
-    movq [dstq   ], mm0
81
-    movq [dstq+16], mm2
82
-    movq [dstq+ 8], mm1
83
-    add srcq, 8
84
-    add dstq, 24
85
-    sub lend, 2
86
-    jg .loop
87
-    emms
88
-    RET
89
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
90
-
91
-%define pswapd PSWAPD_SSE
92
-FLOAT_TO_INT16_INTERLEAVE6 sse
93
-%define cvtps2pi pf2id
94
-%define pswapd PSWAPD_3DN1
95
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
96
-%undef pswapd
97
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
98
-%undef cvtps2pi
99
-
100
-
101
-
102 33
 %macro SCALARPRODUCT 1
103 34
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
104 35
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
105 36
new file mode 100644
... ...
@@ -0,0 +1,91 @@
0
+;******************************************************************************
1
+;* x86 optimized Format Conversion Utils
2
+;* Copyright (c) 2008 Loren Merritt
3
+;*
4
+;* This file is part of FFmpeg.
5
+;*
6
+;* FFmpeg is free software; you can redistribute it and/or
7
+;* modify it under the terms of the GNU Lesser General Public
8
+;* License as published by the Free Software Foundation; either
9
+;* version 2.1 of the License, or (at your option) any later version.
10
+;*
11
+;* FFmpeg is distributed in the hope that it will be useful,
12
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+;* Lesser General Public License for more details.
15
+;*
16
+;* You should have received a copy of the GNU Lesser General Public
17
+;* License along with FFmpeg; if not, write to the Free Software
18
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+;******************************************************************************
20
+
21
+%include "x86inc.asm"
22
+
23
+section .text align=16
24
+
25
+%macro PSWAPD_SSE 2
26
+    pshufw %1, %2, 0x4e
27
+%endmacro
28
+%macro PSWAPD_3DN1 2
29
+    movq  %1, %2
30
+    psrlq %1, 32
31
+    punpckldq %1, %2
32
+%endmacro
33
+
34
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
35
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
36
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
37
+%ifdef ARCH_X86_64
38
+    %define lend r10d
39
+    mov     lend, r2d
40
+%else
41
+    %define lend dword r2m
42
+%endif
43
+    mov src1q, [srcq+1*gprsize]
44
+    mov src2q, [srcq+2*gprsize]
45
+    mov src3q, [srcq+3*gprsize]
46
+    mov src4q, [srcq+4*gprsize]
47
+    mov src5q, [srcq+5*gprsize]
48
+    mov srcq,  [srcq]
49
+    sub src1q, srcq
50
+    sub src2q, srcq
51
+    sub src3q, srcq
52
+    sub src4q, srcq
53
+    sub src5q, srcq
54
+.loop:
55
+    cvtps2pi   mm0, [srcq]
56
+    cvtps2pi   mm1, [srcq+src1q]
57
+    cvtps2pi   mm2, [srcq+src2q]
58
+    cvtps2pi   mm3, [srcq+src3q]
59
+    cvtps2pi   mm4, [srcq+src4q]
60
+    cvtps2pi   mm5, [srcq+src5q]
61
+    packssdw   mm0, mm3
62
+    packssdw   mm1, mm4
63
+    packssdw   mm2, mm5
64
+    pswapd     mm3, mm0
65
+    punpcklwd  mm0, mm1
66
+    punpckhwd  mm1, mm2
67
+    punpcklwd  mm2, mm3
68
+    pswapd     mm3, mm0
69
+    punpckldq  mm0, mm2
70
+    punpckhdq  mm2, mm1
71
+    punpckldq  mm1, mm3
72
+    movq [dstq   ], mm0
73
+    movq [dstq+16], mm2
74
+    movq [dstq+ 8], mm1
75
+    add srcq, 8
76
+    add dstq, 24
77
+    sub lend, 2
78
+    jg .loop
79
+    emms
80
+    RET
81
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
82
+
83
+%define pswapd PSWAPD_SSE
84
+FLOAT_TO_INT16_INTERLEAVE6 sse
85
+%define cvtps2pi pf2id
86
+%define pswapd PSWAPD_3DN1
87
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
88
+%undef pswapd
89
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
90
+%undef cvtps2pi
0 91
new file mode 100644
... ...
@@ -0,0 +1,266 @@
0
+/*
1
+ * Format Conversion Utils
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ *
21
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
22
+ */
23
+
24
+#include "libavutil/cpu.h"
25
+#include "libavutil/x86_cpu.h"
26
+#include "libavcodec/fmtconvert.h"
27
+
28
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
29
+{
30
+    x86_reg i = -4*len;
31
+    __asm__ volatile(
32
+        "movss  %3, %%xmm4 \n"
33
+        "shufps $0, %%xmm4, %%xmm4 \n"
34
+        "1: \n"
35
+        "cvtpi2ps   (%2,%0), %%xmm0 \n"
36
+        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
37
+        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
38
+        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
39
+        "movlhps  %%xmm1,    %%xmm0 \n"
40
+        "movlhps  %%xmm3,    %%xmm2 \n"
41
+        "mulps    %%xmm4,    %%xmm0 \n"
42
+        "mulps    %%xmm4,    %%xmm2 \n"
43
+        "movaps   %%xmm0,   (%1,%0) \n"
44
+        "movaps   %%xmm2, 16(%1,%0) \n"
45
+        "add $32, %0 \n"
46
+        "jl 1b \n"
47
+        :"+r"(i)
48
+        :"r"(dst+len), "r"(src+len), "m"(mul)
49
+    );
50
+}
51
+
52
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
53
+{
54
+    x86_reg i = -4*len;
55
+    __asm__ volatile(
56
+        "movss  %3, %%xmm4 \n"
57
+        "shufps $0, %%xmm4, %%xmm4 \n"
58
+        "1: \n"
59
+        "cvtdq2ps   (%2,%0), %%xmm0 \n"
60
+        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
61
+        "mulps    %%xmm4,    %%xmm0 \n"
62
+        "mulps    %%xmm4,    %%xmm1 \n"
63
+        "movaps   %%xmm0,   (%1,%0) \n"
64
+        "movaps   %%xmm1, 16(%1,%0) \n"
65
+        "add $32, %0 \n"
66
+        "jl 1b \n"
67
+        :"+r"(i)
68
+        :"r"(dst+len), "r"(src+len), "m"(mul)
69
+    );
70
+}
71
+
72
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
73
+    x86_reg reglen = len;
74
+    // not bit-exact: pf2id uses different rounding than C and SSE
75
+    __asm__ volatile(
76
+        "add        %0          , %0        \n\t"
77
+        "lea         (%2,%0,2)  , %2        \n\t"
78
+        "add        %0          , %1        \n\t"
79
+        "neg        %0                      \n\t"
80
+        "1:                                 \n\t"
81
+        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
82
+        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
83
+        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
84
+        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
85
+        "packssdw   %%mm1       , %%mm0     \n\t"
86
+        "packssdw   %%mm3       , %%mm2     \n\t"
87
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
88
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
89
+        "add        $16         , %0        \n\t"
90
+        " js 1b                             \n\t"
91
+        "femms                              \n\t"
92
+        :"+r"(reglen), "+r"(dst), "+r"(src)
93
+    );
94
+}
95
+
96
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
97
+    x86_reg reglen = len;
98
+    __asm__ volatile(
99
+        "add        %0          , %0        \n\t"
100
+        "lea         (%2,%0,2)  , %2        \n\t"
101
+        "add        %0          , %1        \n\t"
102
+        "neg        %0                      \n\t"
103
+        "1:                                 \n\t"
104
+        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
105
+        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
106
+        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
107
+        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
108
+        "packssdw   %%mm1       , %%mm0     \n\t"
109
+        "packssdw   %%mm3       , %%mm2     \n\t"
110
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
111
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
112
+        "add        $16         , %0        \n\t"
113
+        " js 1b                             \n\t"
114
+        "emms                               \n\t"
115
+        :"+r"(reglen), "+r"(dst), "+r"(src)
116
+    );
117
+}
118
+
119
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
120
+    x86_reg reglen = len;
121
+    __asm__ volatile(
122
+        "add        %0          , %0        \n\t"
123
+        "lea         (%2,%0,2)  , %2        \n\t"
124
+        "add        %0          , %1        \n\t"
125
+        "neg        %0                      \n\t"
126
+        "1:                                 \n\t"
127
+        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
128
+        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
129
+        "packssdw   %%xmm1      , %%xmm0    \n\t"
130
+        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
131
+        "add        $16         , %0        \n\t"
132
+        " js 1b                             \n\t"
133
+        :"+r"(reglen), "+r"(dst), "+r"(src)
134
+    );
135
+}
136
+
137
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
138
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
139
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
140
+
141
+#if !HAVE_YASM
142
+#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
143
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
144
+#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
145
+#endif
146
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
147
+
148
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
149
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
150
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
151
+    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
152
+    int i,j,c;\
153
+    for(c=0; c<channels; c++){\
154
+        float_to_int16_##cpu(tmp, src[c], len);\
155
+        for(i=0, j=c; i<len; i++, j+=channels)\
156
+            dst[j] = tmp[i];\
157
+    }\
158
+}\
159
+\
160
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
161
+    if(channels==1)\
162
+        float_to_int16_##cpu(dst, src[0], len);\
163
+    else if(channels==2){\
164
+        x86_reg reglen = len; \
165
+        const float *src0 = src[0];\
166
+        const float *src1 = src[1];\
167
+        __asm__ volatile(\
168
+            "shl $2, %0 \n"\
169
+            "add %0, %1 \n"\
170
+            "add %0, %2 \n"\
171
+            "add %0, %3 \n"\
172
+            "neg %0 \n"\
173
+            body\
174
+            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
175
+        );\
176
+    }else if(channels==6){\
177
+        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
178
+    }else\
179
+        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
180
+}
181
+
182
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
183
+    "1:                         \n"
184
+    "pf2id     (%2,%0), %%mm0   \n"
185
+    "pf2id    8(%2,%0), %%mm1   \n"
186
+    "pf2id     (%3,%0), %%mm2   \n"
187
+    "pf2id    8(%3,%0), %%mm3   \n"
188
+    "packssdw    %%mm1, %%mm0   \n"
189
+    "packssdw    %%mm3, %%mm2   \n"
190
+    "movq        %%mm0, %%mm1   \n"
191
+    "punpcklwd   %%mm2, %%mm0   \n"
192
+    "punpckhwd   %%mm2, %%mm1   \n"
193
+    "movq        %%mm0,  (%1,%0)\n"
194
+    "movq        %%mm1, 8(%1,%0)\n"
195
+    "add $16, %0                \n"
196
+    "js 1b                      \n"
197
+    "femms                      \n"
198
+)
199
+
200
+FLOAT_TO_INT16_INTERLEAVE(sse,
201
+    "1:                         \n"
202
+    "cvtps2pi  (%2,%0), %%mm0   \n"
203
+    "cvtps2pi 8(%2,%0), %%mm1   \n"
204
+    "cvtps2pi  (%3,%0), %%mm2   \n"
205
+    "cvtps2pi 8(%3,%0), %%mm3   \n"
206
+    "packssdw    %%mm1, %%mm0   \n"
207
+    "packssdw    %%mm3, %%mm2   \n"
208
+    "movq        %%mm0, %%mm1   \n"
209
+    "punpcklwd   %%mm2, %%mm0   \n"
210
+    "punpckhwd   %%mm2, %%mm1   \n"
211
+    "movq        %%mm0,  (%1,%0)\n"
212
+    "movq        %%mm1, 8(%1,%0)\n"
213
+    "add $16, %0                \n"
214
+    "js 1b                      \n"
215
+    "emms                       \n"
216
+)
217
+
218
+FLOAT_TO_INT16_INTERLEAVE(sse2,
219
+    "1:                         \n"
220
+    "cvtps2dq  (%2,%0), %%xmm0  \n"
221
+    "cvtps2dq  (%3,%0), %%xmm1  \n"
222
+    "packssdw   %%xmm1, %%xmm0  \n"
223
+    "movhlps    %%xmm0, %%xmm1  \n"
224
+    "punpcklwd  %%xmm1, %%xmm0  \n"
225
+    "movdqa     %%xmm0, (%1,%0) \n"
226
+    "add $16, %0                \n"
227
+    "js 1b                      \n"
228
+)
229
+
230
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
231
+    if(channels==6)
232
+        ff_float_to_int16_interleave6_3dn2(dst, src, len);
233
+    else
234
+        float_to_int16_interleave_3dnow(dst, src, len, channels);
235
+}
236
+
237
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
238
+{
239
+    int mm_flags = av_get_cpu_flags();
240
+
241
+    if (mm_flags & AV_CPU_FLAG_MMX) {
242
+
243
+        if(mm_flags & AV_CPU_FLAG_3DNOW){
244
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
245
+                c->float_to_int16 = float_to_int16_3dnow;
246
+                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
247
+            }
248
+        }
249
+        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
250
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
251
+                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
252
+            }
253
+        }
254
+        if(mm_flags & AV_CPU_FLAG_SSE){
255
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
256
+            c->float_to_int16 = float_to_int16_sse;
257
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
258
+        }
259
+        if(mm_flags & AV_CPU_FLAG_SSE2){
260
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
261
+            c->float_to_int16 = float_to_int16_sse2;
262
+            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
263
+        }
264
+    }
265
+}
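Of the paths installed above, only the 3DNow ones are guarded by CODEC_FLAG_BITEXACT (the in-source comment notes that pf2id rounds differently from the C and SSE code), so the SSE/SSE2 conversions are expected to match the C reference bit for bit under the default rounding mode. A caller who wants to sanity-check whichever path was selected could do so roughly as in the sketch below; check_float_to_int16() is a hypothetical helper written for this illustration, not part of the patch.

#include <math.h>
#include <stdint.h>
#include <string.h>
#include "libavutil/mem.h"          /* DECLARE_ALIGNED */
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"

/* Compare the dispatched float_to_int16() against the scalar rule
 * (lrintf + av_clip_int16) on one aligned block of 8 samples.
 * Returns 1 on a bit-exact match; a mismatch is expected only for the
 * 3DNow path when CODEC_FLAG_BITEXACT is not set. */
static int check_float_to_int16(FmtConvertContext *c)
{
    DECLARE_ALIGNED(16, float,   src)[8] = { -1.25f, -0.5f, 0.25f, 0.75f,
                                             1000.5f, -40000.0f, 40000.0f, 0.0f };
    DECLARE_ALIGNED(16, int16_t, dst)[8];
    DECLARE_ALIGNED(16, int16_t, ref)[8];
    int i;

    for (i = 0; i < 8; i++)
        ref[i] = av_clip_int16(lrintf(src[i]));

    c->float_to_int16(dst, src, 8);

    return !memcmp(dst, ref, sizeof(ref));
}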