GitList

Browse code

twinvq: add SSE/AVX optimized sum/difference stereo interleaving

Justin Ruggles authored on 2011/10/30 14:13:55
Showing 5 changed files

libavcodec/dsputil.c index 182063c..9123857 100644
libavcodec/dsputil.h index acb2041..98b7b1e 100644
libavcodec/twinvq.c index 73eb7c1..a285156 100644
libavcodec/x86/dsputil_mmx.c index dd6cbf5..f0de05a 100644
libavcodec/x86/dsputil_yasm.asm index 8e3cbdc..f2894cd 100644

@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
+                         }
+                     }
                     +static void butterflies_float_interleave_c(float *dst, const float *src0,
                     +                                           const float *src1, int len)
                     +{   /* Scalar version: fills dst[0 .. 2*len-1] from src0[0 .. len-1] and src1[0 .. len-1]. */
                     +    int i;
                     +    for (i = 0; i < len; i++) {   /* body never runs when len <= 0 */
                     +        float f1 = src0[i];       /* both inputs are read before either store */
                     +        float f2 = src1[i];
                     +        dst[2*i    ] = f1 + f2;   /* even slot: sum */
                     +        dst[2*i + 1] = f1 - f2;   /* odd slot: difference */
                     +    }
                     +}
+                    +
                      static float scalarproduct_float_c(const float *v1, const float *v2, int len)
+                     {
                          float p = 0.0;
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
                          c->vector_clip_int32 = vector_clip_int32_c;
                          c->scalarproduct_float = scalarproduct_float_c;
                          c->butterflies_float = butterflies_float_c;
                     +    c->butterflies_float_interleave = butterflies_float_interleave_c;
                          c->vector_fmul_scalar = vector_fmul_scalar_c;
                          c->vector_fmac_scalar = vector_fmac_scalar_c;

libavcodec/dsputil.h

History View file @ 9d06037

@@ -453,6 +453,23 @@ typedef struct DSPContext {
                           */
                          void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
                     +    /**
                     +     * Calculate the sum and difference of two vectors of floats and interleave
                     +     * results into a separate output vector of floats, with each sum
                     +     * positioned before the corresponding difference.
                     +     *
                     +     * @param dst  output vector
                     +     *             constraints: 16-byte aligned
                     +     * @param src0 first input vector
                     +     *             constraints: 32-byte aligned
                     +     * @param src1 second input vector
                     +     *             constraints: 32-byte aligned
                     +     * @param len  number of elements in the input
                     +     *             constraints: multiple of 8
                     +     */
                     +    void (*butterflies_float_interleave)(float *dst, const float *src0,
                     +                                         const float *src1, int len);
+                    +
                          /* (I)DCT */
                          void (*fdct)(DCTELEM *block/* align 16*/);
                          void (*fdct248)(DCTELEM *block/* align 16*/);

libavcodec/twinvq.c

History View file @ 9d06037

@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                                               float *out)
+                     {
                          const ModeTab *mtab = tctx->mtab;
                     +    int size1, size2;
                          float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
                     -    int i, j;
                     +    int i;
                          for (i = 0; i < tctx->avctx->channels; i++) {
                              imdct_and_window(tctx, ftype, wtype,
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                                               i);
+                         }
                     +    size2 = tctx->last_block_pos[0];
                     +    size1 = mtab->size - size2;
                          if (tctx->avctx->channels == 2) {
                     -        for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
                     -            float f1 = prev_buf[               i];
                     -            float f2 = prev_buf[2*mtab->size + i];
                     -            out[2*i    ] = f1 + f2;
                     -            out[2*i + 1] = f1 - f2;
                     -        }
                     -        for (j = 0; i < mtab->size; j++,i++) {
                     -            float f1 = tctx->curr_frame[               j];
                     -            float f2 = tctx->curr_frame[2*mtab->size + j];
                     -            out[2*i    ] = f1 + f2;
                     -            out[2*i + 1] = f1 - f2;
                     -        }
                     +        tctx->dsp.butterflies_float_interleave(out, prev_buf,
                     +                                               &prev_buf[2*mtab->size],
                     +                                               size1);
+                    +
                     +        out += 2 * size1;
+                    +
                     +        tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
                     +                                               &tctx->curr_frame[2*mtab->size],
                     +                                               size2);
                          } else {
                     -        memcpy(out, prev_buf,
                     -               (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
                     +        memcpy(out, prev_buf, size1 * sizeof(*out));
                     -        out +=  mtab->size - tctx->last_block_pos[0];
                     +        out += size1;
                     -        memcpy(out, tctx->curr_frame,
                     -               (tctx->last_block_pos[0]) * sizeof(*out));
                     +        memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
+                         }
+                     }

libavcodec/x86/dsputil_mmx.c

History View file @ 9d06037

@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
                      void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                                         int32_t max, unsigned int len);
                     +extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                     +                                                const float *src1, int len); /* defined outside this file; NOTE(review): presumably the INIT_XMM sse build of BUTTERFLIES_FLOAT_INTERLEAVE in dsputil_yasm.asm - confirm */
                     +extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                     +                                                const float *src1, int len); /* same signature; NOTE(review): presumably the INIT_YMM avx build - confirm */
+                    +
                      void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+                     {
                          int mm_flags = av_get_cpu_flags();
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                                  c->vector_clipf = vector_clipf_sse;
                      #if HAVE_YASM
                                  c->scalarproduct_float = ff_scalarproduct_float_sse;
                     +            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
                      #endif
+                             }
                              if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                                      c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
                                      c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
+                                 }
                     +            c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
+                             }
                      #endif
+                         }

libavcodec/x86/dsputil_yasm.asm

History View file @ 9d06037

@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
                      %else
                      VECTOR_CLIP_INT32 6, 1, 0, 0
                      %endif
+                    +
                     +;-----------------------------------------------------------------------------
                     +; void ff_butterflies_float_interleave(float *dst, const float *src0,
                     +;                                      const float *src1, int len);
                     +;-----------------------------------------------------------------------------
+                    +
                     +%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
                     +cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len ; body uses m0-m2; NOTE(review): "4,4,3" presumably = args, GPRs, vector regs per x86inc - confirm
                     +%ifdef ARCH_X86_64
                     +    movsxd    lenq, lend                 ; sign-extend the 32-bit int len to the full 64-bit register
                     +%endif
                     +    test      lenq, lenq
                     +    jz .end                              ; len == 0: nothing to do
                     +    shl       lenq, 2                    ; lenq *= 4: element count -> byte count (4-byte floats)
                     +    lea      src0q, [src0q +   lenq]     ; point src0 just past its last input element
                     +    lea      src1q, [src1q +   lenq]     ; same for src1
                     +    lea       dstq, [ dstq + 2*lenq]     ; output is twice as long, so advance dst by 2*lenq bytes
                     +    neg       lenq                       ; lenq is now a negative byte offset that counts up to 0
                     +.loop:
                     +    mova        m0, [src0q + lenq]       ; m0 = next mmsize bytes of src0
                     +    mova        m1, [src1q + lenq]       ; m1 = next mmsize bytes of src1
                     +    subps       m2, m0, m1               ; m2 = src0 - src1 (differences)
                     +    addps       m0, m0, m1               ; m0 = src0 + src1 (sums)
                     +    unpcklps    m1, m0, m2               ; m1 = (sum, difference) pairs built from the low elements
                     +    unpckhps    m0, m0, m2               ; m0 = (sum, difference) pairs built from the high elements
                     +%if cpuflag(avx)
                     +    vextractf128 [dstq + 2*lenq     ], m1, 0 ; ymm unpack works per 128-bit lane, so output order is: m1 low half,
                     +    vextractf128 [dstq + 2*lenq + 16], m0, 0 ; m0 low half,
                     +    vextractf128 [dstq + 2*lenq + 32], m1, 1 ; m1 high half,
                     +    vextractf128 [dstq + 2*lenq + 48], m0, 1 ; m0 high half
                     +%else
                     +    mova [dstq + 2*lenq         ], m1    ; first mmsize bytes of interleaved output
                     +    mova [dstq + 2*lenq + mmsize], m0    ; following mmsize bytes
                     +%endif
                     +    add       lenq, mmsize               ; consumed one vector (mmsize bytes) from each input
                     +    jl .loop                             ; keep going while the offset is still negative
                     +%if mmsize == 32
                     +    vzeroupper                           ; ymm build only: zero the upper halves of the ymm registers before returning
                     +    RET
                     +%endif
                     +.end:
                     +    REP_RET                              ; reached from the jz above, and by fall-through when mmsize != 32
                     +%endmacro
+                    +
                     +INIT_XMM sse                             ; expand the macro once for the SSE build
                     +BUTTERFLIES_FLOAT_INTERLEAVE
                     +INIT_YMM avx                             ; and once for the AVX build (takes the mmsize == 32 / cpuflag(avx) paths above)
                     +BUTTERFLIES_FLOAT_INTERLEAVE