Browse code

twinvq: add SSE/AVX optimized sum/difference stereo interleaving

Justin Ruggles authored on 2011/10/30 14:13:55
Showing 5 changed files
... ...
@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
2509 2509
     }
2510 2510
 }
2511 2511
 
2512
/**
 * Portable C implementation of the interleaving sum/difference butterfly.
 *
 * For every index i in [0, len) this writes
 *     dst[2*i]     = src0[i] + src1[i]
 *     dst[2*i + 1] = src0[i] - src1[i]
 * so each sum is stored immediately before its matching difference.
 *
 * This C version places no alignment or length-multiple requirements on its
 * arguments; a len of zero or less writes nothing.
 *
 * @param dst  output vector, must have room for 2 * len floats
 * @param src0 first input vector (len floats)
 * @param src1 second input vector (len floats)
 * @param len  number of elements in each input vector
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int i;

    for (i = 0; i < len; i++) {
        /* Read both inputs before storing so each pair is computed from
         * the original values. */
        const float a = src0[i];
        const float b = src1[i];

        dst[2 * i]     = a + b;
        dst[2 * i + 1] = a - b;
    }
}
2523
+
2512 2524
 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2513 2525
 {
2514 2526
     float p = 0.0;
... ...
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3036 3036
     c->vector_clip_int32 = vector_clip_int32_c;
3037 3037
     c->scalarproduct_float = scalarproduct_float_c;
3038 3038
     c->butterflies_float = butterflies_float_c;
3039
+    c->butterflies_float_interleave = butterflies_float_interleave_c;
3039 3040
     c->vector_fmul_scalar = vector_fmul_scalar_c;
3040 3041
     c->vector_fmac_scalar = vector_fmac_scalar_c;
3041 3042
 
... ...
@@ -453,6 +453,23 @@ typedef struct DSPContext {
453 453
      */
454 454
     void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
455 455
 
456
+    /**
457
+     * Calculate the sum and difference of two vectors of floats and interleave
458
+     * results into a separate output vector of floats, with each sum
459
+     * positioned before the corresponding difference.
460
+     *
461
+     * @param dst  output vector
462
+     *             constraints: 16-byte aligned
463
+     * @param src0 first input vector
464
+     *             constraints: 32-byte aligned
465
+     * @param src1 second input vector
466
+     *             constraints: 32-byte aligned
467
+     * @param len  number of elements in the input
468
+     *             constraints: multiple of 8
469
+     */
470
+    void (*butterflies_float_interleave)(float *dst, const float *src0,
471
+                                         const float *src1, int len);
472
+
456 473
     /* (I)DCT */
457 474
     void (*fdct)(DCTELEM *block/* align 16*/);
458 475
     void (*fdct248)(DCTELEM *block/* align 16*/);
... ...
@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
665 665
                          float *out)
666 666
 {
667 667
     const ModeTab *mtab = tctx->mtab;
668
+    int size1, size2;
668 669
     float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
669
-    int i, j;
670
+    int i;
670 671
 
671 672
     for (i = 0; i < tctx->avctx->channels; i++) {
672 673
         imdct_and_window(tctx, ftype, wtype,
... ...
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
675 675
                          i);
676 676
     }
677 677
 
678
+    size2 = tctx->last_block_pos[0];
679
+    size1 = mtab->size - size2;
678 680
     if (tctx->avctx->channels == 2) {
679
-        for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
680
-            float f1 = prev_buf[               i];
681
-            float f2 = prev_buf[2*mtab->size + i];
682
-            out[2*i    ] = f1 + f2;
683
-            out[2*i + 1] = f1 - f2;
684
-        }
685
-        for (j = 0; i < mtab->size; j++,i++) {
686
-            float f1 = tctx->curr_frame[               j];
687
-            float f2 = tctx->curr_frame[2*mtab->size + j];
688
-            out[2*i    ] = f1 + f2;
689
-            out[2*i + 1] = f1 - f2;
690
-        }
681
+        tctx->dsp.butterflies_float_interleave(out, prev_buf,
682
+                                               &prev_buf[2*mtab->size],
683
+                                               size1);
684
+
685
+        out += 2 * size1;
686
+
687
+        tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
688
+                                               &tctx->curr_frame[2*mtab->size],
689
+                                               size2);
691 690
     } else {
692
-        memcpy(out, prev_buf,
693
-               (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
691
+        memcpy(out, prev_buf, size1 * sizeof(*out));
694 692
 
695
-        out +=  mtab->size - tctx->last_block_pos[0];
693
+        out += size1;
696 694
 
697
-        memcpy(out, tctx->curr_frame,
698
-               (tctx->last_block_pos[0]) * sizeof(*out));
695
+        memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
699 696
     }
700 697
 
701 698
 }
... ...
@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
2424 2424
 void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
2425 2425
                                    int32_t max, unsigned int len);
2426 2426
 
2427
+extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2428
+                                                const float *src1, int len);
2429
+extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2430
+                                                const float *src1, int len);
2431
+
2427 2432
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2428 2433
 {
2429 2434
     int mm_flags = av_get_cpu_flags();
... ...
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2868 2868
             c->vector_clipf = vector_clipf_sse;
2869 2869
 #if HAVE_YASM
2870 2870
             c->scalarproduct_float = ff_scalarproduct_float_sse;
2871
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2871 2872
 #endif
2872 2873
         }
2873 2874
         if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
... ...
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2925 2925
                 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
2926 2926
                 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
2927 2927
             }
2928
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2928 2929
         }
2929 2930
 #endif
2930 2931
     }
... ...
@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
1129 1129
 %else
1130 1130
 VECTOR_CLIP_INT32 6, 1, 0, 0
1131 1131
 %endif
1132
+
1133
+;-----------------------------------------------------------------------------
1134
+; void ff_butterflies_float_interleave(float *dst, const float *src0,
1135
+;                                      const float *src1, int len);
1136
+;-----------------------------------------------------------------------------
1137
+
1138
+%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
+; Emits one butterflies_float_interleave function for the currently selected
+; register width (mmsize). Each loop iteration consumes mmsize bytes from
+; src0 and src1 and produces 2*mmsize bytes in dst, so a byte length that is
+; not a multiple of mmsize makes the final iteration run past the buffers.
1139
+cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
1140
+%ifdef ARCH_X86_64
1141
+    movsxd    lenq, lend                 ; sign-extend the 32-bit int len to 64 bits
1142
+%endif
1143
+    test      lenq, lenq
1144
+    jz .end                              ; len == 0: nothing to do
1145
+    shl       lenq, 2                    ; element count -> byte count (4 bytes per float)
1146
+    lea      src0q, [src0q +   lenq]     ; point src0 at the end of its input
1147
+    lea      src1q, [src1q +   lenq]     ; point src1 at the end of its input
1148
+    lea       dstq, [ dstq + 2*lenq]     ; dst holds two floats per input pair
1149
+    neg       lenq                       ; negative byte offset, counts up towards zero
1150
+.loop:
1151
+    mova        m0, [src0q + lenq]       ; aligned load from src0
1152
+    mova        m1, [src1q + lenq]       ; aligned load from src1
1153
+    subps       m2, m0, m1               ; m2 = src0 - src1 (differences)
1154
+    addps       m0, m0, m1               ; m0 = src0 + src1 (sums)
1155
+    unpcklps    m1, m0, m2               ; m1 = sum/diff pairs from the low elements
1156
+    unpckhps    m0, m0, m2               ; m0 = sum/diff pairs from the high elements
1157
+%if cpuflag(avx)
+; With ymm registers the unpack instructions interleave within each 128-bit
+; lane, so the four 128-bit pieces are stored in the order
+; m1 lane 0, m0 lane 0, m1 lane 1, m0 lane 1 to keep the output sequential.
1158
+    vextractf128 [dstq + 2*lenq     ], m1, 0
1159
+    vextractf128 [dstq + 2*lenq + 16], m0, 0
1160
+    vextractf128 [dstq + 2*lenq + 32], m1, 1
1161
+    vextractf128 [dstq + 2*lenq + 48], m0, 1
1162
+%else
1163
+    mova [dstq + 2*lenq         ], m1    ; aligned store of the first mmsize output bytes
1164
+    mova [dstq + 2*lenq + mmsize], m0    ; aligned store of the next mmsize output bytes
1165
+%endif
1166
+    add       lenq, mmsize               ; advance by one vector of input
1167
+    jl .loop                             ; keep going while the offset is still negative
1168
+%if mmsize == 32
1169
+    vzeroupper                           ; clear the upper ymm halves before returning
1170
+    RET
1171
+%endif
1172
+.end:
1173
+    REP_RET
1174
+%endmacro
1175
+
+; Instantiate the macro once with xmm registers (sse) and once with ymm
+; registers (avx).
1176
+INIT_XMM sse
1177
+BUTTERFLIES_FLOAT_INTERLEAVE
1178
+INIT_YMM avx
1179
+BUTTERFLIES_FLOAT_INTERLEAVE