Browse code

ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder.

Justin Ruggles authored on 2011/03/12 06:45:01
Showing 5 changed files
... ...
@@ -50,10 +50,47 @@ static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
50 50
     return v;
51 51
 }
52 52
 
53
/**
 * Left-shift each value in an array of int16_t by a specified amount.
 *
 * Each element is shifted on its own and truncated to 16 bits; bits shifted
 * out of an element are discarded and never reach its neighbour. The array
 * is only ever accessed as int16_t, so no alignment beyond that of int16_t
 * and no particular length multiple is needed by this C version.
 *
 * @param src    array to shift in place
 * @param len    number of values in the array (0 leaves the array untouched)
 * @param shift  left shift amount, expected to be in the range [0,15]
 */
static void ac3_lshift_int16_c(int16_t *src, unsigned int len,
                               unsigned int shift)
{
    unsigned int i;

    for (i = 0; i < len; i++) {
        /* Shift in unsigned 32-bit arithmetic: negative samples and bits
         * leaving the 16-bit value would be undefined behaviour in signed
         * arithmetic. The narrowing casts keep only the low 16 bits. */
        uint32_t widened = (uint32_t)(uint16_t)src[i] << shift;

        src[i] = (int16_t)(uint16_t)widened;
    }
}
71
+
72
/**
 * Right-shift each value in an array of int32_t by a specified amount.
 *
 * The loop is bounded by len, so a length of 0 or one that is not a multiple
 * of 8 never reads or writes outside the array.
 *
 * @param src    array to shift in place
 * @param len    number of values in the array (0 leaves the array untouched)
 * @param shift  right shift amount, expected to be in the range [0,31]
 */
static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
                               unsigned int shift)
{
    unsigned int i;

    for (i = 0; i < len; i++) {
        /* >> on a negative int32_t is implementation-defined in C.
         * NOTE(review): callers presumably rely on the usual arithmetic
         * (sign-preserving) shift, matching psrad in the x86 versions. */
        src[i] >>= shift;
    }
}
87
+
53 88
 av_cold void ff_ac3dsp_init(AC3DSPContext *c)
54 89
 {
55 90
     c->ac3_exponent_min = ac3_exponent_min_c;
56 91
     c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
92
+    c->ac3_lshift_int16 = ac3_lshift_int16_c;
93
+    c->ac3_rshift_int32 = ac3_rshift_int32_c;
57 94
 
58 95
     if (HAVE_MMX)
59 96
         ff_ac3dsp_init_x86(c);
... ...
@@ -46,6 +46,28 @@ typedef struct AC3DSPContext {
46 46
      * @return    a value with the same MSB as max(abs(src[]))
47 47
      */
48 48
     int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
49
+
50
+    /**
51
+     * Left-shift each value in an array of int16_t by a specified amount.
52
+     * @param src    input array
53
+     *               constraints: align 16
54
+     * @param len    number of values in the array
55
+     *               constraints: multiple of 32 greater than 0
56
+     * @param shift  left shift amount
57
+     *               constraints: range [0,15]
58
+     */
59
+    void (*ac3_lshift_int16)(int16_t *src, unsigned int len, unsigned int shift);
60
+
61
+    /**
62
+     * Right-shift each value in an array of int32_t by a specified amount.
63
+     * @param src    input array
64
+     *               constraints: align 16
65
+     * @param len    number of values in the array
66
+     *               constraints: multiple of 16 greater than 0
67
+     * @param shift  right shift amount
68
+     *               constraints: range [0,31]
69
+     */
70
+    void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
49 71
 } AC3DSPContext;
50 72
 
51 73
 void ff_ac3dsp_init    (AC3DSPContext *c);
... ...
@@ -278,40 +278,6 @@ static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
278 278
 
279 279
 
280 280
 /**
281
- * Left-shift each value in an array by a specified amount.
282
- * @param tab    input array
283
- * @param n      number of values in the array
284
- * @param lshift left shift amount
285
- */
286
-static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
287
-{
288
-    int i;
289
-
290
-    if (lshift > 0) {
291
-        for (i = 0; i < n; i++)
292
-            tab[i] <<= lshift;
293
-    }
294
-}
295
-
296
-
297
-/**
298
- * Right-shift each value in an array of int32_t by a specified amount.
299
- * @param src    input array
300
- * @param len    number of values in the array
301
- * @param shift  right shift amount
302
- */
303
-static void ac3_rshift_int32_c(int32_t *src, unsigned int len, unsigned int shift)
304
-{
305
-    int i;
306
-
307
-    if (shift > 0) {
308
-        for (i = 0; i < len; i++)
309
-            src[i] >>= shift;
310
-    }
311
-}
312
-
313
-
314
-/**
315 281
  * Normalize the input samples to use the maximum available precision.
316 282
  * This assumes signed 16-bit input samples.
317 283
  *
... ...
@@ -320,7 +286,8 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len, unsigned int shif
320 320
 static int normalize_samples(AC3EncodeContext *s)
321 321
 {
322 322
     int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
323
-    lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
323
+    if (v > 0)
324
+        s->ac3dsp.ac3_lshift_int16(s->windowed_samples, AC3_WINDOW_SIZE, v);
324 325
     /* +6 to right-shift from 31-bit to 25-bit */
325 326
     return v + 6;
326 327
 }
... ...
@@ -336,8 +303,8 @@ static void scale_coefficients(AC3EncodeContext *s)
336 336
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
337 337
         AC3Block *block = &s->blocks[blk];
338 338
         for (ch = 0; ch < s->channels; ch++) {
339
-            ac3_rshift_int32_c(block->mdct_coef[ch], AC3_MAX_COEFS,
340
-                               block->coeff_shift[ch]);
339
+            s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
340
+                                       block->coeff_shift[ch]);
341 341
         }
342 342
     }
343 343
 }
... ...
@@ -133,3 +133,48 @@ INIT_XMM
133 133
 AC3_MAX_MSB_ABS_INT16 sse2, min_max
134 134
 %define ABS2 ABS2_SSSE3
135 135
 AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
136
+
137
+;-----------------------------------------------------------------------------
138
+; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
139
+;-----------------------------------------------------------------------------
140
+
141
+%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set -- %1 and %4 only form the symbol name, %2 is the element width in bits, %3 is applied to every loaded register
142
+cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift ; names the operands src/len/shift; the body below uses m0-m4
143
+    movd      m0, shiftd ; shift count goes into m0, the count operand of every %3 below
144
+.loop: ; each pass shifts 4 registers (mmsize*4 bytes) in place
145
+    mova      m1, [srcq         ] ; load four consecutive registers' worth of elements
146
+    mova      m2, [srcq+mmsize  ]
147
+    mova      m3, [srcq+mmsize*2]
148
+    mova      m4, [srcq+mmsize*3]
149
+    %3        m1, m0 ; shift the elements of each register by the count in m0
150
+    %3        m2, m0
151
+    %3        m3, m0
152
+    %3        m4, m0
153
+    mova  [srcq         ], m1 ; write the shifted data back over the input
154
+    mova  [srcq+mmsize  ], m2
155
+    mova  [srcq+mmsize*2], m3
156
+    mova  [srcq+mmsize*3], m4
157
+    add     srcq, mmsize*4 ; advance past the bytes just processed
158
+    sub     lend, mmsize*32/%2 ; elements consumed: mmsize*4 bytes / (%2/8 bytes per element)
159
+    ja .loop ; unsigned test: loop again only if len is still above zero (no borrow, not zero)
160
+.end: ; this label is not referenced anywhere inside the macro
161
+    REP_RET
162
+%endmacro
163
+
164
+;-----------------------------------------------------------------------------
165
+; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
166
+;-----------------------------------------------------------------------------
167
+
168
+INIT_MMX
169
+AC3_SHIFT l, 16, psllw, mmx
170
+INIT_XMM
171
+AC3_SHIFT l, 16, psllw, sse2
172
+
173
+;-----------------------------------------------------------------------------
174
+; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
175
+;-----------------------------------------------------------------------------
176
+
177
+INIT_MMX
178
+AC3_SHIFT r, 32, psrad, mmx
179
+INIT_XMM
180
+AC3_SHIFT r, 32, psrad, sse2
... ...
@@ -32,6 +32,12 @@ extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
32 32
 extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
33 33
 extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
34 34
 
35
+extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
36
+extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
37
+
38
+extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
39
+extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
40
+
35 41
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
36 42
 {
37 43
     int mm_flags = av_get_cpu_flags();
... ...
@@ -40,6 +46,8 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
40 40
     if (mm_flags & AV_CPU_FLAG_MMX) {
41 41
         c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
42 42
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
43
+        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
44
+        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
43 45
     }
44 46
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
45 47
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
... ...
@@ -48,6 +56,10 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
48 48
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
49 49
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
50 50
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
51
+        if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
52
+            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
53
+            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
54
+        }
51 55
     }
52 56
     if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
53 57
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;