Browse code

ac3enc: Add x86-optimized function to speed up log2_tab().

AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute
value of each element in an array of int16_t.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>

Justin Ruggles authored on 2011/02/14 04:49:50
Showing 5 changed files
... ...
@@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
42 42
     }
43 43
 }
44 44
 
45
+static int ac3_max_msb_abs_int16_c(const int16_t *src, int len) /* C reference version: ORs abs() of src[0..len-1] */
46
+{
47
+    int i, v = 0; /* v accumulates the OR; it stays 0 if the loop never runs */
48
+    for (i = 0; i < len; i++)
49
+        v |= abs(src[i]); /* OR keeps every set bit, so v's top set bit is the highest one seen in any |src[i]| */
50
+    return v; /* not necessarily the true maximum, only a value sharing its most significant bit */
51
+}
52
+
45 53
 av_cold void ff_ac3dsp_init(AC3DSPContext *c)
46 54
 {
47 55
     c->ac3_exponent_min = ac3_exponent_min_c;
56
+    c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
48 57
 
49 58
     if (HAVE_MMX)
50 59
         ff_ac3dsp_init_x86(c);
... ...
@@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
35 35
      * @param nb_coefs  number of frequency coefficients.
36 36
      */
37 37
     void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
38
+
39
+    /**
40
+     * Calculate a value whose most significant set bit matches that of the
41
+     * largest absolute value in an array of int16_t.
42
+     * @param src input array
43
+     *            constraints: align 16. values must be in range [-32767,32767]
44
+     * @param len number of values in the array
45
+     *            constraints: multiple of 16 greater than 0
46
+     * @return    a value with the same MSB as max(abs(src[]))
47
+     */
48
+    int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
38 49
 } AC3DSPContext;
39 50
 
40 51
 void ff_ac3dsp_init    (AC3DSPContext *c);
... ...
@@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
270 270
  * @param n   number of values in the array
271 271
  * @return    log2(max(abs(tab[])))
272 272
  */
273
-static int log2_tab(int16_t *tab, int n)
273
+static int log2_tab(AC3EncodeContext *s, int16_t *src, int len) /* takes the encoder context so the scan can go through s->ac3dsp; NOTE(review): the retained doc comment still names the old parameters (tab[], n) */
274 274
 {
275
-    int i, v;
276
-
277
-    v = 0;
278
-    for (i = 0; i < n; i++)
279
-        v |= abs(tab[i]);
280
-
275
+    int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len); /* call through the AC3DSPContext function pointer in place of the inline OR loop shown as removed */
281 276
     return av_log2(v); /* av_log2() applied to the OR-ed magnitudes */
282 277
 }
283 278
 
... ...
@@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
308 308
  */
309 309
 static int normalize_samples(AC3EncodeContext *s)
310 310
 {
311
-    int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
311
+    int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE); /* s is now passed first so log2_tab() can reach s->ac3dsp */
312 312
     lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v); /* v is handed to lshift_tab() as its lshift argument */
313 313
     return v - 9; /* NOTE(review): the meaning of the -9 offset is not visible in this hunk */
314 314
 }
... ...
@@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
65 65
 %endif
66 66
 %undef PMINUB
67 67
 %undef LOOP_ALIGN
68
+
69
+;-----------------------------------------------------------------------------
70
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
71
+;
72
+; This function uses 2 different methods to calculate a valid result.
73
+; 1) logical 'or' of abs of each element
74
+;        This is used for ssse3 because of the pabsw instruction.
75
+;        It is also used for mmx because of the lack of min/max instructions.
76
+; 2) calculate min/max for the array, then or(abs(min),abs(max))
77
+;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
78
+;-----------------------------------------------------------------------------
79
+
80
+%macro AC3_MAX_MSB_ABS_INT16 2
81
+cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
82
+    pxor        m2, m2 ; m2 = 0: running per-lane minimum (min_max) or running OR (or_abs)
83
+    pxor        m3, m3 ; m3 = 0: running per-lane maximum for min_max (the mmx or_abs path instead hands m3 to ABS2)
84
+.loop:
85
+%ifidn %2, min_max
86
+    mova        m0, [srcq]
87
+    mova        m1, [srcq+mmsize]
88
+    pminsw      m2, m0 ; signed per-lane minimum over both loads
89
+    pminsw      m2, m1
90
+    pmaxsw      m3, m0 ; signed per-lane maximum over both loads
91
+    pmaxsw      m3, m1
92
+%else ; or_abs
93
+%ifidn %1, mmx
94
+    mova        m0, [srcq]
95
+    mova        m1, [srcq+mmsize]
96
+    ABS2        m0, m1, m3, m4
97
+%else ; ssse3
98
+    ; using memory args is faster for ssse3
99
+    pabsw       m0, [srcq]
100
+    pabsw       m1, [srcq+mmsize]
101
+%endif
102
+    por         m2, m0 ; fold both registers of absolute values into the accumulator
103
+    por         m2, m1
104
+%endif
105
+    add       srcq, mmsize*2 ; two registers (2*mmsize bytes) are consumed per iteration
106
+    sub       lend, mmsize ; 2*mmsize bytes hold mmsize int16 values
107
+    ja .loop ; unsigned "above": keep looping while the subtraction left a nonzero count without borrow
108
+%ifidn %2, min_max
109
+    ABS2        m2, m3, m0, m1 ; absolute values of the per-lane min and max, with m0/m1 as the extra ABS2 operands
110
+    por         m2, m3 ; or(abs(min), abs(max)) per lane, i.e. method 2 of the header comment
111
+%endif
112
+%ifidn mmsize, 16
113
+    mova        m0, m2
114
+    punpckhqdq  m0, m0 ; xmm only: copy the high 64 bits down
115
+    por         m2, m0 ; low 64 bits now hold the OR of both halves
116
+%endif
117
+    PSHUFLW     m0, m2, 0xe ; bring words 2 and 3 down to positions 0 and 1
118
+    por         m2, m0
119
+    PSHUFLW     m0, m2, 0x1 ; bring word 1 down to position 0
120
+    por         m2, m0 ; word 0 now holds the OR of all lanes
121
+    movd       eax, m2
122
+    and        eax, 0xFFFF ; keep only the low 16-bit lane as the return value
123
+    RET
124
+%endmacro
125
+
126
+INIT_MMX
127
+%define ABS2 ABS2_MMX
128
+%define PSHUFLW pshufw
129
+AC3_MAX_MSB_ABS_INT16 mmx, or_abs ; NOTE(review): this build expands PSHUFLW to pshufw, which I believe was introduced with SSE/MMXEXT rather than baseline MMX - confirm it is safe on MMX-only CPUs
130
+%define ABS2 ABS2_MMX2
131
+AC3_MAX_MSB_ABS_INT16 mmxext, min_max ; still under INIT_MMX, now using pminsw/pmaxsw
132
+INIT_XMM
133
+%define PSHUFLW pshuflw
134
+AC3_MAX_MSB_ABS_INT16 sse2, min_max ; reuses the ABS2_MMX2 definition from above for the final ABS2
135
+%define ABS2 ABS2_SSSE3
136
+AC3_MAX_MSB_ABS_INT16 ssse3, or_abs ; this path uses pabsw directly inside the loop
... ...
@@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int n
27 27
 extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
28 28
 extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
29 29
 
30
+extern int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len); /* one prototype per instruction-set variant; presumably the symbols emitted by the AC3_MAX_MSB_ABS_INT16 asm macro */
31
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
32
+extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
33
+extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
34
+
30 35
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
31 36
 {
32 37
     int mm_flags = av_get_cpu_flags();
... ...
@@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
34 34
 #if HAVE_YASM
35 35
     if (mm_flags & AV_CPU_FLAG_MMX) {
36 36
         c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
37
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
37 38
     }
38 39
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
39 40
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
41
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
40 42
     }
41 43
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
42 44
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
45
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
46
+    }
47
+    if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
48
+        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
43 49
     }
44 50
 #endif
45 51
 }