Browse code

ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder.

Justin Ruggles authored on 2011/03/16 11:29:04
Showing 6 changed files
... ...
@@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
85 85
     } while (len > 0);
86 86
 }
87 87
 
88
-av_cold void ff_ac3dsp_init(AC3DSPContext *c)
88
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
89
+{
90
+    const float scale = 1 << 24;
91
+    do {
92
+        *dst++ = lrintf(*src++ * scale);
93
+        *dst++ = lrintf(*src++ * scale);
94
+        *dst++ = lrintf(*src++ * scale);
95
+        *dst++ = lrintf(*src++ * scale);
96
+        *dst++ = lrintf(*src++ * scale);
97
+        *dst++ = lrintf(*src++ * scale);
98
+        *dst++ = lrintf(*src++ * scale);
99
+        *dst++ = lrintf(*src++ * scale);
100
+        len -= 8;
101
+    } while (len > 0);
102
+}
103
+
104
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
89 105
 {
90 106
     c->ac3_exponent_min = ac3_exponent_min_c;
91 107
     c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
92 108
     c->ac3_lshift_int16 = ac3_lshift_int16_c;
93 109
     c->ac3_rshift_int32 = ac3_rshift_int32_c;
110
+    c->float_to_fixed24 = float_to_fixed24_c;
94 111
 
95 112
     if (HAVE_MMX)
96
-        ff_ac3dsp_init_x86(c);
113
+        ff_ac3dsp_init_x86(c, bit_exact);
97 114
 }
... ...
@@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
68 68
      *               constraints: range [0,31]
69 69
      */
70 70
     void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
71
+
72
+    /**
73
+     * Convert an array of float in range [-1.0,1.0] to int32_t with range
74
+     * [-(1<<24),(1<<24)]
75
+     *
76
+     * @param dst destination array of int32_t.
77
+     *            constraints: 16-byte aligned
78
+     * @param src source array of float.
79
+     *            constraints: 16-byte aligned
80
+     * @param len number of elements to convert.
81
+     *            constraints: multiple of 32 greater than zero
82
+     */
83
+    void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
71 84
 } AC3DSPContext;
72 85
 
73
-void ff_ac3dsp_init    (AC3DSPContext *c);
74
-void ff_ac3dsp_init_x86(AC3DSPContext *c);
86
+void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
87
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
75 88
 
76 89
 #endif /* AVCODEC_AC3DSP_H */
... ...
@@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
1843 1843
     avctx->coded_frame= avcodec_alloc_frame();
1844 1844
 
1845 1845
     dsputil_init(&s->dsp, avctx);
1846
-    ff_ac3dsp_init(&s->ac3dsp);
1846
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
1847 1847
 
1848 1848
     return 0;
1849 1849
 init_fail:
... ...
@@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s)
103 103
  */
104 104
 static void scale_coefficients(AC3EncodeContext *s)
105 105
 {
106
-    int i;
107
-    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
108
-        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
106
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
107
+                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
109 108
 }
110 109
 
111 110
 
... ...
@@ -22,6 +22,11 @@
22 22
 %include "x86inc.asm"
23 23
 %include "x86util.asm"
24 24
 
25
+SECTION_RODATA
26
+
27
+; 16777216.0f - used in ff_float_to_fixed24()
28
+pf_1_24: times 4 dd 0x4B800000
29
+
25 30
 SECTION .text
26 31
 
27 32
 ;-----------------------------------------------------------------------------
... ...
@@ -178,3 +183,113 @@ INIT_MMX
178 178
 AC3_SHIFT r, 32, psrad, mmx
179 179
 INIT_XMM
180 180
 AC3_SHIFT r, 32, psrad, sse2
181
+
182
+;-----------------------------------------------------------------------------
183
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
184
+;-----------------------------------------------------------------------------
185
+
186
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
;
; Each loop iteration converts 8 floats (32 bytes): multiply by 2^24, convert
; to int32 and store. The loop exits once the length counter reaches zero.
;
; The MMX registers alias the x87 register stack, so femms is issued before
; returning; otherwise x87 floating-point code run afterwards by the caller
; would see a corrupted FPU state.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq   m0, [pf_1_24]        ; m0 = two copies of 16777216.0f
.loop:
    movq   m1, [srcq   ]        ; load 8 floats, two per register
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0               ; scale by 2^24
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1               ; float -> int32 (truncating)
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    femms                       ; leave MMX state before returning to C code
    RET
213
+
214
; SSE version: the multiply is done in XMM registers, but the float -> int32
; conversion uses cvtps2pi, which only converts the low two floats and writes
; the result to an MMX register. movhlps brings the high two floats down for
; the second conversion. Each loop iteration converts 8 floats (32 bytes).
;
; Because cvtps2pi switches the FPU into MMX state, emms is issued before
; returning so that x87 floating-point code in the caller keeps working.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]    ; m0 = four copies of 16777216.0f
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0           ; scale by 2^24
    mulps      m2, m0
    cvtps2pi  mm0, m1           ; convert floats 0-1
    movhlps    m1, m1           ; move floats 2-3 into the low half
    cvtps2pi  mm1, m1           ; convert floats 2-3
    cvtps2pi  mm2, m2           ; convert floats 4-5
    movhlps    m2, m2
    cvtps2pi  mm3, m2           ; convert floats 6-7
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    emms                        ; leave MMX state before returning to C code
    RET
237
+
238
; SSE2 version: multiply by 2^24 and convert with cvtps2dq, entirely in XMM
; registers, so no emms is needed.
;
; When m8 is defined (more than 8 XMM registers available) each iteration
; converts 32 floats (128 bytes); otherwise it converts 16 floats (64 bytes).
;
; len is an unsigned int, so the counter is updated through lend. Only the low
; 32 bits of the argument register are guaranteed to hold the value; using the
; full-width register could pick up stale upper bits on 64-bit targets.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]    ; m0 = four copies of 16777216.0f
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0           ; scale by 2^24
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1           ; float -> int32
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET
... ...
@@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
38 38
 extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
39 39
 extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
40 40
 
41
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
41
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
42
+extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
43
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
44
+
45
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
42 46
 {
43 47
     int mm_flags = av_get_cpu_flags();
44 48
 
... ...
@@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
49 49
         c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
50 50
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
51 51
     }
52
+    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
53
+        if (!bit_exact) {
54
+            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
55
+        }
56
+    }
52 57
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
53 58
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
54 59
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
55 60
     }
61
+    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
62
+        c->float_to_fixed24 = ff_float_to_fixed24_sse;
63
+    }
56 64
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
57 65
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
58 66
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
67
+        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
59 68
         if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
60 69
             c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
61 70
             c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;