| ... | ... |
@@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len, |
| 85 | 85 |
} while (len > 0); |
| 86 | 86 |
} |
| 87 | 87 |
|
| 88 |
-av_cold void ff_ac3dsp_init(AC3DSPContext *c) |
|
| 88 |
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len) |
|
| 89 |
+{
|
|
| 90 |
+ const float scale = 1 << 24; |
|
| 91 |
+ do {
|
|
| 92 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 93 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 94 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 95 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 96 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 97 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 98 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 99 |
+ *dst++ = lrintf(*src++ * scale); |
|
| 100 |
+ len -= 8; |
|
| 101 |
+ } while (len > 0); |
|
| 102 |
+} |
|
| 103 |
+ |
|
| 104 |
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact) |
|
| 89 | 105 |
{
|
| 90 | 106 |
c->ac3_exponent_min = ac3_exponent_min_c; |
| 91 | 107 |
c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c; |
| 92 | 108 |
c->ac3_lshift_int16 = ac3_lshift_int16_c; |
| 93 | 109 |
c->ac3_rshift_int32 = ac3_rshift_int32_c; |
| 110 |
+ c->float_to_fixed24 = float_to_fixed24_c; |
|
| 94 | 111 |
|
| 95 | 112 |
if (HAVE_MMX) |
| 96 |
- ff_ac3dsp_init_x86(c); |
|
| 113 |
+ ff_ac3dsp_init_x86(c, bit_exact); |
|
| 97 | 114 |
} |
| ... | ... |
@@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
|
| 68 | 68 |
* constraints: range [0,31] |
| 69 | 69 |
*/ |
| 70 | 70 |
void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift); |
| 71 |
+ |
|
| 72 |
+ /** |
|
| 73 |
+ * Convert an array of float in range [-1.0,1.0] to int32_t with range |
|
| 74 |
+ * [-(1<<24),(1<<24)] |
|
| 75 |
+ * |
|
| 76 |
+ * @param dst destination array of int32_t. |
|
| 77 |
+ * constraints: 16-byte aligned |
|
| 78 |
+ * @param src source array of float. |
|
| 79 |
+ * constraints: 16-byte aligned |
|
| 80 |
+ * @param len number of elements to convert. |
|
| 81 |
+ * constraints: multiple of 32 greater than zero |
|
| 82 |
+ */ |
|
| 83 |
+ void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len); |
|
| 71 | 84 |
} AC3DSPContext; |
| 72 | 85 |
|
| 73 |
-void ff_ac3dsp_init (AC3DSPContext *c); |
|
| 74 |
-void ff_ac3dsp_init_x86(AC3DSPContext *c); |
|
| 86 |
+void ff_ac3dsp_init (AC3DSPContext *c, int bit_exact); |
|
| 87 |
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact); |
|
| 75 | 88 |
|
| 76 | 89 |
#endif /* AVCODEC_AC3DSP_H */ |
| ... | ... |
@@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx) |
| 1843 | 1843 |
avctx->coded_frame= avcodec_alloc_frame(); |
| 1844 | 1844 |
|
| 1845 | 1845 |
dsputil_init(&s->dsp, avctx); |
| 1846 |
- ff_ac3dsp_init(&s->ac3dsp); |
|
| 1846 |
+ ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT); |
|
| 1847 | 1847 |
|
| 1848 | 1848 |
return 0; |
| 1849 | 1849 |
init_fail: |
| ... | ... |
@@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s) |
| 103 | 103 |
*/ |
| 104 | 104 |
static void scale_coefficients(AC3EncodeContext *s)
{
    /* Element count handed to the DSP routine in a single call. */
    const unsigned int coef_count = AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels;

    /* Convert the whole MDCT coefficient buffer to 24-bit fixed-point. */
    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
                               coef_count);
}
| 110 | 109 |
|
| 111 | 110 |
|
| ... | ... |
@@ -22,6 +22,11 @@ |
| 22 | 22 |
%include "x86inc.asm" |
| 23 | 23 |
%include "x86util.asm" |
| 24 | 24 |
|
| 25 |
+SECTION_RODATA |
|
| 26 |
+ |
|
| 27 |
+; 16777216.0f - used in ff_float_to_fixed24() |
|
| 28 |
+pf_1_24: times 4 dd 0x4B800000 |
|
| 29 |
+ |
|
| 25 | 30 |
SECTION .text |
| 26 | 31 |
|
| 27 | 32 |
;----------------------------------------------------------------------------- |
| ... | ... |
@@ -178,3 +183,113 @@ INIT_MMX |
| 178 | 178 |
AC3_SHIFT r, 32, psrad, mmx |
| 179 | 179 |
INIT_XMM |
| 180 | 180 |
AC3_SHIFT r, 32, psrad, sse2 |
| 181 |
+ |
|
| 182 |
+;----------------------------------------------------------------------------- |
|
| 183 |
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len) |
|
| 184 |
+;----------------------------------------------------------------------------- |
|
| 185 |
+ |
|
| 186 |
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
; Converts 8 floats (32 bytes) per iteration; the loop body runs once before
; len is tested.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq   m0, [pf_1_24]        ; scale factor 2^24 in both lanes
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    ; The MMX registers alias the x87 register stack. Leave MMX state before
    ; returning so floating-point code in the caller does not run with a
    ; corrupted FPU tag word.
    femms
    RET
|
| 213 |
+ |
|
| 214 |
; SSE version: the multiply is done in XMM registers, but the conversion uses
; cvtps2pi, which writes its two results to an MMX register.
; Converts 8 floats (32 bytes) per iteration; the loop body runs once before
; len is tested.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]    ; scale factor 2^24 in all four lanes
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1           ; low two floats of m1
    movhlps    m1, m1           ; bring the high two floats down
    cvtps2pi  mm1, m1
    cvtps2pi  mm2, m2           ; low two floats of m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    ; mm0-mm3 were written above and the MMX registers alias the x87 register
    ; stack. Leave MMX state before returning so floating-point code in the
    ; caller does not run with a corrupted FPU tag word.
    emms
    RET
|
| 237 |
+ |
|
| 238 |
; SSE2 version: cvtps2dq converts four floats at a time entirely in XMM
; registers, so no MMX state is touched.
; When m8 is defined (presumably only on builds with 16 XMM registers; confirm
; against x86inc.asm) 32 floats are converted per iteration, otherwise 16.
; The loop body runs once before len is tested.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]    ; scale factor 2^24 in all four lanes
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit unsigned int, so the counter uses the dword view of the
    ; register (lend), as the 3DNow! and SSE versions do, instead of relying on
    ; the upper half of the 64-bit register being zero.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET
| ... | ... |
@@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in |
| 38 | 38 |
extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); |
| 39 | 39 |
extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); |
| 40 | 40 |
|
| 41 |
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) |
|
| 41 |
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); |
|
| 42 |
+extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); |
|
| 43 |
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); |
|
| 44 |
+ |
|
| 45 |
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
|
| 42 | 46 |
{
|
| 43 | 47 |
int mm_flags = av_get_cpu_flags(); |
| 44 | 48 |
|
| ... | ... |
@@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) |
| 49 | 49 |
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; |
| 50 | 50 |
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; |
| 51 | 51 |
} |
| 52 |
+ if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
|
|
| 53 |
+ if (!bit_exact) {
|
|
| 54 |
+ c->float_to_fixed24 = ff_float_to_fixed24_3dnow; |
|
| 55 |
+ } |
|
| 56 |
+ } |
|
| 52 | 57 |
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
|
| 53 | 58 |
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; |
| 54 | 59 |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; |
| 55 | 60 |
} |
| 61 |
+ if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
|
|
| 62 |
+ c->float_to_fixed24 = ff_float_to_fixed24_sse; |
|
| 63 |
+ } |
|
| 56 | 64 |
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
|
| 57 | 65 |
c->ac3_exponent_min = ff_ac3_exponent_min_sse2; |
| 58 | 66 |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; |
| 67 |
+ c->float_to_fixed24 = ff_float_to_fixed24_sse2; |
|
| 59 | 68 |
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
|
| 60 | 69 |
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; |
| 61 | 70 |
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; |