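Moving the format conversion functions (float_to_int16,
float_to_int16_interleave, int32_to_float_fmul_scalar) out of DSPContext
into a separate FmtConvertContext will be beneficial for use with the
audio conversion API without requiring it to depend on all of dsputil.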
Signed-off-by: Mans Rullgard <mans@mansr.com>
(cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)
| ... | ... |
@@ -35,6 +35,7 @@ |
| 35 | 35 |
#include "fft.h" |
| 36 | 36 |
#include "mpeg4audio.h" |
| 37 | 37 |
#include "sbr.h" |
| 38 |
+#include "fmtconvert.h" |
|
| 38 | 39 |
|
| 39 | 40 |
#include <stdint.h> |
| 40 | 41 |
|
| ... | ... |
@@ -268,6 +269,7 @@ typedef struct {
|
| 268 | 268 |
FFTContext mdct; |
| 269 | 269 |
FFTContext mdct_small; |
| 270 | 270 |
DSPContext dsp; |
| 271 |
+ FmtConvertContext fmt_conv; |
|
| 271 | 272 |
int random_state; |
| 272 | 273 |
/** @} */ |
| 273 | 274 |
|
| ... | ... |
@@ -85,6 +85,7 @@ |
| 85 | 85 |
#include "get_bits.h" |
| 86 | 86 |
#include "dsputil.h" |
| 87 | 87 |
#include "fft.h" |
| 88 |
+#include "fmtconvert.h" |
|
| 88 | 89 |
#include "lpc.h" |
| 89 | 90 |
|
| 90 | 91 |
#include "aac.h" |
| ... | ... |
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx) |
| 562 | 562 |
ff_aac_sbr_init(); |
| 563 | 563 |
|
| 564 | 564 |
dsputil_init(&ac->dsp, avctx); |
| 565 |
+ ff_fmt_convert_init(&ac->fmt_conv, avctx); |
|
| 565 | 566 |
|
| 566 | 567 |
ac->random_state = 0x1f2e3d4c; |
| 567 | 568 |
|
| ... | ... |
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data, |
| 2032 | 2032 |
*data_size = data_size_tmp; |
| 2033 | 2033 |
|
| 2034 | 2034 |
if (samples) |
| 2035 |
- ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); |
|
| 2035 |
+ ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels); |
|
| 2036 | 2036 |
|
| 2037 | 2037 |
if (ac->output_configured) |
| 2038 | 2038 |
ac->output_configured = OC_LOCKED; |
| ... | ... |
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx) |
| 193 | 193 |
ff_mdct_init(&s->imdct_512, 9, 1, 1.0); |
| 194 | 194 |
ff_kbd_window_init(s->window, 5.0, 256); |
| 195 | 195 |
dsputil_init(&s->dsp, avctx); |
| 196 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
| 196 | 197 |
av_lfg_init(&s->dith_state, 0); |
| 197 | 198 |
|
| 198 | 199 |
/* set scale value for float to int16 conversion */ |
| ... | ... |
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) |
| 1255 | 1255 |
} else {
|
| 1256 | 1256 |
gain *= s->dynamic_range[0]; |
| 1257 | 1257 |
} |
| 1258 |
- s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); |
|
| 1258 |
+ s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); |
|
| 1259 | 1259 |
} |
| 1260 | 1260 |
|
| 1261 | 1261 |
/* apply spectral extension to high frequency bins */ |
| ... | ... |
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, |
| 1407 | 1407 |
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n"); |
| 1408 | 1408 |
err = 1; |
| 1409 | 1409 |
} |
| 1410 |
- s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels); |
|
| 1410 |
+ s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels); |
|
| 1411 | 1411 |
out_samples += 256 * s->out_channels; |
| 1412 | 1412 |
} |
| 1413 | 1413 |
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t); |
| ... | ... |
@@ -55,6 +55,7 @@ |
| 55 | 55 |
#include "get_bits.h" |
| 56 | 56 |
#include "dsputil.h" |
| 57 | 57 |
#include "fft.h" |
| 58 |
+#include "fmtconvert.h" |
|
| 58 | 59 |
|
| 59 | 60 |
/* override ac3.h to include coupling channel */ |
| 60 | 61 |
#undef AC3_MAX_CHANNELS |
| ... | ... |
@@ -190,6 +191,7 @@ typedef struct {
|
| 190 | 190 |
|
| 191 | 191 |
///@defgroup opt optimization |
| 192 | 192 |
DSPContext dsp; ///< for optimization |
| 193 |
+ FmtConvertContext fmt_conv; ///< optimized conversion functions |
|
| 193 | 194 |
float mul_bias; ///< scaling for float_to_int16 conversion |
| 194 | 195 |
///@} |
| 195 | 196 |
|
| ... | ... |
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o |
| 9 | 9 |
OBJS += arm/dsputil_init_arm.o \ |
| 10 | 10 |
arm/dsputil_arm.o \ |
| 11 | 11 |
arm/fft_init_arm.o \ |
| 12 |
+ arm/fmtconvert_init_arm.o \ |
|
| 12 | 13 |
arm/jrevdct_arm.o \ |
| 13 | 14 |
arm/mpegvideo_arm.o \ |
| 14 | 15 |
arm/simple_idct_arm.o \ |
| ... | ... |
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ |
| 22 | 22 |
arm/dsputil_armv6.o \ |
| 23 | 23 |
arm/simple_idct_armv6.o \ |
| 24 | 24 |
|
| 25 |
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ |
|
| 26 |
+ |
|
| 25 | 27 |
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ |
| 26 | 28 |
arm/dsputil_init_vfp.o \ |
| 29 |
+ $(VFP-OBJS-yes) |
|
| 27 | 30 |
|
| 28 | 31 |
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ |
| 29 | 32 |
arm/mpegvideo_iwmmxt.o \ |
| ... | ... |
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ |
| 52 | 52 |
|
| 53 | 53 |
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ |
| 54 | 54 |
arm/dsputil_neon.o \ |
| 55 |
+ arm/fmtconvert_neon.o \ |
|
| 55 | 56 |
arm/int_neon.o \ |
| 56 | 57 |
arm/mpegvideo_neon.o \ |
| 57 | 58 |
arm/simple_idct_neon.o \ |
| ... | ... |
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, |
| 153 | 153 |
int len); |
| 154 | 154 |
void ff_butterflies_float_neon(float *v1, float *v2, int len); |
| 155 | 155 |
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); |
| 156 |
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, |
|
| 157 |
- float mul, int len); |
|
| 158 | 156 |
void ff_vector_fmul_reverse_neon(float *dst, const float *src0, |
| 159 | 157 |
const float *src1, int len); |
| 160 | 158 |
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, |
| ... | ... |
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, |
| 162 | 162 |
|
| 163 | 163 |
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, |
| 164 | 164 |
int len); |
| 165 |
-void ff_float_to_int16_neon(int16_t *, const float *, long); |
|
| 166 |
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); |
|
| 167 | 165 |
|
| 168 | 166 |
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); |
| 169 | 167 |
|
| ... | ... |
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) |
| 308 | 308 |
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; |
| 309 | 309 |
c->butterflies_float = ff_butterflies_float_neon; |
| 310 | 310 |
c->scalarproduct_float = ff_scalarproduct_float_neon; |
| 311 |
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; |
|
| 312 | 311 |
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; |
| 313 | 312 |
c->vector_fmul_add = ff_vector_fmul_add_neon; |
| 314 | 313 |
c->vector_clipf = ff_vector_clipf_neon; |
| ... | ... |
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) |
| 319 | 319 |
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; |
| 320 | 320 |
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; |
| 321 | 321 |
|
| 322 |
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
|
| 323 |
- c->float_to_int16 = ff_float_to_int16_neon; |
|
| 324 |
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; |
|
| 325 |
- } |
|
| 326 |
- |
|
| 327 | 322 |
if (CONFIG_VORBIS_DECODER) |
| 328 | 323 |
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; |
| 329 | 324 |
|
| ... | ... |
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, |
| 25 | 25 |
const float *src1, int len); |
| 26 | 26 |
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, |
| 27 | 27 |
const float *src1, int len); |
| 28 |
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
|
| 29 | 28 |
|
| 30 | 29 |
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) |
| 31 | 30 |
{
|
| 32 | 31 |
c->vector_fmul = ff_vector_fmul_vfp; |
| 33 | 32 |
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; |
| 34 |
-#if HAVE_ARMV6 |
|
| 35 |
- c->float_to_int16 = ff_float_to_int16_vfp; |
|
| 36 |
-#endif |
|
| 37 | 33 |
} |
| ... | ... |
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 |
| 400 | 400 |
bx lr |
| 401 | 401 |
endfunc |
| 402 | 402 |
|
| 403 |
-function ff_float_to_int16_neon, export=1 |
|
| 404 |
- subs r2, r2, #8 |
|
| 405 |
- vld1.64 {d0-d1}, [r1,:128]!
|
|
| 406 |
- vcvt.s32.f32 q8, q0, #16 |
|
| 407 |
- vld1.64 {d2-d3}, [r1,:128]!
|
|
| 408 |
- vcvt.s32.f32 q9, q1, #16 |
|
| 409 |
- beq 3f |
|
| 410 |
- bics ip, r2, #15 |
|
| 411 |
- beq 2f |
|
| 412 |
-1: subs ip, ip, #16 |
|
| 413 |
- vshrn.s32 d4, q8, #16 |
|
| 414 |
- vld1.64 {d0-d1}, [r1,:128]!
|
|
| 415 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 416 |
- vshrn.s32 d5, q9, #16 |
|
| 417 |
- vld1.64 {d2-d3}, [r1,:128]!
|
|
| 418 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 419 |
- vshrn.s32 d6, q0, #16 |
|
| 420 |
- vst1.64 {d4-d5}, [r0,:128]!
|
|
| 421 |
- vshrn.s32 d7, q1, #16 |
|
| 422 |
- vld1.64 {d16-d17},[r1,:128]!
|
|
| 423 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 424 |
- vld1.64 {d18-d19},[r1,:128]!
|
|
| 425 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 426 |
- vst1.64 {d6-d7}, [r0,:128]!
|
|
| 427 |
- bne 1b |
|
| 428 |
- ands r2, r2, #15 |
|
| 429 |
- beq 3f |
|
| 430 |
-2: vld1.64 {d0-d1}, [r1,:128]!
|
|
| 431 |
- vshrn.s32 d4, q8, #16 |
|
| 432 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 433 |
- vld1.64 {d2-d3}, [r1,:128]!
|
|
| 434 |
- vshrn.s32 d5, q9, #16 |
|
| 435 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 436 |
- vshrn.s32 d6, q0, #16 |
|
| 437 |
- vst1.64 {d4-d5}, [r0,:128]!
|
|
| 438 |
- vshrn.s32 d7, q1, #16 |
|
| 439 |
- vst1.64 {d6-d7}, [r0,:128]!
|
|
| 440 |
- bx lr |
|
| 441 |
-3: vshrn.s32 d4, q8, #16 |
|
| 442 |
- vshrn.s32 d5, q9, #16 |
|
| 443 |
- vst1.64 {d4-d5}, [r0,:128]!
|
|
| 444 |
- bx lr |
|
| 445 |
-endfunc |
|
| 446 |
- |
|
| 447 |
-function ff_float_to_int16_interleave_neon, export=1 |
|
| 448 |
- cmp r3, #2 |
|
| 449 |
- ldrlt r1, [r1] |
|
| 450 |
- blt ff_float_to_int16_neon |
|
| 451 |
- bne 4f |
|
| 452 |
- |
|
| 453 |
- ldr r3, [r1] |
|
| 454 |
- ldr r1, [r1, #4] |
|
| 455 |
- |
|
| 456 |
- subs r2, r2, #8 |
|
| 457 |
- vld1.64 {d0-d1}, [r3,:128]!
|
|
| 458 |
- vcvt.s32.f32 q8, q0, #16 |
|
| 459 |
- vld1.64 {d2-d3}, [r3,:128]!
|
|
| 460 |
- vcvt.s32.f32 q9, q1, #16 |
|
| 461 |
- vld1.64 {d20-d21},[r1,:128]!
|
|
| 462 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 463 |
- vld1.64 {d22-d23},[r1,:128]!
|
|
| 464 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 465 |
- beq 3f |
|
| 466 |
- bics ip, r2, #15 |
|
| 467 |
- beq 2f |
|
| 468 |
-1: subs ip, ip, #16 |
|
| 469 |
- vld1.64 {d0-d1}, [r3,:128]!
|
|
| 470 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 471 |
- vsri.32 q10, q8, #16 |
|
| 472 |
- vld1.64 {d2-d3}, [r3,:128]!
|
|
| 473 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 474 |
- vld1.64 {d24-d25},[r1,:128]!
|
|
| 475 |
- vcvt.s32.f32 q12, q12, #16 |
|
| 476 |
- vld1.64 {d26-d27},[r1,:128]!
|
|
| 477 |
- vsri.32 q11, q9, #16 |
|
| 478 |
- vst1.64 {d20-d21},[r0,:128]!
|
|
| 479 |
- vcvt.s32.f32 q13, q13, #16 |
|
| 480 |
- vst1.64 {d22-d23},[r0,:128]!
|
|
| 481 |
- vsri.32 q12, q0, #16 |
|
| 482 |
- vld1.64 {d16-d17},[r3,:128]!
|
|
| 483 |
- vsri.32 q13, q1, #16 |
|
| 484 |
- vst1.64 {d24-d25},[r0,:128]!
|
|
| 485 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 486 |
- vld1.64 {d18-d19},[r3,:128]!
|
|
| 487 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 488 |
- vld1.64 {d20-d21},[r1,:128]!
|
|
| 489 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 490 |
- vld1.64 {d22-d23},[r1,:128]!
|
|
| 491 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 492 |
- vst1.64 {d26-d27},[r0,:128]!
|
|
| 493 |
- bne 1b |
|
| 494 |
- ands r2, r2, #15 |
|
| 495 |
- beq 3f |
|
| 496 |
-2: vsri.32 q10, q8, #16 |
|
| 497 |
- vld1.64 {d0-d1}, [r3,:128]!
|
|
| 498 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 499 |
- vld1.64 {d2-d3}, [r3,:128]!
|
|
| 500 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 501 |
- vld1.64 {d24-d25},[r1,:128]!
|
|
| 502 |
- vcvt.s32.f32 q12, q12, #16 |
|
| 503 |
- vsri.32 q11, q9, #16 |
|
| 504 |
- vld1.64 {d26-d27},[r1,:128]!
|
|
| 505 |
- vcvt.s32.f32 q13, q13, #16 |
|
| 506 |
- vst1.64 {d20-d21},[r0,:128]!
|
|
| 507 |
- vsri.32 q12, q0, #16 |
|
| 508 |
- vst1.64 {d22-d23},[r0,:128]!
|
|
| 509 |
- vsri.32 q13, q1, #16 |
|
| 510 |
- vst1.64 {d24-d27},[r0,:128]!
|
|
| 511 |
- bx lr |
|
| 512 |
-3: vsri.32 q10, q8, #16 |
|
| 513 |
- vsri.32 q11, q9, #16 |
|
| 514 |
- vst1.64 {d20-d23},[r0,:128]!
|
|
| 515 |
- bx lr |
|
| 516 |
- |
|
| 517 |
-4: push {r4-r8,lr}
|
|
| 518 |
- cmp r3, #4 |
|
| 519 |
- lsl ip, r3, #1 |
|
| 520 |
- blt 4f |
|
| 521 |
- |
|
| 522 |
- @ 4 channels |
|
| 523 |
-5: ldmia r1!, {r4-r7}
|
|
| 524 |
- mov lr, r2 |
|
| 525 |
- mov r8, r0 |
|
| 526 |
- vld1.64 {d16-d17},[r4,:128]!
|
|
| 527 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 528 |
- vld1.64 {d18-d19},[r5,:128]!
|
|
| 529 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 530 |
- vld1.64 {d20-d21},[r6,:128]!
|
|
| 531 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 532 |
- vld1.64 {d22-d23},[r7,:128]!
|
|
| 533 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 534 |
-6: subs lr, lr, #8 |
|
| 535 |
- vld1.64 {d0-d1}, [r4,:128]!
|
|
| 536 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 537 |
- vsri.32 q9, q8, #16 |
|
| 538 |
- vld1.64 {d2-d3}, [r5,:128]!
|
|
| 539 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 540 |
- vsri.32 q11, q10, #16 |
|
| 541 |
- vld1.64 {d4-d5}, [r6,:128]!
|
|
| 542 |
- vcvt.s32.f32 q2, q2, #16 |
|
| 543 |
- vzip.32 d18, d22 |
|
| 544 |
- vld1.64 {d6-d7}, [r7,:128]!
|
|
| 545 |
- vcvt.s32.f32 q3, q3, #16 |
|
| 546 |
- vzip.32 d19, d23 |
|
| 547 |
- vst1.64 {d18}, [r8], ip
|
|
| 548 |
- vsri.32 q1, q0, #16 |
|
| 549 |
- vst1.64 {d22}, [r8], ip
|
|
| 550 |
- vsri.32 q3, q2, #16 |
|
| 551 |
- vst1.64 {d19}, [r8], ip
|
|
| 552 |
- vzip.32 d2, d6 |
|
| 553 |
- vst1.64 {d23}, [r8], ip
|
|
| 554 |
- vzip.32 d3, d7 |
|
| 555 |
- beq 7f |
|
| 556 |
- vld1.64 {d16-d17},[r4,:128]!
|
|
| 557 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 558 |
- vst1.64 {d2}, [r8], ip
|
|
| 559 |
- vld1.64 {d18-d19},[r5,:128]!
|
|
| 560 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 561 |
- vst1.64 {d6}, [r8], ip
|
|
| 562 |
- vld1.64 {d20-d21},[r6,:128]!
|
|
| 563 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 564 |
- vst1.64 {d3}, [r8], ip
|
|
| 565 |
- vld1.64 {d22-d23},[r7,:128]!
|
|
| 566 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 567 |
- vst1.64 {d7}, [r8], ip
|
|
| 568 |
- b 6b |
|
| 569 |
-7: vst1.64 {d2}, [r8], ip
|
|
| 570 |
- vst1.64 {d6}, [r8], ip
|
|
| 571 |
- vst1.64 {d3}, [r8], ip
|
|
| 572 |
- vst1.64 {d7}, [r8], ip
|
|
| 573 |
- subs r3, r3, #4 |
|
| 574 |
- popeq {r4-r8,pc}
|
|
| 575 |
- cmp r3, #4 |
|
| 576 |
- add r0, r0, #8 |
|
| 577 |
- bge 5b |
|
| 578 |
- |
|
| 579 |
- @ 2 channels |
|
| 580 |
-4: cmp r3, #2 |
|
| 581 |
- blt 4f |
|
| 582 |
- ldmia r1!, {r4-r5}
|
|
| 583 |
- mov lr, r2 |
|
| 584 |
- mov r8, r0 |
|
| 585 |
- tst lr, #8 |
|
| 586 |
- vld1.64 {d16-d17},[r4,:128]!
|
|
| 587 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 588 |
- vld1.64 {d18-d19},[r5,:128]!
|
|
| 589 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 590 |
- vld1.64 {d20-d21},[r4,:128]!
|
|
| 591 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 592 |
- vld1.64 {d22-d23},[r5,:128]!
|
|
| 593 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 594 |
- beq 6f |
|
| 595 |
- subs lr, lr, #8 |
|
| 596 |
- beq 7f |
|
| 597 |
- vsri.32 d18, d16, #16 |
|
| 598 |
- vsri.32 d19, d17, #16 |
|
| 599 |
- vld1.64 {d16-d17},[r4,:128]!
|
|
| 600 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 601 |
- vst1.32 {d18[0]}, [r8], ip
|
|
| 602 |
- vsri.32 d22, d20, #16 |
|
| 603 |
- vst1.32 {d18[1]}, [r8], ip
|
|
| 604 |
- vsri.32 d23, d21, #16 |
|
| 605 |
- vst1.32 {d19[0]}, [r8], ip
|
|
| 606 |
- vst1.32 {d19[1]}, [r8], ip
|
|
| 607 |
- vld1.64 {d18-d19},[r5,:128]!
|
|
| 608 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 609 |
- vst1.32 {d22[0]}, [r8], ip
|
|
| 610 |
- vst1.32 {d22[1]}, [r8], ip
|
|
| 611 |
- vld1.64 {d20-d21},[r4,:128]!
|
|
| 612 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 613 |
- vst1.32 {d23[0]}, [r8], ip
|
|
| 614 |
- vst1.32 {d23[1]}, [r8], ip
|
|
| 615 |
- vld1.64 {d22-d23},[r5,:128]!
|
|
| 616 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 617 |
-6: subs lr, lr, #16 |
|
| 618 |
- vld1.64 {d0-d1}, [r4,:128]!
|
|
| 619 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 620 |
- vsri.32 d18, d16, #16 |
|
| 621 |
- vld1.64 {d2-d3}, [r5,:128]!
|
|
| 622 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 623 |
- vsri.32 d19, d17, #16 |
|
| 624 |
- vld1.64 {d4-d5}, [r4,:128]!
|
|
| 625 |
- vcvt.s32.f32 q2, q2, #16 |
|
| 626 |
- vld1.64 {d6-d7}, [r5,:128]!
|
|
| 627 |
- vcvt.s32.f32 q3, q3, #16 |
|
| 628 |
- vst1.32 {d18[0]}, [r8], ip
|
|
| 629 |
- vsri.32 d22, d20, #16 |
|
| 630 |
- vst1.32 {d18[1]}, [r8], ip
|
|
| 631 |
- vsri.32 d23, d21, #16 |
|
| 632 |
- vst1.32 {d19[0]}, [r8], ip
|
|
| 633 |
- vsri.32 d2, d0, #16 |
|
| 634 |
- vst1.32 {d19[1]}, [r8], ip
|
|
| 635 |
- vsri.32 d3, d1, #16 |
|
| 636 |
- vst1.32 {d22[0]}, [r8], ip
|
|
| 637 |
- vsri.32 d6, d4, #16 |
|
| 638 |
- vst1.32 {d22[1]}, [r8], ip
|
|
| 639 |
- vsri.32 d7, d5, #16 |
|
| 640 |
- vst1.32 {d23[0]}, [r8], ip
|
|
| 641 |
- vst1.32 {d23[1]}, [r8], ip
|
|
| 642 |
- beq 6f |
|
| 643 |
- vld1.64 {d16-d17},[r4,:128]!
|
|
| 644 |
- vcvt.s32.f32 q8, q8, #16 |
|
| 645 |
- vst1.32 {d2[0]}, [r8], ip
|
|
| 646 |
- vst1.32 {d2[1]}, [r8], ip
|
|
| 647 |
- vld1.64 {d18-d19},[r5,:128]!
|
|
| 648 |
- vcvt.s32.f32 q9, q9, #16 |
|
| 649 |
- vst1.32 {d3[0]}, [r8], ip
|
|
| 650 |
- vst1.32 {d3[1]}, [r8], ip
|
|
| 651 |
- vld1.64 {d20-d21},[r4,:128]!
|
|
| 652 |
- vcvt.s32.f32 q10, q10, #16 |
|
| 653 |
- vst1.32 {d6[0]}, [r8], ip
|
|
| 654 |
- vst1.32 {d6[1]}, [r8], ip
|
|
| 655 |
- vld1.64 {d22-d23},[r5,:128]!
|
|
| 656 |
- vcvt.s32.f32 q11, q11, #16 |
|
| 657 |
- vst1.32 {d7[0]}, [r8], ip
|
|
| 658 |
- vst1.32 {d7[1]}, [r8], ip
|
|
| 659 |
- bgt 6b |
|
| 660 |
-6: vst1.32 {d2[0]}, [r8], ip
|
|
| 661 |
- vst1.32 {d2[1]}, [r8], ip
|
|
| 662 |
- vst1.32 {d3[0]}, [r8], ip
|
|
| 663 |
- vst1.32 {d3[1]}, [r8], ip
|
|
| 664 |
- vst1.32 {d6[0]}, [r8], ip
|
|
| 665 |
- vst1.32 {d6[1]}, [r8], ip
|
|
| 666 |
- vst1.32 {d7[0]}, [r8], ip
|
|
| 667 |
- vst1.32 {d7[1]}, [r8], ip
|
|
| 668 |
- b 8f |
|
| 669 |
-7: vsri.32 d18, d16, #16 |
|
| 670 |
- vsri.32 d19, d17, #16 |
|
| 671 |
- vst1.32 {d18[0]}, [r8], ip
|
|
| 672 |
- vsri.32 d22, d20, #16 |
|
| 673 |
- vst1.32 {d18[1]}, [r8], ip
|
|
| 674 |
- vsri.32 d23, d21, #16 |
|
| 675 |
- vst1.32 {d19[0]}, [r8], ip
|
|
| 676 |
- vst1.32 {d19[1]}, [r8], ip
|
|
| 677 |
- vst1.32 {d22[0]}, [r8], ip
|
|
| 678 |
- vst1.32 {d22[1]}, [r8], ip
|
|
| 679 |
- vst1.32 {d23[0]}, [r8], ip
|
|
| 680 |
- vst1.32 {d23[1]}, [r8], ip
|
|
| 681 |
-8: subs r3, r3, #2 |
|
| 682 |
- add r0, r0, #4 |
|
| 683 |
- popeq {r4-r8,pc}
|
|
| 684 |
- |
|
| 685 |
- @ 1 channel |
|
| 686 |
-4: ldr r4, [r1],#4 |
|
| 687 |
- tst r2, #8 |
|
| 688 |
- mov lr, r2 |
|
| 689 |
- mov r5, r0 |
|
| 690 |
- vld1.64 {d0-d1}, [r4,:128]!
|
|
| 691 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 692 |
- vld1.64 {d2-d3}, [r4,:128]!
|
|
| 693 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 694 |
- bne 8f |
|
| 695 |
-6: subs lr, lr, #16 |
|
| 696 |
- vld1.64 {d4-d5}, [r4,:128]!
|
|
| 697 |
- vcvt.s32.f32 q2, q2, #16 |
|
| 698 |
- vld1.64 {d6-d7}, [r4,:128]!
|
|
| 699 |
- vcvt.s32.f32 q3, q3, #16 |
|
| 700 |
- vst1.16 {d0[1]}, [r5,:16], ip
|
|
| 701 |
- vst1.16 {d0[3]}, [r5,:16], ip
|
|
| 702 |
- vst1.16 {d1[1]}, [r5,:16], ip
|
|
| 703 |
- vst1.16 {d1[3]}, [r5,:16], ip
|
|
| 704 |
- vst1.16 {d2[1]}, [r5,:16], ip
|
|
| 705 |
- vst1.16 {d2[3]}, [r5,:16], ip
|
|
| 706 |
- vst1.16 {d3[1]}, [r5,:16], ip
|
|
| 707 |
- vst1.16 {d3[3]}, [r5,:16], ip
|
|
| 708 |
- beq 7f |
|
| 709 |
- vld1.64 {d0-d1}, [r4,:128]!
|
|
| 710 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 711 |
- vld1.64 {d2-d3}, [r4,:128]!
|
|
| 712 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 713 |
-7: vst1.16 {d4[1]}, [r5,:16], ip
|
|
| 714 |
- vst1.16 {d4[3]}, [r5,:16], ip
|
|
| 715 |
- vst1.16 {d5[1]}, [r5,:16], ip
|
|
| 716 |
- vst1.16 {d5[3]}, [r5,:16], ip
|
|
| 717 |
- vst1.16 {d6[1]}, [r5,:16], ip
|
|
| 718 |
- vst1.16 {d6[3]}, [r5,:16], ip
|
|
| 719 |
- vst1.16 {d7[1]}, [r5,:16], ip
|
|
| 720 |
- vst1.16 {d7[3]}, [r5,:16], ip
|
|
| 721 |
- bgt 6b |
|
| 722 |
- pop {r4-r8,pc}
|
|
| 723 |
-8: subs lr, lr, #8 |
|
| 724 |
- vst1.16 {d0[1]}, [r5,:16], ip
|
|
| 725 |
- vst1.16 {d0[3]}, [r5,:16], ip
|
|
| 726 |
- vst1.16 {d1[1]}, [r5,:16], ip
|
|
| 727 |
- vst1.16 {d1[3]}, [r5,:16], ip
|
|
| 728 |
- vst1.16 {d2[1]}, [r5,:16], ip
|
|
| 729 |
- vst1.16 {d2[3]}, [r5,:16], ip
|
|
| 730 |
- vst1.16 {d3[1]}, [r5,:16], ip
|
|
| 731 |
- vst1.16 {d3[3]}, [r5,:16], ip
|
|
| 732 |
- popeq {r4-r8,pc}
|
|
| 733 |
- vld1.64 {d0-d1}, [r4,:128]!
|
|
| 734 |
- vcvt.s32.f32 q0, q0, #16 |
|
| 735 |
- vld1.64 {d2-d3}, [r4,:128]!
|
|
| 736 |
- vcvt.s32.f32 q1, q1, #16 |
|
| 737 |
- b 6b |
|
| 738 |
-endfunc |
|
| 739 |
- |
|
| 740 | 403 |
function ff_vector_fmul_neon, export=1 |
| 741 | 404 |
subs r3, r3, #8 |
| 742 | 405 |
vld1.64 {d0-d3}, [r1,:128]!
|
| ... | ... |
@@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] |
| 1050 | 1050 |
bx lr |
| 1051 | 1051 |
endfunc |
| 1052 | 1052 |
|
| 1053 |
-function ff_int32_to_float_fmul_scalar_neon, export=1 |
|
| 1054 |
-VFP vdup.32 q0, d0[0] |
|
| 1055 |
-VFP len .req r2 |
|
| 1056 |
-NOVFP vdup.32 q0, r2 |
|
| 1057 |
-NOVFP len .req r3 |
|
| 1058 |
- |
|
| 1059 |
- vld1.32 {q1},[r1,:128]!
|
|
| 1060 |
- vcvt.f32.s32 q3, q1 |
|
| 1061 |
- vld1.32 {q2},[r1,:128]!
|
|
| 1062 |
- vcvt.f32.s32 q8, q2 |
|
| 1063 |
-1: subs len, len, #8 |
|
| 1064 |
- pld [r1, #16] |
|
| 1065 |
- vmul.f32 q9, q3, q0 |
|
| 1066 |
- vmul.f32 q10, q8, q0 |
|
| 1067 |
- beq 2f |
|
| 1068 |
- vld1.32 {q1},[r1,:128]!
|
|
| 1069 |
- vcvt.f32.s32 q3, q1 |
|
| 1070 |
- vld1.32 {q2},[r1,:128]!
|
|
| 1071 |
- vcvt.f32.s32 q8, q2 |
|
| 1072 |
- vst1.32 {q9}, [r0,:128]!
|
|
| 1073 |
- vst1.32 {q10},[r0,:128]!
|
|
| 1074 |
- b 1b |
|
| 1075 |
-2: vst1.32 {q9}, [r0,:128]!
|
|
| 1076 |
- vst1.32 {q10},[r0,:128]!
|
|
| 1077 |
- bx lr |
|
| 1078 |
- .unreq len |
|
| 1079 |
-endfunc |
|
| 1080 |
- |
|
| 1081 | 1053 |
function ff_vector_fmul_reverse_neon, export=1 |
| 1082 | 1054 |
add r2, r2, r3, lsl #2 |
| 1083 | 1055 |
sub r2, r2, #32 |
| ... | ... |
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 |
| 131 | 131 |
vpop {d8-d15}
|
| 132 | 132 |
bx lr |
| 133 | 133 |
endfunc |
| 134 |
- |
|
| 135 |
-#if HAVE_ARMV6 |
|
| 136 |
-/** |
|
| 137 |
- * ARM VFP optimized float to int16 conversion. |
|
| 138 |
- * Assume that len is a positive number and is multiple of 8, destination |
|
| 139 |
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for |
|
| 140 |
- * performance), little endian byte sex |
|
| 141 |
- */ |
|
| 142 |
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) |
|
| 143 |
-function ff_float_to_int16_vfp, export=1 |
|
| 144 |
- push {r4-r8,lr}
|
|
| 145 |
- vpush {d8-d11}
|
|
| 146 |
- vldmia r1!, {s16-s23}
|
|
| 147 |
- vcvt.s32.f32 s0, s16 |
|
| 148 |
- vcvt.s32.f32 s1, s17 |
|
| 149 |
- vcvt.s32.f32 s2, s18 |
|
| 150 |
- vcvt.s32.f32 s3, s19 |
|
| 151 |
- vcvt.s32.f32 s4, s20 |
|
| 152 |
- vcvt.s32.f32 s5, s21 |
|
| 153 |
- vcvt.s32.f32 s6, s22 |
|
| 154 |
- vcvt.s32.f32 s7, s23 |
|
| 155 |
-1: |
|
| 156 |
- subs r2, r2, #8 |
|
| 157 |
- vmov r3, r4, s0, s1 |
|
| 158 |
- vmov r5, r6, s2, s3 |
|
| 159 |
- vmov r7, r8, s4, s5 |
|
| 160 |
- vmov ip, lr, s6, s7 |
|
| 161 |
- vldmiagt r1!, {s16-s23}
|
|
| 162 |
- ssat r4, #16, r4 |
|
| 163 |
- ssat r3, #16, r3 |
|
| 164 |
- ssat r6, #16, r6 |
|
| 165 |
- ssat r5, #16, r5 |
|
| 166 |
- pkhbt r3, r3, r4, lsl #16 |
|
| 167 |
- pkhbt r4, r5, r6, lsl #16 |
|
| 168 |
- vcvtgt.s32.f32 s0, s16 |
|
| 169 |
- vcvtgt.s32.f32 s1, s17 |
|
| 170 |
- vcvtgt.s32.f32 s2, s18 |
|
| 171 |
- vcvtgt.s32.f32 s3, s19 |
|
| 172 |
- vcvtgt.s32.f32 s4, s20 |
|
| 173 |
- vcvtgt.s32.f32 s5, s21 |
|
| 174 |
- vcvtgt.s32.f32 s6, s22 |
|
| 175 |
- vcvtgt.s32.f32 s7, s23 |
|
| 176 |
- ssat r8, #16, r8 |
|
| 177 |
- ssat r7, #16, r7 |
|
| 178 |
- ssat lr, #16, lr |
|
| 179 |
- ssat ip, #16, ip |
|
| 180 |
- pkhbt r5, r7, r8, lsl #16 |
|
| 181 |
- pkhbt r6, ip, lr, lsl #16 |
|
| 182 |
- stmia r0!, {r3-r6}
|
|
| 183 |
- bgt 1b |
|
| 184 |
- |
|
| 185 |
- vpop {d8-d11}
|
|
| 186 |
- pop {r4-r8,pc}
|
|
| 187 |
-endfunc |
|
| 188 |
-#endif |
| 189 | 134 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,48 @@ |
| 0 |
+/* |
|
| 1 |
+ * ARM optimized Format Conversion Utils |
|
| 2 |
+ * |
|
| 3 |
+ * This file is part of FFmpeg. |
|
| 4 |
+ * |
|
| 5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 7 |
+ * License as published by the Free Software Foundation; either |
|
| 8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 9 |
+ * |
|
| 10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 13 |
+ * Lesser General Public License for more details. |
|
| 14 |
+ * |
|
| 15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 18 |
+ */ |
|
| 19 |
+ |
|
| 20 |
+#include <stdint.h> |
|
| 21 |
+ |
|
| 22 |
+#include "libavcodec/avcodec.h" |
|
| 23 |
+#include "libavcodec/fmtconvert.h" |
|
| 24 |
+ |
|
| 25 |
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, |
|
| 26 |
+ float mul, int len); |
|
| 27 |
+ |
|
| 28 |
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); |
|
| 29 |
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); |
|
| 30 |
+ |
|
| 31 |
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
|
| 32 |
+ |
|
| 33 |
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) |
|
| 34 |
+{
|
|
| 35 |
+ if (HAVE_ARMVFP && HAVE_ARMV6) {
|
|
| 36 |
+ c->float_to_int16 = ff_float_to_int16_vfp; |
|
| 37 |
+ } |
|
| 38 |
+ |
|
| 39 |
+ if (HAVE_NEON) {
|
|
| 40 |
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; |
|
| 41 |
+ |
|
| 42 |
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
|
| 43 |
+ c->float_to_int16 = ff_float_to_int16_neon; |
|
| 44 |
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; |
|
| 45 |
+ } |
|
| 46 |
+ } |
|
| 47 |
+} |
| 0 | 48 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,391 @@ |
| 0 |
+/* |
|
| 1 |
+ * ARM NEON optimised Format Conversion Utils |
|
| 2 |
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
|
| 3 |
+ * |
|
| 4 |
+ * This file is part of FFmpeg. |
|
| 5 |
+ * |
|
| 6 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 7 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 8 |
+ * License as published by the Free Software Foundation; either |
|
| 9 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 10 |
+ * |
|
| 11 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 14 |
+ * Lesser General Public License for more details. |
|
| 15 |
+ * |
|
| 16 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 17 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 19 |
+ */ |
|
| 20 |
+ |
|
| 21 |
+#include "config.h" |
|
| 22 |
+#include "asm.S" |
|
| 23 |
+ |
|
| 24 |
+ preserve8 |
|
| 25 |
+ .text |
|
| 26 |
+ |
|
| 27 |
+function ff_float_to_int16_neon, export=1 |
|
| 28 |
+ subs r2, r2, #8 |
|
| 29 |
+ vld1.64 {d0-d1}, [r1,:128]!
|
|
| 30 |
+ vcvt.s32.f32 q8, q0, #16 |
|
| 31 |
+ vld1.64 {d2-d3}, [r1,:128]!
|
|
| 32 |
+ vcvt.s32.f32 q9, q1, #16 |
|
| 33 |
+ beq 3f |
|
| 34 |
+ bics ip, r2, #15 |
|
| 35 |
+ beq 2f |
|
| 36 |
+1: subs ip, ip, #16 |
|
| 37 |
+ vshrn.s32 d4, q8, #16 |
|
| 38 |
+ vld1.64 {d0-d1}, [r1,:128]!
|
|
| 39 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 40 |
+ vshrn.s32 d5, q9, #16 |
|
| 41 |
+ vld1.64 {d2-d3}, [r1,:128]!
|
|
| 42 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 43 |
+ vshrn.s32 d6, q0, #16 |
|
| 44 |
+ vst1.64 {d4-d5}, [r0,:128]!
|
|
| 45 |
+ vshrn.s32 d7, q1, #16 |
|
| 46 |
+ vld1.64 {d16-d17},[r1,:128]!
|
|
| 47 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 48 |
+ vld1.64 {d18-d19},[r1,:128]!
|
|
| 49 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 50 |
+ vst1.64 {d6-d7}, [r0,:128]!
|
|
| 51 |
+ bne 1b |
|
| 52 |
+ ands r2, r2, #15 |
|
| 53 |
+ beq 3f |
|
| 54 |
+2: vld1.64 {d0-d1}, [r1,:128]!
|
|
| 55 |
+ vshrn.s32 d4, q8, #16 |
|
| 56 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 57 |
+ vld1.64 {d2-d3}, [r1,:128]!
|
|
| 58 |
+ vshrn.s32 d5, q9, #16 |
|
| 59 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 60 |
+ vshrn.s32 d6, q0, #16 |
|
| 61 |
+ vst1.64 {d4-d5}, [r0,:128]!
|
|
| 62 |
+ vshrn.s32 d7, q1, #16 |
|
| 63 |
+ vst1.64 {d6-d7}, [r0,:128]!
|
|
| 64 |
+ bx lr |
|
| 65 |
+3: vshrn.s32 d4, q8, #16 |
|
| 66 |
+ vshrn.s32 d5, q9, #16 |
|
| 67 |
+ vst1.64 {d4-d5}, [r0,:128]!
|
|
| 68 |
+ bx lr |
|
| 69 |
+endfunc |
|
| 70 |
+ |
|
| 71 |
+function ff_float_to_int16_interleave_neon, export=1 |
|
| 72 |
+ cmp r3, #2 |
|
| 73 |
+ ldrlt r1, [r1] |
|
| 74 |
+ blt ff_float_to_int16_neon |
|
| 75 |
+ bne 4f |
|
| 76 |
+ |
|
| 77 |
+ ldr r3, [r1] |
|
| 78 |
+ ldr r1, [r1, #4] |
|
| 79 |
+ |
|
| 80 |
+ subs r2, r2, #8 |
|
| 81 |
+ vld1.64 {d0-d1}, [r3,:128]!
|
|
| 82 |
+ vcvt.s32.f32 q8, q0, #16 |
|
| 83 |
+ vld1.64 {d2-d3}, [r3,:128]!
|
|
| 84 |
+ vcvt.s32.f32 q9, q1, #16 |
|
| 85 |
+ vld1.64 {d20-d21},[r1,:128]!
|
|
| 86 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 87 |
+ vld1.64 {d22-d23},[r1,:128]!
|
|
| 88 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 89 |
+ beq 3f |
|
| 90 |
+ bics ip, r2, #15 |
|
| 91 |
+ beq 2f |
|
| 92 |
+1: subs ip, ip, #16 |
|
| 93 |
+ vld1.64 {d0-d1}, [r3,:128]!
|
|
| 94 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 95 |
+ vsri.32 q10, q8, #16 |
|
| 96 |
+ vld1.64 {d2-d3}, [r3,:128]!
|
|
| 97 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 98 |
+ vld1.64 {d24-d25},[r1,:128]!
|
|
| 99 |
+ vcvt.s32.f32 q12, q12, #16 |
|
| 100 |
+ vld1.64 {d26-d27},[r1,:128]!
|
|
| 101 |
+ vsri.32 q11, q9, #16 |
|
| 102 |
+ vst1.64 {d20-d21},[r0,:128]!
|
|
| 103 |
+ vcvt.s32.f32 q13, q13, #16 |
|
| 104 |
+ vst1.64 {d22-d23},[r0,:128]!
|
|
| 105 |
+ vsri.32 q12, q0, #16 |
|
| 106 |
+ vld1.64 {d16-d17},[r3,:128]!
|
|
| 107 |
+ vsri.32 q13, q1, #16 |
|
| 108 |
+ vst1.64 {d24-d25},[r0,:128]!
|
|
| 109 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 110 |
+ vld1.64 {d18-d19},[r3,:128]!
|
|
| 111 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 112 |
+ vld1.64 {d20-d21},[r1,:128]!
|
|
| 113 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 114 |
+ vld1.64 {d22-d23},[r1,:128]!
|
|
| 115 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 116 |
+ vst1.64 {d26-d27},[r0,:128]!
|
|
| 117 |
+ bne 1b |
|
| 118 |
+ ands r2, r2, #15 |
|
| 119 |
+ beq 3f |
|
| 120 |
+2: vsri.32 q10, q8, #16 |
|
| 121 |
+ vld1.64 {d0-d1}, [r3,:128]!
|
|
| 122 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 123 |
+ vld1.64 {d2-d3}, [r3,:128]!
|
|
| 124 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 125 |
+ vld1.64 {d24-d25},[r1,:128]!
|
|
| 126 |
+ vcvt.s32.f32 q12, q12, #16 |
|
| 127 |
+ vsri.32 q11, q9, #16 |
|
| 128 |
+ vld1.64 {d26-d27},[r1,:128]!
|
|
| 129 |
+ vcvt.s32.f32 q13, q13, #16 |
|
| 130 |
+ vst1.64 {d20-d21},[r0,:128]!
|
|
| 131 |
+ vsri.32 q12, q0, #16 |
|
| 132 |
+ vst1.64 {d22-d23},[r0,:128]!
|
|
| 133 |
+ vsri.32 q13, q1, #16 |
|
| 134 |
+ vst1.64 {d24-d27},[r0,:128]!
|
|
| 135 |
+ bx lr |
|
| 136 |
+3: vsri.32 q10, q8, #16 |
|
| 137 |
+ vsri.32 q11, q9, #16 |
|
| 138 |
+ vst1.64 {d20-d23},[r0,:128]!
|
|
| 139 |
+ bx lr |
|
| 140 |
+ |
|
| 141 |
+4: push {r4-r8,lr}
|
|
| 142 |
+ cmp r3, #4 |
|
| 143 |
+ lsl ip, r3, #1 |
|
| 144 |
+ blt 4f |
|
| 145 |
+ |
|
| 146 |
+ @ 4 channels |
|
| 147 |
+5: ldmia r1!, {r4-r7}
|
|
| 148 |
+ mov lr, r2 |
|
| 149 |
+ mov r8, r0 |
|
| 150 |
+ vld1.64 {d16-d17},[r4,:128]!
|
|
| 151 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 152 |
+ vld1.64 {d18-d19},[r5,:128]!
|
|
| 153 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 154 |
+ vld1.64 {d20-d21},[r6,:128]!
|
|
| 155 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 156 |
+ vld1.64 {d22-d23},[r7,:128]!
|
|
| 157 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 158 |
+6: subs lr, lr, #8 |
|
| 159 |
+ vld1.64 {d0-d1}, [r4,:128]!
|
|
| 160 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 161 |
+ vsri.32 q9, q8, #16 |
|
| 162 |
+ vld1.64 {d2-d3}, [r5,:128]!
|
|
| 163 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 164 |
+ vsri.32 q11, q10, #16 |
|
| 165 |
+ vld1.64 {d4-d5}, [r6,:128]!
|
|
| 166 |
+ vcvt.s32.f32 q2, q2, #16 |
|
| 167 |
+ vzip.32 d18, d22 |
|
| 168 |
+ vld1.64 {d6-d7}, [r7,:128]!
|
|
| 169 |
+ vcvt.s32.f32 q3, q3, #16 |
|
| 170 |
+ vzip.32 d19, d23 |
|
| 171 |
+ vst1.64 {d18}, [r8], ip
|
|
| 172 |
+ vsri.32 q1, q0, #16 |
|
| 173 |
+ vst1.64 {d22}, [r8], ip
|
|
| 174 |
+ vsri.32 q3, q2, #16 |
|
| 175 |
+ vst1.64 {d19}, [r8], ip
|
|
| 176 |
+ vzip.32 d2, d6 |
|
| 177 |
+ vst1.64 {d23}, [r8], ip
|
|
| 178 |
+ vzip.32 d3, d7 |
|
| 179 |
+ beq 7f |
|
| 180 |
+ vld1.64 {d16-d17},[r4,:128]!
|
|
| 181 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 182 |
+ vst1.64 {d2}, [r8], ip
|
|
| 183 |
+ vld1.64 {d18-d19},[r5,:128]!
|
|
| 184 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 185 |
+ vst1.64 {d6}, [r8], ip
|
|
| 186 |
+ vld1.64 {d20-d21},[r6,:128]!
|
|
| 187 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 188 |
+ vst1.64 {d3}, [r8], ip
|
|
| 189 |
+ vld1.64 {d22-d23},[r7,:128]!
|
|
| 190 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 191 |
+ vst1.64 {d7}, [r8], ip
|
|
| 192 |
+ b 6b |
|
| 193 |
+7: vst1.64 {d2}, [r8], ip
|
|
| 194 |
+ vst1.64 {d6}, [r8], ip
|
|
| 195 |
+ vst1.64 {d3}, [r8], ip
|
|
| 196 |
+ vst1.64 {d7}, [r8], ip
|
|
| 197 |
+ subs r3, r3, #4 |
|
| 198 |
+ popeq {r4-r8,pc}
|
|
| 199 |
+ cmp r3, #4 |
|
| 200 |
+ add r0, r0, #8 |
|
| 201 |
+ bge 5b |
|
| 202 |
+ |
|
| 203 |
+ @ 2 channels |
|
| 204 |
+4: cmp r3, #2 |
|
| 205 |
+ blt 4f |
|
| 206 |
+ ldmia r1!, {r4-r5}
|
|
| 207 |
+ mov lr, r2 |
|
| 208 |
+ mov r8, r0 |
|
| 209 |
+ tst lr, #8 |
|
| 210 |
+ vld1.64 {d16-d17},[r4,:128]!
|
|
| 211 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 212 |
+ vld1.64 {d18-d19},[r5,:128]!
|
|
| 213 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 214 |
+ vld1.64 {d20-d21},[r4,:128]!
|
|
| 215 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 216 |
+ vld1.64 {d22-d23},[r5,:128]!
|
|
| 217 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 218 |
+ beq 6f |
|
| 219 |
+ subs lr, lr, #8 |
|
| 220 |
+ beq 7f |
|
| 221 |
+ vsri.32 d18, d16, #16 |
|
| 222 |
+ vsri.32 d19, d17, #16 |
|
| 223 |
+ vld1.64 {d16-d17},[r4,:128]!
|
|
| 224 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 225 |
+ vst1.32 {d18[0]}, [r8], ip
|
|
| 226 |
+ vsri.32 d22, d20, #16 |
|
| 227 |
+ vst1.32 {d18[1]}, [r8], ip
|
|
| 228 |
+ vsri.32 d23, d21, #16 |
|
| 229 |
+ vst1.32 {d19[0]}, [r8], ip
|
|
| 230 |
+ vst1.32 {d19[1]}, [r8], ip
|
|
| 231 |
+ vld1.64 {d18-d19},[r5,:128]!
|
|
| 232 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 233 |
+ vst1.32 {d22[0]}, [r8], ip
|
|
| 234 |
+ vst1.32 {d22[1]}, [r8], ip
|
|
| 235 |
+ vld1.64 {d20-d21},[r4,:128]!
|
|
| 236 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 237 |
+ vst1.32 {d23[0]}, [r8], ip
|
|
| 238 |
+ vst1.32 {d23[1]}, [r8], ip
|
|
| 239 |
+ vld1.64 {d22-d23},[r5,:128]!
|
|
| 240 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 241 |
+6: subs lr, lr, #16 |
|
| 242 |
+ vld1.64 {d0-d1}, [r4,:128]!
|
|
| 243 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 244 |
+ vsri.32 d18, d16, #16 |
|
| 245 |
+ vld1.64 {d2-d3}, [r5,:128]!
|
|
| 246 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 247 |
+ vsri.32 d19, d17, #16 |
|
| 248 |
+ vld1.64 {d4-d5}, [r4,:128]!
|
|
| 249 |
+ vcvt.s32.f32 q2, q2, #16 |
|
| 250 |
+ vld1.64 {d6-d7}, [r5,:128]!
|
|
| 251 |
+ vcvt.s32.f32 q3, q3, #16 |
|
| 252 |
+ vst1.32 {d18[0]}, [r8], ip
|
|
| 253 |
+ vsri.32 d22, d20, #16 |
|
| 254 |
+ vst1.32 {d18[1]}, [r8], ip
|
|
| 255 |
+ vsri.32 d23, d21, #16 |
|
| 256 |
+ vst1.32 {d19[0]}, [r8], ip
|
|
| 257 |
+ vsri.32 d2, d0, #16 |
|
| 258 |
+ vst1.32 {d19[1]}, [r8], ip
|
|
| 259 |
+ vsri.32 d3, d1, #16 |
|
| 260 |
+ vst1.32 {d22[0]}, [r8], ip
|
|
| 261 |
+ vsri.32 d6, d4, #16 |
|
| 262 |
+ vst1.32 {d22[1]}, [r8], ip
|
|
| 263 |
+ vsri.32 d7, d5, #16 |
|
| 264 |
+ vst1.32 {d23[0]}, [r8], ip
|
|
| 265 |
+ vst1.32 {d23[1]}, [r8], ip
|
|
| 266 |
+ beq 6f |
|
| 267 |
+ vld1.64 {d16-d17},[r4,:128]!
|
|
| 268 |
+ vcvt.s32.f32 q8, q8, #16 |
|
| 269 |
+ vst1.32 {d2[0]}, [r8], ip
|
|
| 270 |
+ vst1.32 {d2[1]}, [r8], ip
|
|
| 271 |
+ vld1.64 {d18-d19},[r5,:128]!
|
|
| 272 |
+ vcvt.s32.f32 q9, q9, #16 |
|
| 273 |
+ vst1.32 {d3[0]}, [r8], ip
|
|
| 274 |
+ vst1.32 {d3[1]}, [r8], ip
|
|
| 275 |
+ vld1.64 {d20-d21},[r4,:128]!
|
|
| 276 |
+ vcvt.s32.f32 q10, q10, #16 |
|
| 277 |
+ vst1.32 {d6[0]}, [r8], ip
|
|
| 278 |
+ vst1.32 {d6[1]}, [r8], ip
|
|
| 279 |
+ vld1.64 {d22-d23},[r5,:128]!
|
|
| 280 |
+ vcvt.s32.f32 q11, q11, #16 |
|
| 281 |
+ vst1.32 {d7[0]}, [r8], ip
|
|
| 282 |
+ vst1.32 {d7[1]}, [r8], ip
|
|
| 283 |
+ bgt 6b |
|
| 284 |
+6: vst1.32 {d2[0]}, [r8], ip
|
|
| 285 |
+ vst1.32 {d2[1]}, [r8], ip
|
|
| 286 |
+ vst1.32 {d3[0]}, [r8], ip
|
|
| 287 |
+ vst1.32 {d3[1]}, [r8], ip
|
|
| 288 |
+ vst1.32 {d6[0]}, [r8], ip
|
|
| 289 |
+ vst1.32 {d6[1]}, [r8], ip
|
|
| 290 |
+ vst1.32 {d7[0]}, [r8], ip
|
|
| 291 |
+ vst1.32 {d7[1]}, [r8], ip
|
|
| 292 |
+ b 8f |
|
| 293 |
+7: vsri.32 d18, d16, #16 |
|
| 294 |
+ vsri.32 d19, d17, #16 |
|
| 295 |
+ vst1.32 {d18[0]}, [r8], ip
|
|
| 296 |
+ vsri.32 d22, d20, #16 |
|
| 297 |
+ vst1.32 {d18[1]}, [r8], ip
|
|
| 298 |
+ vsri.32 d23, d21, #16 |
|
| 299 |
+ vst1.32 {d19[0]}, [r8], ip
|
|
| 300 |
+ vst1.32 {d19[1]}, [r8], ip
|
|
| 301 |
+ vst1.32 {d22[0]}, [r8], ip
|
|
| 302 |
+ vst1.32 {d22[1]}, [r8], ip
|
|
| 303 |
+ vst1.32 {d23[0]}, [r8], ip
|
|
| 304 |
+ vst1.32 {d23[1]}, [r8], ip
|
|
| 305 |
+8: subs r3, r3, #2 |
|
| 306 |
+ add r0, r0, #4 |
|
| 307 |
+ popeq {r4-r8,pc}
|
|
| 308 |
+ |
|
| 309 |
+ @ 1 channel |
|
| 310 |
+4: ldr r4, [r1],#4 |
|
| 311 |
+ tst r2, #8 |
|
| 312 |
+ mov lr, r2 |
|
| 313 |
+ mov r5, r0 |
|
| 314 |
+ vld1.64 {d0-d1}, [r4,:128]!
|
|
| 315 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 316 |
+ vld1.64 {d2-d3}, [r4,:128]!
|
|
| 317 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 318 |
+ bne 8f |
|
| 319 |
+6: subs lr, lr, #16 |
|
| 320 |
+ vld1.64 {d4-d5}, [r4,:128]!
|
|
| 321 |
+ vcvt.s32.f32 q2, q2, #16 |
|
| 322 |
+ vld1.64 {d6-d7}, [r4,:128]!
|
|
| 323 |
+ vcvt.s32.f32 q3, q3, #16 |
|
| 324 |
+ vst1.16 {d0[1]}, [r5,:16], ip
|
|
| 325 |
+ vst1.16 {d0[3]}, [r5,:16], ip
|
|
| 326 |
+ vst1.16 {d1[1]}, [r5,:16], ip
|
|
| 327 |
+ vst1.16 {d1[3]}, [r5,:16], ip
|
|
| 328 |
+ vst1.16 {d2[1]}, [r5,:16], ip
|
|
| 329 |
+ vst1.16 {d2[3]}, [r5,:16], ip
|
|
| 330 |
+ vst1.16 {d3[1]}, [r5,:16], ip
|
|
| 331 |
+ vst1.16 {d3[3]}, [r5,:16], ip
|
|
| 332 |
+ beq 7f |
|
| 333 |
+ vld1.64 {d0-d1}, [r4,:128]!
|
|
| 334 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 335 |
+ vld1.64 {d2-d3}, [r4,:128]!
|
|
| 336 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 337 |
+7: vst1.16 {d4[1]}, [r5,:16], ip
|
|
| 338 |
+ vst1.16 {d4[3]}, [r5,:16], ip
|
|
| 339 |
+ vst1.16 {d5[1]}, [r5,:16], ip
|
|
| 340 |
+ vst1.16 {d5[3]}, [r5,:16], ip
|
|
| 341 |
+ vst1.16 {d6[1]}, [r5,:16], ip
|
|
| 342 |
+ vst1.16 {d6[3]}, [r5,:16], ip
|
|
| 343 |
+ vst1.16 {d7[1]}, [r5,:16], ip
|
|
| 344 |
+ vst1.16 {d7[3]}, [r5,:16], ip
|
|
| 345 |
+ bgt 6b |
|
| 346 |
+ pop {r4-r8,pc}
|
|
| 347 |
+8: subs lr, lr, #8 |
|
| 348 |
+ vst1.16 {d0[1]}, [r5,:16], ip
|
|
| 349 |
+ vst1.16 {d0[3]}, [r5,:16], ip
|
|
| 350 |
+ vst1.16 {d1[1]}, [r5,:16], ip
|
|
| 351 |
+ vst1.16 {d1[3]}, [r5,:16], ip
|
|
| 352 |
+ vst1.16 {d2[1]}, [r5,:16], ip
|
|
| 353 |
+ vst1.16 {d2[3]}, [r5,:16], ip
|
|
| 354 |
+ vst1.16 {d3[1]}, [r5,:16], ip
|
|
| 355 |
+ vst1.16 {d3[3]}, [r5,:16], ip
|
|
| 356 |
+ popeq {r4-r8,pc}
|
|
| 357 |
+ vld1.64 {d0-d1}, [r4,:128]!
|
|
| 358 |
+ vcvt.s32.f32 q0, q0, #16 |
|
| 359 |
+ vld1.64 {d2-d3}, [r4,:128]!
|
|
| 360 |
+ vcvt.s32.f32 q1, q1, #16 |
|
| 361 |
+ b 6b |
|
| 362 |
+endfunc |
|
| 363 |
+ |
|
| 364 |
+function ff_int32_to_float_fmul_scalar_neon, export=1 |
|
| 365 |
+VFP vdup.32 q0, d0[0] |
|
| 366 |
+VFP len .req r2 |
|
| 367 |
+NOVFP vdup.32 q0, r2 |
|
| 368 |
+NOVFP len .req r3 |
|
| 369 |
+ |
|
| 370 |
+ vld1.32 {q1},[r1,:128]!
|
|
| 371 |
+ vcvt.f32.s32 q3, q1 |
|
| 372 |
+ vld1.32 {q2},[r1,:128]!
|
|
| 373 |
+ vcvt.f32.s32 q8, q2 |
|
| 374 |
+1: subs len, len, #8 |
|
| 375 |
+ pld [r1, #16] |
|
| 376 |
+ vmul.f32 q9, q3, q0 |
|
| 377 |
+ vmul.f32 q10, q8, q0 |
|
| 378 |
+ beq 2f |
|
| 379 |
+ vld1.32 {q1},[r1,:128]!
|
|
| 380 |
+ vcvt.f32.s32 q3, q1 |
|
| 381 |
+ vld1.32 {q2},[r1,:128]!
|
|
| 382 |
+ vcvt.f32.s32 q8, q2 |
|
| 383 |
+ vst1.32 {q9}, [r0,:128]!
|
|
| 384 |
+ vst1.32 {q10},[r0,:128]!
|
|
| 385 |
+ b 1b |
|
| 386 |
+2: vst1.32 {q9}, [r0,:128]!
|
|
| 387 |
+ vst1.32 {q10},[r0,:128]!
|
|
| 388 |
+ bx lr |
|
| 389 |
+ .unreq len |
|
| 390 |
+endfunc |
| 0 | 391 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,77 @@ |
| 0 |
+/* |
|
| 1 |
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> |
|
| 2 |
+ * |
|
| 3 |
+ * This file is part of FFmpeg. |
|
| 4 |
+ * |
|
| 5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 7 |
+ * License as published by the Free Software Foundation; either |
|
| 8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 9 |
+ * |
|
| 10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 13 |
+ * Lesser General Public License for more details. |
|
| 14 |
+ * |
|
| 15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 18 |
+ */ |
|
| 19 |
+ |
|
| 20 |
+#include "config.h" |
|
| 21 |
+#include "asm.S" |
|
| 22 |
+ |
|
| 23 |
+ .syntax unified |
|
| 24 |
+ |
|
| 25 |
+/** |
|
| 26 |
+ * ARM VFP optimized float to int16 conversion. |
|
| 27 |
+ * Assume that len is a positive number and is a multiple of 8, destination |
|
| 28 |
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for |
|
| 29 |
+ * performance), little-endian byte order |
|
| 30 |
+ */ |
|
| 31 |
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len) |
|
| 32 |
+function ff_float_to_int16_vfp, export=1 |
|
| 33 |
+ push {r4-r8,lr}
|
|
| 34 |
+ vpush {d8-d11}
|
|
| 35 |
+ vldmia r1!, {s16-s23}
|
|
| 36 |
+ vcvt.s32.f32 s0, s16 |
|
| 37 |
+ vcvt.s32.f32 s1, s17 |
|
| 38 |
+ vcvt.s32.f32 s2, s18 |
|
| 39 |
+ vcvt.s32.f32 s3, s19 |
|
| 40 |
+ vcvt.s32.f32 s4, s20 |
|
| 41 |
+ vcvt.s32.f32 s5, s21 |
|
| 42 |
+ vcvt.s32.f32 s6, s22 |
|
| 43 |
+ vcvt.s32.f32 s7, s23 |
|
| 44 |
+1: |
|
| 45 |
+ subs r2, r2, #8 |
|
| 46 |
+ vmov r3, r4, s0, s1 |
|
| 47 |
+ vmov r5, r6, s2, s3 |
|
| 48 |
+ vmov r7, r8, s4, s5 |
|
| 49 |
+ vmov ip, lr, s6, s7 |
|
| 50 |
+ vldmiagt r1!, {s16-s23}
|
|
| 51 |
+ ssat r4, #16, r4 |
|
| 52 |
+ ssat r3, #16, r3 |
|
| 53 |
+ ssat r6, #16, r6 |
|
| 54 |
+ ssat r5, #16, r5 |
|
| 55 |
+ pkhbt r3, r3, r4, lsl #16 |
|
| 56 |
+ pkhbt r4, r5, r6, lsl #16 |
|
| 57 |
+ vcvtgt.s32.f32 s0, s16 |
|
| 58 |
+ vcvtgt.s32.f32 s1, s17 |
|
| 59 |
+ vcvtgt.s32.f32 s2, s18 |
|
| 60 |
+ vcvtgt.s32.f32 s3, s19 |
|
| 61 |
+ vcvtgt.s32.f32 s4, s20 |
|
| 62 |
+ vcvtgt.s32.f32 s5, s21 |
|
| 63 |
+ vcvtgt.s32.f32 s6, s22 |
|
| 64 |
+ vcvtgt.s32.f32 s7, s23 |
|
| 65 |
+ ssat r8, #16, r8 |
|
| 66 |
+ ssat r7, #16, r7 |
|
| 67 |
+ ssat lr, #16, lr |
|
| 68 |
+ ssat ip, #16, ip |
|
| 69 |
+ pkhbt r5, r7, r8, lsl #16 |
|
| 70 |
+ pkhbt r6, ip, lr, lsl #16 |
|
| 71 |
+ stmia r0!, {r3-r6}
|
|
| 72 |
+ bgt 1b |
|
| 73 |
+ |
|
| 74 |
+ vpop {d8-d11}
|
|
| 75 |
+ pop {r4-r8,pc}
|
|
| 76 |
+endfunc |
| ... | ... |
@@ -33,6 +33,7 @@ |
| 33 | 33 |
#include "get_bits.h" |
| 34 | 34 |
#include "dsputil.h" |
| 35 | 35 |
#include "fft.h" |
| 36 |
+#include "fmtconvert.h" |
|
| 36 | 37 |
|
| 37 | 38 |
extern const uint16_t ff_wma_critical_freqs[25]; |
| 38 | 39 |
|
| ... | ... |
@@ -43,6 +44,7 @@ typedef struct {
|
| 43 | 43 |
AVCodecContext *avctx; |
| 44 | 44 |
GetBitContext gb; |
| 45 | 45 |
DSPContext dsp; |
| 46 |
+ FmtConvertContext fmt_conv; |
|
| 46 | 47 |
int first; |
| 47 | 48 |
int channels; |
| 48 | 49 |
int frame_len; ///< transform size (samples) |
| ... | ... |
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx) |
| 71 | 71 |
|
| 72 | 72 |
s->avctx = avctx; |
| 73 | 73 |
dsputil_init(&s->dsp, avctx); |
| 74 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
| 74 | 75 |
|
| 75 | 76 |
/* determine frame length */ |
| 76 | 77 |
if (avctx->sample_rate < 22050) {
|
| ... | ... |
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct) |
| 222 | 222 |
ff_rdft_calc(&s->trans.rdft, coeffs); |
| 223 | 223 |
} |
| 224 | 224 |
|
| 225 |
- s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels); |
|
| 225 |
+ s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, |
|
| 226 |
+ s->frame_len, s->channels); |
|
| 226 | 227 |
|
| 227 | 228 |
if (!s->first) {
|
| 228 | 229 |
int count = s->overlap_len * s->channels; |
| ... | ... |
@@ -40,6 +40,7 @@ |
| 40 | 40 |
#include "dca.h" |
| 41 | 41 |
#include "synth_filter.h" |
| 42 | 42 |
#include "dcadsp.h" |
| 43 |
+#include "fmtconvert.h" |
|
| 43 | 44 |
|
| 44 | 45 |
//#define TRACE |
| 45 | 46 |
|
| ... | ... |
@@ -347,6 +348,7 @@ typedef struct {
|
| 347 | 347 |
FFTContext imdct; |
| 348 | 348 |
SynthFilterContext synth; |
| 349 | 349 |
DCADSPContext dcadsp; |
| 350 |
+ FmtConvertContext fmt_conv; |
|
| 350 | 351 |
} DCAContext; |
| 351 | 352 |
|
| 352 | 353 |
static const uint16_t dca_vlc_offs[] = {
|
| ... | ... |
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index) |
| 1115 | 1115 |
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel); |
| 1116 | 1116 |
} |
| 1117 | 1117 |
|
| 1118 |
- s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l], |
|
| 1118 |
+ s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l], |
|
| 1119 | 1119 |
block, rscale, 8); |
| 1120 | 1120 |
} |
| 1121 | 1121 |
|
| ... | ... |
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx, |
| 1802 | 1802 |
} |
| 1803 | 1803 |
} |
| 1804 | 1804 |
|
| 1805 |
- s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); |
|
| 1805 |
+ s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels); |
|
| 1806 | 1806 |
samples += 256 * channels; |
| 1807 | 1807 |
} |
| 1808 | 1808 |
|
| ... | ... |
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx) |
| 1835 | 1835 |
ff_mdct_init(&s->imdct, 6, 1, 1.0); |
| 1836 | 1836 |
ff_synth_filter_init(&s->synth); |
| 1837 | 1837 |
ff_dcadsp_init(&s->dcadsp); |
| 1838 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
| 1838 | 1839 |
|
| 1839 | 1840 |
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++) |
| 1840 | 1841 |
s->samples_chanptr[i] = s->samples + i * 256; |
| ... | ... |
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len) |
| 3867 | 3867 |
return p; |
| 3868 | 3868 |
} |
| 3869 | 3869 |
|
| 3870 |
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
|
|
| 3871 |
- int i; |
|
| 3872 |
- for(i=0; i<len; i++) |
|
| 3873 |
- dst[i] = src[i] * mul; |
|
| 3874 |
-} |
|
| 3875 |
- |
|
| 3876 | 3870 |
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini, |
| 3877 | 3871 |
uint32_t maxi, uint32_t maxisign) |
| 3878 | 3872 |
{
|
| ... | ... |
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i |
| 3918 | 3918 |
} |
| 3919 | 3919 |
} |
| 3920 | 3920 |
|
| 3921 |
-static av_always_inline int float_to_int16_one(const float *src){
|
|
| 3922 |
- return av_clip_int16(lrintf(*src)); |
|
| 3923 |
-} |
|
| 3924 |
- |
|
| 3925 |
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
|
|
| 3926 |
- int i; |
|
| 3927 |
- for(i=0; i<len; i++) |
|
| 3928 |
- dst[i] = float_to_int16_one(src+i); |
|
| 3929 |
-} |
|
| 3930 |
- |
|
| 3931 |
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
|
|
| 3932 |
- int i,j,c; |
|
| 3933 |
- if(channels==2){
|
|
| 3934 |
- for(i=0; i<len; i++){
|
|
| 3935 |
- dst[2*i] = float_to_int16_one(src[0]+i); |
|
| 3936 |
- dst[2*i+1] = float_to_int16_one(src[1]+i); |
|
| 3937 |
- } |
|
| 3938 |
- }else{
|
|
| 3939 |
- for(c=0; c<channels; c++) |
|
| 3940 |
- for(i=0, j=c; i<len; i++, j+=channels) |
|
| 3941 |
- dst[j] = float_to_int16_one(src[c]+i); |
|
| 3942 |
- } |
|
| 3943 |
-} |
|
| 3944 |
- |
|
| 3945 | 3921 |
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift) |
| 3946 | 3922 |
{
|
| 3947 | 3923 |
int res = 0; |
| ... | ... |
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
| 4437 | 4437 |
c->vector_fmul_reverse = vector_fmul_reverse_c; |
| 4438 | 4438 |
c->vector_fmul_add = vector_fmul_add_c; |
| 4439 | 4439 |
c->vector_fmul_window = vector_fmul_window_c; |
| 4440 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
|
| 4441 | 4440 |
c->vector_clipf = vector_clipf_c; |
| 4442 |
- c->float_to_int16 = ff_float_to_int16_c; |
|
| 4443 |
- c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
|
| 4444 | 4441 |
c->scalarproduct_int16 = scalarproduct_int16_c; |
| 4445 | 4442 |
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; |
| 4446 | 4443 |
c->scalarproduct_float = scalarproduct_float_c; |
| ... | ... |
@@ -392,7 +392,6 @@ typedef struct DSPContext {
|
| 392 | 392 |
/* assume len is a multiple of 4, and arrays are 16-byte aligned */ |
| 393 | 393 |
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len); |
| 394 | 394 |
/* assume len is a multiple of 8, and arrays are 16-byte aligned */ |
| 395 |
- void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); |
|
| 396 | 395 |
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */); |
| 397 | 396 |
/** |
| 398 | 397 |
* Multiply a vector of floats by a scalar float. Source and |
| ... | ... |
@@ -445,10 +444,6 @@ typedef struct DSPContext {
|
| 445 | 445 |
*/ |
| 446 | 446 |
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); |
| 447 | 447 |
|
| 448 |
- /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ |
|
| 449 |
- void (*float_to_int16)(int16_t *dst, const float *src, long len); |
|
| 450 |
- void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels); |
|
| 451 |
- |
|
| 452 | 448 |
/* (I)DCT */ |
| 453 | 449 |
void (*fdct)(DCTELEM *block/* align 16*/); |
| 454 | 450 |
void (*fdct248)(DCTELEM *block/* align 16*/); |
| 455 | 451 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,68 @@ |
| 0 |
+/* |
|
| 1 |
+ * Format Conversion Utils |
|
| 2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
| 3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
| 4 |
+ * |
|
| 5 |
+ * This file is part of FFmpeg. |
|
| 6 |
+ * |
|
| 7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 9 |
+ * License as published by the Free Software Foundation; either |
|
| 10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 11 |
+ * |
|
| 12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 15 |
+ * Lesser General Public License for more details. |
|
| 16 |
+ * |
|
| 17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 20 |
+ */ |
|
| 21 |
+ |
|
| 22 |
+#include "avcodec.h" |
|
| 23 |
+#include "fmtconvert.h" |
|
| 24 |
+ |
|
| 25 |
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
|
|
| 26 |
+ int i; |
|
| 27 |
+ for(i=0; i<len; i++) |
|
| 28 |
+ dst[i] = src[i] * mul; |
|
| 29 |
+} |
|
| 30 |
+ |
|
| 31 |
+static av_always_inline int float_to_int16_one(const float *src){
|
|
| 32 |
+ return av_clip_int16(lrintf(*src)); |
|
| 33 |
+} |
|
| 34 |
+ |
|
| 35 |
+static void float_to_int16_c(int16_t *dst, const float *src, long len) |
|
| 36 |
+{
|
|
| 37 |
+ int i; |
|
| 38 |
+ for(i=0; i<len; i++) |
|
| 39 |
+ dst[i] = float_to_int16_one(src+i); |
|
| 40 |
+} |
|
| 41 |
+ |
|
| 42 |
+static void float_to_int16_interleave_c(int16_t *dst, const float **src, |
|
| 43 |
+ long len, int channels) |
|
| 44 |
+{
|
|
| 45 |
+ int i,j,c; |
|
| 46 |
+ if(channels==2){
|
|
| 47 |
+ for(i=0; i<len; i++){
|
|
| 48 |
+ dst[2*i] = float_to_int16_one(src[0]+i); |
|
| 49 |
+ dst[2*i+1] = float_to_int16_one(src[1]+i); |
|
| 50 |
+ } |
|
| 51 |
+ }else{
|
|
| 52 |
+ for(c=0; c<channels; c++) |
|
| 53 |
+ for(i=0, j=c; i<len; i++, j+=channels) |
|
| 54 |
+ dst[j] = float_to_int16_one(src[c]+i); |
|
| 55 |
+ } |
|
| 56 |
+} |
|
| 57 |
+ |
|
| 58 |
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) |
|
| 59 |
+{
|
|
| 60 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
|
| 61 |
+ c->float_to_int16 = float_to_int16_c; |
|
| 62 |
+ c->float_to_int16_interleave = float_to_int16_interleave_c; |
|
| 63 |
+ |
|
| 64 |
+ if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); |
|
| 65 |
+ if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); |
|
| 66 |
+ if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); |
|
| 67 |
+} |
| 0 | 68 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,79 @@ |
| 0 |
+/* |
|
| 1 |
+ * Format Conversion Utils |
|
| 2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
| 3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
| 4 |
+ * |
|
| 5 |
+ * This file is part of FFmpeg. |
|
| 6 |
+ * |
|
| 7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 9 |
+ * License as published by the Free Software Foundation; either |
|
| 10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 11 |
+ * |
|
| 12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 15 |
+ * Lesser General Public License for more details. |
|
| 16 |
+ * |
|
| 17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 20 |
+ */ |
|
| 21 |
+ |
|
| 22 |
+#ifndef AVCODEC_FMTCONVERT_H |
|
| 23 |
+#define AVCODEC_FMTCONVERT_H |
|
| 24 |
+ |
|
| 25 |
+#include "avcodec.h" |
|
| 26 |
+ |
|
| 27 |
+typedef struct FmtConvertContext {
|
|
| 28 |
+ /** |
|
| 29 |
+ * Convert an array of int32_t to float and multiply by a float value. |
|
| 30 |
+ * @param dst destination array of float. |
|
| 31 |
+ * constraints: 16-byte aligned |
|
| 32 |
+ * @param src source array of int32_t. |
|
| 33 |
+ * constraints: 16-byte aligned |
|
| 34 |
+ * @param len number of elements to convert. |
|
| 35 |
+ * constraints: multiple of 8 |
|
| 36 |
+ */ |
|
| 37 |
+ void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); |
|
| 38 |
+ |
|
| 39 |
+ /** |
|
| 40 |
+ * Convert an array of float to an array of int16_t. |
|
| 41 |
+ * |
|
| 42 |
+ * Convert floats in the range [-32768.0,32767.0] to ints |
|
| 43 |
+ * without rescaling |
|
| 44 |
+ * |
|
| 45 |
+ * @param dst destination array of int16_t. |
|
| 46 |
+ * constraints: 16-byte aligned |
|
| 47 |
+ * @param src source array of float. |
|
| 48 |
+ * constraints: 16-byte aligned |
|
| 49 |
+ * @param len number of elements to convert. |
|
| 50 |
+ * constraints: multiple of 8 |
|
| 51 |
+ */ |
|
| 52 |
+ void (*float_to_int16)(int16_t *dst, const float *src, long len); |
|
| 53 |
+ |
|
| 54 |
+ /** |
|
| 55 |
+ * Convert multiple arrays of float to an interleaved array of int16_t. |
|
| 56 |
+ * |
|
| 57 |
+ * Convert floats in the range [-32768.0,32767.0] to ints |
|
| 58 |
+ * without rescaling |
|
| 59 |
+ * |
|
| 60 |
+ * @param dst destination array of interleaved int16_t. |
|
| 61 |
+ * constraints: 16-byte aligned |
|
| 62 |
+ * @param src source array of float arrays, one for each channel. |
|
| 63 |
+ * constraints: 16-byte aligned |
|
| 64 |
+ * @param len number of elements to convert. |
|
| 65 |
+ * constraints: multiple of 8 |
|
| 66 |
+ * @param channels number of channels |
|
| 67 |
+ */ |
|
| 68 |
+ void (*float_to_int16_interleave)(int16_t *dst, const float **src, |
|
| 69 |
+ long len, int channels); |
|
| 70 |
+} FmtConvertContext; |
|
| 71 |
+ |
|
| 72 |
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); |
|
| 73 |
+ |
|
| 74 |
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); |
|
| 75 |
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); |
|
| 76 |
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); |
|
| 77 |
+ |
|
| 78 |
+#endif /* AVCODEC_FMTCONVERT_H */ |
| ... | ... |
@@ -38,6 +38,7 @@ |
| 38 | 38 |
#include "avcodec.h" |
| 39 | 39 |
#include "dsputil.h" |
| 40 | 40 |
#include "fft.h" |
| 41 |
+#include "fmtconvert.h" |
|
| 41 | 42 |
|
| 42 | 43 |
#define ALT_BITSTREAM_READER_LE |
| 43 | 44 |
#include "get_bits.h" |
| ... | ... |
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
|
| 52 | 52 |
float scale_bias; |
| 53 | 53 |
DSPContext dsp; |
| 54 | 54 |
FFTContext imdct_ctx; |
| 55 |
+ FmtConvertContext fmt_conv; |
|
| 55 | 56 |
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; |
| 56 | 57 |
} NellyMoserDecodeContext; |
| 57 | 58 |
|
| ... | ... |
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
|
| 134 | 134 |
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0); |
| 135 | 135 |
|
| 136 | 136 |
dsputil_init(&s->dsp, avctx); |
| 137 |
+ ff_fmt_convert_init(&s->fmt_conv, avctx); |
|
| 137 | 138 |
|
| 138 | 139 |
s->scale_bias = 1.0/(1*8); |
| 139 | 140 |
|
| ... | ... |
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx, |
| 175 | 175 |
|
| 176 | 176 |
for (i=0 ; i<blocks ; i++) {
|
| 177 | 177 |
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf); |
| 178 |
- s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); |
|
| 178 |
+ s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES); |
|
| 179 | 179 |
*data_size += NELLY_SAMPLES*sizeof(int16_t); |
| 180 | 180 |
} |
| 181 | 181 |
|
| ... | ... |
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa |
| 122 | 122 |
} |
| 123 | 123 |
} |
| 124 | 124 |
|
| 125 |
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) |
|
| 126 |
-{
|
|
| 127 |
- union {
|
|
| 128 |
- vector float v; |
|
| 129 |
- float s[4]; |
|
| 130 |
- } mul_u; |
|
| 131 |
- int i; |
|
| 132 |
- vector float src1, src2, dst1, dst2, mul_v, zero; |
|
| 133 |
- |
|
| 134 |
- zero = (vector float)vec_splat_u32(0); |
|
| 135 |
- mul_u.s[0] = mul; |
|
| 136 |
- mul_v = vec_splat(mul_u.v, 0); |
|
| 137 |
- |
|
| 138 |
- for(i=0; i<len; i+=8) {
|
|
| 139 |
- src1 = vec_ctf(vec_ld(0, src+i), 0); |
|
| 140 |
- src2 = vec_ctf(vec_ld(16, src+i), 0); |
|
| 141 |
- dst1 = vec_madd(src1, mul_v, zero); |
|
| 142 |
- dst2 = vec_madd(src2, mul_v, zero); |
|
| 143 |
- vec_st(dst1, 0, dst+i); |
|
| 144 |
- vec_st(dst2, 16, dst+i); |
|
| 145 |
- } |
|
| 146 |
-} |
|
| 147 |
- |
|
| 148 |
- |
|
| 149 |
-static vector signed short |
|
| 150 |
-float_to_int16_one_altivec(const float *src) |
|
| 151 |
-{
|
|
| 152 |
- vector float s0 = vec_ld(0, src); |
|
| 153 |
- vector float s1 = vec_ld(16, src); |
|
| 154 |
- vector signed int t0 = vec_cts(s0, 0); |
|
| 155 |
- vector signed int t1 = vec_cts(s1, 0); |
|
| 156 |
- return vec_packs(t0,t1); |
|
| 157 |
-} |
|
| 158 |
- |
|
| 159 |
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len) |
|
| 160 |
-{
|
|
| 161 |
- int i; |
|
| 162 |
- vector signed short d0, d1, d; |
|
| 163 |
- vector unsigned char align; |
|
| 164 |
- if(((long)dst)&15) //FIXME |
|
| 165 |
- for(i=0; i<len-7; i+=8) {
|
|
| 166 |
- d0 = vec_ld(0, dst+i); |
|
| 167 |
- d = float_to_int16_one_altivec(src+i); |
|
| 168 |
- d1 = vec_ld(15, dst+i); |
|
| 169 |
- d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); |
|
| 170 |
- align = vec_lvsr(0, dst+i); |
|
| 171 |
- d0 = vec_perm(d1, d, align); |
|
| 172 |
- d1 = vec_perm(d, d1, align); |
|
| 173 |
- vec_st(d0, 0, dst+i); |
|
| 174 |
- vec_st(d1,15, dst+i); |
|
| 175 |
- } |
|
| 176 |
- else |
|
| 177 |
- for(i=0; i<len-7; i+=8) {
|
|
| 178 |
- d = float_to_int16_one_altivec(src+i); |
|
| 179 |
- vec_st(d, 0, dst+i); |
|
| 180 |
- } |
|
| 181 |
-} |
|
| 182 |
- |
|
| 183 |
-static void |
|
| 184 |
-float_to_int16_interleave_altivec(int16_t *dst, const float **src, |
|
| 185 |
- long len, int channels) |
|
| 186 |
-{
|
|
| 187 |
- int i; |
|
| 188 |
- vector signed short d0, d1, d2, c0, c1, t0, t1; |
|
| 189 |
- vector unsigned char align; |
|
| 190 |
- if(channels == 1) |
|
| 191 |
- float_to_int16_altivec(dst, src[0], len); |
|
| 192 |
- else |
|
| 193 |
- if (channels == 2) {
|
|
| 194 |
- if(((long)dst)&15) |
|
| 195 |
- for(i=0; i<len-7; i+=8) {
|
|
| 196 |
- d0 = vec_ld(0, dst + i); |
|
| 197 |
- t0 = float_to_int16_one_altivec(src[0] + i); |
|
| 198 |
- d1 = vec_ld(31, dst + i); |
|
| 199 |
- t1 = float_to_int16_one_altivec(src[1] + i); |
|
| 200 |
- c0 = vec_mergeh(t0, t1); |
|
| 201 |
- c1 = vec_mergel(t0, t1); |
|
| 202 |
- d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); |
|
| 203 |
- align = vec_lvsr(0, dst + i); |
|
| 204 |
- d0 = vec_perm(d2, c0, align); |
|
| 205 |
- d1 = vec_perm(c0, c1, align); |
|
| 206 |
- vec_st(d0, 0, dst + i); |
|
| 207 |
- d0 = vec_perm(c1, d2, align); |
|
| 208 |
- vec_st(d1, 15, dst + i); |
|
| 209 |
- vec_st(d0, 31, dst + i); |
|
| 210 |
- dst+=8; |
|
| 211 |
- } |
|
| 212 |
- else |
|
| 213 |
- for(i=0; i<len-7; i+=8) {
|
|
| 214 |
- t0 = float_to_int16_one_altivec(src[0] + i); |
|
| 215 |
- t1 = float_to_int16_one_altivec(src[1] + i); |
|
| 216 |
- d0 = vec_mergeh(t0, t1); |
|
| 217 |
- d1 = vec_mergel(t0, t1); |
|
| 218 |
- vec_st(d0, 0, dst + i); |
|
| 219 |
- vec_st(d1, 16, dst + i); |
|
| 220 |
- dst+=8; |
|
| 221 |
- } |
|
| 222 |
- } else {
|
|
| 223 |
- DECLARE_ALIGNED(16, int16_t, tmp)[len]; |
|
| 224 |
- int c, j; |
|
| 225 |
- for (c = 0; c < channels; c++) {
|
|
| 226 |
- float_to_int16_altivec(tmp, src[c], len); |
|
| 227 |
- for (i = 0, j = c; i < len; i++, j+=channels) {
|
|
| 228 |
- dst[j] = tmp[i]; |
|
| 229 |
- } |
|
| 230 |
- } |
|
| 231 |
- } |
|
| 232 |
-} |
|
| 233 |
- |
|
| 234 | 125 |
void float_init_altivec(DSPContext* c, AVCodecContext *avctx) |
| 235 | 126 |
{
|
| 236 | 127 |
c->vector_fmul = vector_fmul_altivec; |
| 237 | 128 |
c->vector_fmul_reverse = vector_fmul_reverse_altivec; |
| 238 | 129 |
c->vector_fmul_add = vector_fmul_add_altivec; |
| 239 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; |
|
| 240 | 130 |
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
| 241 | 131 |
c->vector_fmul_window = vector_fmul_window_altivec; |
| 242 |
- c->float_to_int16 = float_to_int16_altivec; |
|
| 243 |
- c->float_to_int16_interleave = float_to_int16_interleave_altivec; |
|
| 244 | 132 |
} |
| 245 | 133 |
} |
| 246 | 134 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,142 @@ |
| 0 |
+/* |
|
| 1 |
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
|
| 2 |
+ * |
|
| 3 |
+ * This file is part of FFmpeg. |
|
| 4 |
+ * |
|
| 5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 7 |
+ * License as published by the Free Software Foundation; either |
|
| 8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 9 |
+ * |
|
| 10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 13 |
+ * Lesser General Public License for more details. |
|
| 14 |
+ * |
|
| 15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 18 |
+ */ |
|
| 19 |
+ |
|
| 20 |
+#include "libavcodec/fmtconvert.h" |
|
| 21 |
+ |
|
| 22 |
+#include "dsputil_altivec.h" |
|
| 23 |
+#include "util_altivec.h" |
|
| 24 |
+ |
|
| 25 |
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) |
|
| 26 |
+{
|
|
| 27 |
+ union {
|
|
| 28 |
+ vector float v; |
|
| 29 |
+ float s[4]; |
|
| 30 |
+ } mul_u; |
|
| 31 |
+ int i; |
|
| 32 |
+ vector float src1, src2, dst1, dst2, mul_v, zero; |
|
| 33 |
+ |
|
| 34 |
+ zero = (vector float)vec_splat_u32(0); |
|
| 35 |
+ mul_u.s[0] = mul; |
|
| 36 |
+ mul_v = vec_splat(mul_u.v, 0); |
|
| 37 |
+ |
|
| 38 |
+ for(i=0; i<len; i+=8) {
|
|
| 39 |
+ src1 = vec_ctf(vec_ld(0, src+i), 0); |
|
| 40 |
+ src2 = vec_ctf(vec_ld(16, src+i), 0); |
|
| 41 |
+ dst1 = vec_madd(src1, mul_v, zero); |
|
| 42 |
+ dst2 = vec_madd(src2, mul_v, zero); |
|
| 43 |
+ vec_st(dst1, 0, dst+i); |
|
| 44 |
+ vec_st(dst2, 16, dst+i); |
|
| 45 |
+ } |
|
| 46 |
+} |
|
| 47 |
+ |
|
| 48 |
+ |
|
| 49 |
+static vector signed short |
|
| 50 |
+float_to_int16_one_altivec(const float *src) |
|
| 51 |
+{
|
|
| 52 |
+ vector float s0 = vec_ld(0, src); |
|
| 53 |
+ vector float s1 = vec_ld(16, src); |
|
| 54 |
+ vector signed int t0 = vec_cts(s0, 0); |
|
| 55 |
+ vector signed int t1 = vec_cts(s1, 0); |
|
| 56 |
+ return vec_packs(t0,t1); |
|
| 57 |
+} |
|
| 58 |
+ |
|
| 59 |
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len) |
|
| 60 |
+{
|
|
| 61 |
+ int i; |
|
| 62 |
+ vector signed short d0, d1, d; |
|
| 63 |
+ vector unsigned char align; |
|
| 64 |
+ if(((long)dst)&15) //FIXME |
|
| 65 |
+ for(i=0; i<len-7; i+=8) {
|
|
| 66 |
+ d0 = vec_ld(0, dst+i); |
|
| 67 |
+ d = float_to_int16_one_altivec(src+i); |
|
| 68 |
+ d1 = vec_ld(15, dst+i); |
|
| 69 |
+ d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); |
|
| 70 |
+ align = vec_lvsr(0, dst+i); |
|
| 71 |
+ d0 = vec_perm(d1, d, align); |
|
| 72 |
+ d1 = vec_perm(d, d1, align); |
|
| 73 |
+ vec_st(d0, 0, dst+i); |
|
| 74 |
+ vec_st(d1,15, dst+i); |
|
| 75 |
+ } |
|
| 76 |
+ else |
|
| 77 |
+ for(i=0; i<len-7; i+=8) {
|
|
| 78 |
+ d = float_to_int16_one_altivec(src+i); |
|
| 79 |
+ vec_st(d, 0, dst+i); |
|
| 80 |
+ } |
|
| 81 |
+} |
|
| 82 |
+ |
|
| 83 |
+static void |
|
| 84 |
+float_to_int16_interleave_altivec(int16_t *dst, const float **src, |
|
| 85 |
+ long len, int channels) |
|
| 86 |
+{
|
|
| 87 |
+ int i; |
|
| 88 |
+ vector signed short d0, d1, d2, c0, c1, t0, t1; |
|
| 89 |
+ vector unsigned char align; |
|
| 90 |
+ if(channels == 1) |
|
| 91 |
+ float_to_int16_altivec(dst, src[0], len); |
|
| 92 |
+ else |
|
| 93 |
+ if (channels == 2) {
|
|
| 94 |
+ if(((long)dst)&15) |
|
| 95 |
+ for(i=0; i<len-7; i+=8) {
|
|
| 96 |
+ d0 = vec_ld(0, dst + i); |
|
| 97 |
+ t0 = float_to_int16_one_altivec(src[0] + i); |
|
| 98 |
+ d1 = vec_ld(31, dst + i); |
|
| 99 |
+ t1 = float_to_int16_one_altivec(src[1] + i); |
|
| 100 |
+ c0 = vec_mergeh(t0, t1); |
|
| 101 |
+ c1 = vec_mergel(t0, t1); |
|
| 102 |
+ d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); |
|
| 103 |
+ align = vec_lvsr(0, dst + i); |
|
| 104 |
+ d0 = vec_perm(d2, c0, align); |
|
| 105 |
+ d1 = vec_perm(c0, c1, align); |
|
| 106 |
+ vec_st(d0, 0, dst + i); |
|
| 107 |
+ d0 = vec_perm(c1, d2, align); |
|
| 108 |
+ vec_st(d1, 15, dst + i); |
|
| 109 |
+ vec_st(d0, 31, dst + i); |
|
| 110 |
+ dst+=8; |
|
| 111 |
+ } |
|
| 112 |
+ else |
|
| 113 |
+ for(i=0; i<len-7; i+=8) {
|
|
| 114 |
+ t0 = float_to_int16_one_altivec(src[0] + i); |
|
| 115 |
+ t1 = float_to_int16_one_altivec(src[1] + i); |
|
| 116 |
+ d0 = vec_mergeh(t0, t1); |
|
| 117 |
+ d1 = vec_mergel(t0, t1); |
|
| 118 |
+ vec_st(d0, 0, dst + i); |
|
| 119 |
+ vec_st(d1, 16, dst + i); |
|
| 120 |
+ dst+=8; |
|
| 121 |
+ } |
|
| 122 |
+ } else {
|
|
| 123 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; |
|
| 124 |
+ int c, j; |
|
| 125 |
+ for (c = 0; c < channels; c++) {
|
|
| 126 |
+ float_to_int16_altivec(tmp, src[c], len); |
|
| 127 |
+ for (i = 0, j = c; i < len; i++, j+=channels) {
|
|
| 128 |
+ dst[j] = tmp[i]; |
|
| 129 |
+ } |
|
| 130 |
+ } |
|
| 131 |
+ } |
|
| 132 |
+} |
|
| 133 |
+ |
|
| 134 |
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) |
|
| 135 |
+{
|
|
| 136 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; |
|
| 137 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
|
| 138 |
+ c->float_to_int16 = float_to_int16_altivec; |
|
| 139 |
+ c->float_to_int16_interleave = float_to_int16_interleave_altivec; |
|
| 140 |
+ } |
|
| 141 |
+} |
| ... | ... |
@@ -31,6 +31,7 @@ |
| 31 | 31 |
#include "get_bits.h" |
| 32 | 32 |
#include "dsputil.h" |
| 33 | 33 |
#include "fft.h" |
| 34 |
+#include "fmtconvert.h" |
|
| 34 | 35 |
|
| 35 | 36 |
#include "vorbis.h" |
| 36 | 37 |
#include "xiph.h" |
| ... | ... |
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
|
| 127 | 127 |
AVCodecContext *avccontext; |
| 128 | 128 |
GetBitContext gb; |
| 129 | 129 |
DSPContext dsp; |
| 130 |
+ FmtConvertContext fmt_conv; |
|
| 130 | 131 |
|
| 131 | 132 |
FFTContext mdct[2]; |
| 132 | 133 |
uint_fast8_t first_frame; |
| ... | ... |
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext) |
| 961 | 961 |
|
| 962 | 962 |
vc->avccontext = avccontext; |
| 963 | 963 |
dsputil_init(&vc->dsp, avccontext); |
| 964 |
+ ff_fmt_convert_init(&vc->fmt_conv, avccontext); |
|
| 964 | 965 |
|
| 965 | 966 |
vc->scale_bias = 32768.0f; |
| 966 | 967 |
|
| ... | ... |
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext, |
| 1636 | 1636 |
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i]; |
| 1637 | 1637 |
} |
| 1638 | 1638 |
|
| 1639 |
- vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels); |
|
| 1639 |
+ vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len, |
|
| 1640 |
+ vc->audio_channels); |
|
| 1640 | 1641 |
*data_size = len * 2 * vc->audio_channels; |
| 1641 | 1642 |
|
| 1642 | 1643 |
return buf_size ; |
| ... | ... |
@@ -26,6 +26,7 @@ |
| 26 | 26 |
#include "put_bits.h" |
| 27 | 27 |
#include "dsputil.h" |
| 28 | 28 |
#include "fft.h" |
| 29 |
+#include "fmtconvert.h" |
|
| 29 | 30 |
|
| 30 | 31 |
/* size of blocks */ |
| 31 | 32 |
#define BLOCK_MIN_BITS 7 |
| ... | ... |
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
|
| 134 | 134 |
float lsp_pow_m_table1[(1 << LSP_POW_BITS)]; |
| 135 | 135 |
float lsp_pow_m_table2[(1 << LSP_POW_BITS)]; |
| 136 | 136 |
DSPContext dsp; |
| 137 |
+ FmtConvertContext fmt_conv; |
|
| 137 | 138 |
|
| 138 | 139 |
#ifdef TRACE |
| 139 | 140 |
int frame_count; |
| ... | ... |
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples) |
| 791 | 791 |
incr = s->nb_channels; |
| 792 | 792 |
for (ch = 0; ch < MAX_CHANNELS; ch++) |
| 793 | 793 |
output[ch] = s->frame_out[ch]; |
| 794 |
- s->dsp.float_to_int16_interleave(samples, output, n, incr); |
|
| 794 |
+ s->fmt_conv.float_to_int16_interleave(samples, output, n, incr); |
|
| 795 | 795 |
for (ch = 0; ch < incr; ch++) {
|
| 796 | 796 |
/* prepare for next block */ |
| 797 | 797 |
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float)); |
| ... | ... |
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o |
| 39 | 39 |
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o |
| 40 | 40 |
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ |
| 41 | 41 |
x86/deinterlace.o \ |
| 42 |
+ x86/fmtconvert.o \ |
|
| 42 | 43 |
x86/h264_chromamc.o \ |
| 43 | 44 |
$(YASM-OBJS-yes) |
| 44 | 45 |
|
| ... | ... |
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o |
| 47 | 47 |
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \ |
| 48 | 48 |
x86/dsputil_mmx.o \ |
| 49 | 49 |
x86/fdct_mmx.o \ |
| 50 |
+ x86/fmtconvert_mmx.o \ |
|
| 50 | 51 |
x86/idct_mmx_xvid.o \ |
| 51 | 52 |
x86/idct_sse2_xvid.o \ |
| 52 | 53 |
x86/motion_est_mmx.o \ |
| ... | ... |
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s |
| 2349 | 2349 |
} |
| 2350 | 2350 |
#endif /* HAVE_6REGS */ |
| 2351 | 2351 |
|
| 2352 |
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) |
|
| 2353 |
-{
|
|
| 2354 |
- x86_reg i = -4*len; |
|
| 2355 |
- __asm__ volatile( |
|
| 2356 |
- "movss %3, %%xmm4 \n" |
|
| 2357 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
| 2358 |
- "1: \n" |
|
| 2359 |
- "cvtpi2ps (%2,%0), %%xmm0 \n" |
|
| 2360 |
- "cvtpi2ps 8(%2,%0), %%xmm1 \n" |
|
| 2361 |
- "cvtpi2ps 16(%2,%0), %%xmm2 \n" |
|
| 2362 |
- "cvtpi2ps 24(%2,%0), %%xmm3 \n" |
|
| 2363 |
- "movlhps %%xmm1, %%xmm0 \n" |
|
| 2364 |
- "movlhps %%xmm3, %%xmm2 \n" |
|
| 2365 |
- "mulps %%xmm4, %%xmm0 \n" |
|
| 2366 |
- "mulps %%xmm4, %%xmm2 \n" |
|
| 2367 |
- "movaps %%xmm0, (%1,%0) \n" |
|
| 2368 |
- "movaps %%xmm2, 16(%1,%0) \n" |
|
| 2369 |
- "add $32, %0 \n" |
|
| 2370 |
- "jl 1b \n" |
|
| 2371 |
- :"+r"(i) |
|
| 2372 |
- :"r"(dst+len), "r"(src+len), "m"(mul) |
|
| 2373 |
- ); |
|
| 2374 |
-} |
|
| 2375 |
- |
|
| 2376 |
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) |
|
| 2377 |
-{
|
|
| 2378 |
- x86_reg i = -4*len; |
|
| 2379 |
- __asm__ volatile( |
|
| 2380 |
- "movss %3, %%xmm4 \n" |
|
| 2381 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
| 2382 |
- "1: \n" |
|
| 2383 |
- "cvtdq2ps (%2,%0), %%xmm0 \n" |
|
| 2384 |
- "cvtdq2ps 16(%2,%0), %%xmm1 \n" |
|
| 2385 |
- "mulps %%xmm4, %%xmm0 \n" |
|
| 2386 |
- "mulps %%xmm4, %%xmm1 \n" |
|
| 2387 |
- "movaps %%xmm0, (%1,%0) \n" |
|
| 2388 |
- "movaps %%xmm1, 16(%1,%0) \n" |
|
| 2389 |
- "add $32, %0 \n" |
|
| 2390 |
- "jl 1b \n" |
|
| 2391 |
- :"+r"(i) |
|
| 2392 |
- :"r"(dst+len), "r"(src+len), "m"(mul) |
|
| 2393 |
- ); |
|
| 2394 |
-} |
|
| 2395 |
- |
|
| 2396 | 2352 |
static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
| 2397 | 2353 |
int len) |
| 2398 | 2354 |
{
|
| ... | ... |
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
| 2427 | 2427 |
); |
| 2428 | 2428 |
} |
| 2429 | 2429 |
|
| 2430 |
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
|
| 2431 |
- x86_reg reglen = len; |
|
| 2432 |
- // not bit-exact: pf2id uses different rounding than C and SSE |
|
| 2433 |
- __asm__ volatile( |
|
| 2434 |
- "add %0 , %0 \n\t" |
|
| 2435 |
- "lea (%2,%0,2) , %2 \n\t" |
|
| 2436 |
- "add %0 , %1 \n\t" |
|
| 2437 |
- "neg %0 \n\t" |
|
| 2438 |
- "1: \n\t" |
|
| 2439 |
- "pf2id (%2,%0,2) , %%mm0 \n\t" |
|
| 2440 |
- "pf2id 8(%2,%0,2) , %%mm1 \n\t" |
|
| 2441 |
- "pf2id 16(%2,%0,2) , %%mm2 \n\t" |
|
| 2442 |
- "pf2id 24(%2,%0,2) , %%mm3 \n\t" |
|
| 2443 |
- "packssdw %%mm1 , %%mm0 \n\t" |
|
| 2444 |
- "packssdw %%mm3 , %%mm2 \n\t" |
|
| 2445 |
- "movq %%mm0 , (%1,%0) \n\t" |
|
| 2446 |
- "movq %%mm2 , 8(%1,%0) \n\t" |
|
| 2447 |
- "add $16 , %0 \n\t" |
|
| 2448 |
- " js 1b \n\t" |
|
| 2449 |
- "femms \n\t" |
|
| 2450 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 2451 |
- ); |
|
| 2452 |
-} |
|
| 2453 |
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
|
| 2454 |
- x86_reg reglen = len; |
|
| 2455 |
- __asm__ volatile( |
|
| 2456 |
- "add %0 , %0 \n\t" |
|
| 2457 |
- "lea (%2,%0,2) , %2 \n\t" |
|
| 2458 |
- "add %0 , %1 \n\t" |
|
| 2459 |
- "neg %0 \n\t" |
|
| 2460 |
- "1: \n\t" |
|
| 2461 |
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t" |
|
| 2462 |
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" |
|
| 2463 |
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" |
|
| 2464 |
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" |
|
| 2465 |
- "packssdw %%mm1 , %%mm0 \n\t" |
|
| 2466 |
- "packssdw %%mm3 , %%mm2 \n\t" |
|
| 2467 |
- "movq %%mm0 , (%1,%0) \n\t" |
|
| 2468 |
- "movq %%mm2 , 8(%1,%0) \n\t" |
|
| 2469 |
- "add $16 , %0 \n\t" |
|
| 2470 |
- " js 1b \n\t" |
|
| 2471 |
- "emms \n\t" |
|
| 2472 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 2473 |
- ); |
|
| 2474 |
-} |
|
| 2475 |
- |
|
| 2476 |
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
|
| 2477 |
- x86_reg reglen = len; |
|
| 2478 |
- __asm__ volatile( |
|
| 2479 |
- "add %0 , %0 \n\t" |
|
| 2480 |
- "lea (%2,%0,2) , %2 \n\t" |
|
| 2481 |
- "add %0 , %1 \n\t" |
|
| 2482 |
- "neg %0 \n\t" |
|
| 2483 |
- "1: \n\t" |
|
| 2484 |
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" |
|
| 2485 |
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" |
|
| 2486 |
- "packssdw %%xmm1 , %%xmm0 \n\t" |
|
| 2487 |
- "movdqa %%xmm0 , (%1,%0) \n\t" |
|
| 2488 |
- "add $16 , %0 \n\t" |
|
| 2489 |
- " js 1b \n\t" |
|
| 2490 |
- :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 2491 |
- ); |
|
| 2492 |
-} |
|
| 2493 |
- |
|
| 2494 | 2430 |
void ff_vp3_idct_mmx(int16_t *input_data); |
| 2495 | 2431 |
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
| 2496 | 2432 |
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
| ... | ... |
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data); |
| 2504 | 2504 |
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
| 2505 | 2505 |
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
| 2506 | 2506 |
|
| 2507 |
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); |
|
| 2508 |
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); |
|
| 2509 |
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); |
|
| 2510 | 2507 |
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); |
| 2511 | 2508 |
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); |
| 2512 | 2509 |
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); |
| ... | ... |
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const |
| 2516 | 2516 |
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); |
| 2517 | 2517 |
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); |
| 2518 | 2518 |
|
| 2519 |
-#if !HAVE_YASM |
|
| 2520 |
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) |
|
| 2521 |
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
| 2522 |
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
| 2523 |
-#endif |
|
| 2524 |
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse |
|
| 2525 |
- |
|
| 2526 |
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ |
|
| 2527 |
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ |
|
| 2528 |
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
| 2529 |
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\ |
|
| 2530 |
- int i,j,c;\ |
|
| 2531 |
- for(c=0; c<channels; c++){\
|
|
| 2532 |
- float_to_int16_##cpu(tmp, src[c], len);\ |
|
| 2533 |
- for(i=0, j=c; i<len; i++, j+=channels)\ |
|
| 2534 |
- dst[j] = tmp[i];\ |
|
| 2535 |
- }\ |
|
| 2536 |
-}\ |
|
| 2537 |
-\ |
|
| 2538 |
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
| 2539 |
- if(channels==1)\ |
|
| 2540 |
- float_to_int16_##cpu(dst, src[0], len);\ |
|
| 2541 |
- else if(channels==2){\
|
|
| 2542 |
- x86_reg reglen = len; \ |
|
| 2543 |
- const float *src0 = src[0];\ |
|
| 2544 |
- const float *src1 = src[1];\ |
|
| 2545 |
- __asm__ volatile(\ |
|
| 2546 |
- "shl $2, %0 \n"\ |
|
| 2547 |
- "add %0, %1 \n"\ |
|
| 2548 |
- "add %0, %2 \n"\ |
|
| 2549 |
- "add %0, %3 \n"\ |
|
| 2550 |
- "neg %0 \n"\ |
|
| 2551 |
- body\ |
|
| 2552 |
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ |
|
| 2553 |
- );\ |
|
| 2554 |
- }else if(channels==6){\
|
|
| 2555 |
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\ |
|
| 2556 |
- }else\ |
|
| 2557 |
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ |
|
| 2558 |
-} |
|
| 2559 |
- |
|
| 2560 |
-FLOAT_TO_INT16_INTERLEAVE(3dnow, |
|
| 2561 |
- "1: \n" |
|
| 2562 |
- "pf2id (%2,%0), %%mm0 \n" |
|
| 2563 |
- "pf2id 8(%2,%0), %%mm1 \n" |
|
| 2564 |
- "pf2id (%3,%0), %%mm2 \n" |
|
| 2565 |
- "pf2id 8(%3,%0), %%mm3 \n" |
|
| 2566 |
- "packssdw %%mm1, %%mm0 \n" |
|
| 2567 |
- "packssdw %%mm3, %%mm2 \n" |
|
| 2568 |
- "movq %%mm0, %%mm1 \n" |
|
| 2569 |
- "punpcklwd %%mm2, %%mm0 \n" |
|
| 2570 |
- "punpckhwd %%mm2, %%mm1 \n" |
|
| 2571 |
- "movq %%mm0, (%1,%0)\n" |
|
| 2572 |
- "movq %%mm1, 8(%1,%0)\n" |
|
| 2573 |
- "add $16, %0 \n" |
|
| 2574 |
- "js 1b \n" |
|
| 2575 |
- "femms \n" |
|
| 2576 |
-) |
|
| 2577 |
- |
|
| 2578 |
-FLOAT_TO_INT16_INTERLEAVE(sse, |
|
| 2579 |
- "1: \n" |
|
| 2580 |
- "cvtps2pi (%2,%0), %%mm0 \n" |
|
| 2581 |
- "cvtps2pi 8(%2,%0), %%mm1 \n" |
|
| 2582 |
- "cvtps2pi (%3,%0), %%mm2 \n" |
|
| 2583 |
- "cvtps2pi 8(%3,%0), %%mm3 \n" |
|
| 2584 |
- "packssdw %%mm1, %%mm0 \n" |
|
| 2585 |
- "packssdw %%mm3, %%mm2 \n" |
|
| 2586 |
- "movq %%mm0, %%mm1 \n" |
|
| 2587 |
- "punpcklwd %%mm2, %%mm0 \n" |
|
| 2588 |
- "punpckhwd %%mm2, %%mm1 \n" |
|
| 2589 |
- "movq %%mm0, (%1,%0)\n" |
|
| 2590 |
- "movq %%mm1, 8(%1,%0)\n" |
|
| 2591 |
- "add $16, %0 \n" |
|
| 2592 |
- "js 1b \n" |
|
| 2593 |
- "emms \n" |
|
| 2594 |
-) |
|
| 2595 |
- |
|
| 2596 |
-FLOAT_TO_INT16_INTERLEAVE(sse2, |
|
| 2597 |
- "1: \n" |
|
| 2598 |
- "cvtps2dq (%2,%0), %%xmm0 \n" |
|
| 2599 |
- "cvtps2dq (%3,%0), %%xmm1 \n" |
|
| 2600 |
- "packssdw %%xmm1, %%xmm0 \n" |
|
| 2601 |
- "movhlps %%xmm0, %%xmm1 \n" |
|
| 2602 |
- "punpcklwd %%xmm1, %%xmm0 \n" |
|
| 2603 |
- "movdqa %%xmm0, (%1,%0) \n" |
|
| 2604 |
- "add $16, %0 \n" |
|
| 2605 |
- "js 1b \n" |
|
| 2606 |
-) |
|
| 2607 |
- |
|
| 2608 |
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
|
|
| 2609 |
- if(channels==6) |
|
| 2610 |
- ff_float_to_int16_interleave6_3dn2(dst, src, len); |
|
| 2611 |
- else |
|
| 2612 |
- float_to_int16_interleave_3dnow(dst, src, len, channels); |
|
| 2613 |
-} |
|
| 2614 |
- |
|
| 2615 | 2519 |
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); |
| 2616 | 2520 |
|
| 2617 | 2521 |
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| ... | ... |
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 2968 | 2968 |
if(mm_flags & AV_CPU_FLAG_3DNOW){
|
| 2969 | 2969 |
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
| 2970 | 2970 |
c->vector_fmul = vector_fmul_3dnow; |
| 2971 |
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
| 2972 |
- c->float_to_int16 = float_to_int16_3dnow; |
|
| 2973 |
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
|
| 2974 |
- } |
|
| 2975 | 2971 |
} |
| 2976 | 2972 |
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
|
| 2977 | 2973 |
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
| 2978 | 2974 |
#if HAVE_6REGS |
| 2979 | 2975 |
c->vector_fmul_window = vector_fmul_window_3dnow2; |
| 2980 | 2976 |
#endif |
| 2981 |
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
| 2982 |
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2; |
|
| 2983 |
- } |
|
| 2984 | 2977 |
} |
| 2985 | 2978 |
if(mm_flags & AV_CPU_FLAG_MMX2){
|
| 2986 | 2979 |
#if HAVE_YASM |
| ... | ... |
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 2997 | 2997 |
#if HAVE_6REGS |
| 2998 | 2998 |
c->vector_fmul_window = vector_fmul_window_sse; |
| 2999 | 2999 |
#endif |
| 3000 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
|
| 3001 | 3000 |
c->vector_clipf = vector_clipf_sse; |
| 3002 |
- c->float_to_int16 = float_to_int16_sse; |
|
| 3003 |
- c->float_to_int16_interleave = float_to_int16_interleave_sse; |
|
| 3004 | 3001 |
#if HAVE_YASM |
| 3005 | 3002 |
c->scalarproduct_float = ff_scalarproduct_float_sse; |
| 3006 | 3003 |
#endif |
| ... | ... |
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 3008 | 3008 |
if(mm_flags & AV_CPU_FLAG_3DNOW) |
| 3009 | 3009 |
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse |
| 3010 | 3010 |
if(mm_flags & AV_CPU_FLAG_SSE2){
|
| 3011 |
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
|
| 3012 |
- c->float_to_int16 = float_to_int16_sse2; |
|
| 3013 |
- c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
|
| 3014 | 3011 |
#if HAVE_YASM |
| 3015 | 3012 |
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; |
| 3016 | 3013 |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
| ... | ... |
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
| 30 | 30 |
|
| 31 | 31 |
section .text align=16 |
| 32 | 32 |
|
| 33 |
-%macro PSWAPD_SSE 2 |
|
| 34 |
- pshufw %1, %2, 0x4e |
|
| 35 |
-%endmacro |
|
| 36 |
-%macro PSWAPD_3DN1 2 |
|
| 37 |
- movq %1, %2 |
|
| 38 |
- psrlq %1, 32 |
|
| 39 |
- punpckldq %1, %2 |
|
| 40 |
-%endmacro |
|
| 41 |
- |
|
| 42 |
-%macro FLOAT_TO_INT16_INTERLEAVE6 1 |
|
| 43 |
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
|
| 44 |
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |
|
| 45 |
-%ifdef ARCH_X86_64 |
|
| 46 |
- %define lend r10d |
|
| 47 |
- mov lend, r2d |
|
| 48 |
-%else |
|
| 49 |
- %define lend dword r2m |
|
| 50 |
-%endif |
|
| 51 |
- mov src1q, [srcq+1*gprsize] |
|
| 52 |
- mov src2q, [srcq+2*gprsize] |
|
| 53 |
- mov src3q, [srcq+3*gprsize] |
|
| 54 |
- mov src4q, [srcq+4*gprsize] |
|
| 55 |
- mov src5q, [srcq+5*gprsize] |
|
| 56 |
- mov srcq, [srcq] |
|
| 57 |
- sub src1q, srcq |
|
| 58 |
- sub src2q, srcq |
|
| 59 |
- sub src3q, srcq |
|
| 60 |
- sub src4q, srcq |
|
| 61 |
- sub src5q, srcq |
|
| 62 |
-.loop: |
|
| 63 |
- cvtps2pi mm0, [srcq] |
|
| 64 |
- cvtps2pi mm1, [srcq+src1q] |
|
| 65 |
- cvtps2pi mm2, [srcq+src2q] |
|
| 66 |
- cvtps2pi mm3, [srcq+src3q] |
|
| 67 |
- cvtps2pi mm4, [srcq+src4q] |
|
| 68 |
- cvtps2pi mm5, [srcq+src5q] |
|
| 69 |
- packssdw mm0, mm3 |
|
| 70 |
- packssdw mm1, mm4 |
|
| 71 |
- packssdw mm2, mm5 |
|
| 72 |
- pswapd mm3, mm0 |
|
| 73 |
- punpcklwd mm0, mm1 |
|
| 74 |
- punpckhwd mm1, mm2 |
|
| 75 |
- punpcklwd mm2, mm3 |
|
| 76 |
- pswapd mm3, mm0 |
|
| 77 |
- punpckldq mm0, mm2 |
|
| 78 |
- punpckhdq mm2, mm1 |
|
| 79 |
- punpckldq mm1, mm3 |
|
| 80 |
- movq [dstq ], mm0 |
|
| 81 |
- movq [dstq+16], mm2 |
|
| 82 |
- movq [dstq+ 8], mm1 |
|
| 83 |
- add srcq, 8 |
|
| 84 |
- add dstq, 24 |
|
| 85 |
- sub lend, 2 |
|
| 86 |
- jg .loop |
|
| 87 |
- emms |
|
| 88 |
- RET |
|
| 89 |
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
|
| 90 |
- |
|
| 91 |
-%define pswapd PSWAPD_SSE |
|
| 92 |
-FLOAT_TO_INT16_INTERLEAVE6 sse |
|
| 93 |
-%define cvtps2pi pf2id |
|
| 94 |
-%define pswapd PSWAPD_3DN1 |
|
| 95 |
-FLOAT_TO_INT16_INTERLEAVE6 3dnow |
|
| 96 |
-%undef pswapd |
|
| 97 |
-FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
|
| 98 |
-%undef cvtps2pi |
|
| 99 |
- |
|
| 100 |
- |
|
| 101 |
- |
|
| 102 | 33 |
%macro SCALARPRODUCT 1 |
| 103 | 34 |
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) |
| 104 | 35 |
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift |
| 105 | 36 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,91 @@ |
| 0 |
+;****************************************************************************** |
|
| 1 |
+;* x86 optimized Format Conversion Utils |
|
| 2 |
+;* Copyright (c) 2008 Loren Merritt |
|
| 3 |
+;* |
|
| 4 |
+;* This file is part of FFmpeg. |
|
| 5 |
+;* |
|
| 6 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
| 7 |
+;* modify it under the terms of the GNU Lesser General Public |
|
| 8 |
+;* License as published by the Free Software Foundation; either |
|
| 9 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
| 10 |
+;* |
|
| 11 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
| 12 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 13 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 14 |
+;* Lesser General Public License for more details. |
|
| 15 |
+;* |
|
| 16 |
+;* You should have received a copy of the GNU Lesser General Public |
|
| 17 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
| 18 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 19 |
+;****************************************************************************** |
|
| 20 |
+ |
|
| 21 |
+%include "x86inc.asm" |
|
| 22 |
+ |
|
| 23 |
+section .text align=16 |
|
| 24 |
+ |
|
| 25 |
+%macro PSWAPD_SSE 2 |
|
| 26 |
+ pshufw %1, %2, 0x4e |
|
| 27 |
+%endmacro |
|
| 28 |
+%macro PSWAPD_3DN1 2 |
|
| 29 |
+ movq %1, %2 |
|
| 30 |
+ psrlq %1, 32 |
|
| 31 |
+ punpckldq %1, %2 |
|
| 32 |
+%endmacro |
|
| 33 |
+ |
|
| 34 |
+%macro FLOAT_TO_INT16_INTERLEAVE6 1 |
|
| 35 |
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
|
| 36 |
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |
|
| 37 |
+%ifdef ARCH_X86_64 |
|
| 38 |
+ %define lend r10d |
|
| 39 |
+ mov lend, r2d |
|
| 40 |
+%else |
|
| 41 |
+ %define lend dword r2m |
|
| 42 |
+%endif |
|
| 43 |
+ mov src1q, [srcq+1*gprsize] |
|
| 44 |
+ mov src2q, [srcq+2*gprsize] |
|
| 45 |
+ mov src3q, [srcq+3*gprsize] |
|
| 46 |
+ mov src4q, [srcq+4*gprsize] |
|
| 47 |
+ mov src5q, [srcq+5*gprsize] |
|
| 48 |
+ mov srcq, [srcq] |
|
| 49 |
+ sub src1q, srcq |
|
| 50 |
+ sub src2q, srcq |
|
| 51 |
+ sub src3q, srcq |
|
| 52 |
+ sub src4q, srcq |
|
| 53 |
+ sub src5q, srcq |
|
| 54 |
+.loop: |
|
| 55 |
+ cvtps2pi mm0, [srcq] |
|
| 56 |
+ cvtps2pi mm1, [srcq+src1q] |
|
| 57 |
+ cvtps2pi mm2, [srcq+src2q] |
|
| 58 |
+ cvtps2pi mm3, [srcq+src3q] |
|
| 59 |
+ cvtps2pi mm4, [srcq+src4q] |
|
| 60 |
+ cvtps2pi mm5, [srcq+src5q] |
|
| 61 |
+ packssdw mm0, mm3 |
|
| 62 |
+ packssdw mm1, mm4 |
|
| 63 |
+ packssdw mm2, mm5 |
|
| 64 |
+ pswapd mm3, mm0 |
|
| 65 |
+ punpcklwd mm0, mm1 |
|
| 66 |
+ punpckhwd mm1, mm2 |
|
| 67 |
+ punpcklwd mm2, mm3 |
|
| 68 |
+ pswapd mm3, mm0 |
|
| 69 |
+ punpckldq mm0, mm2 |
|
| 70 |
+ punpckhdq mm2, mm1 |
|
| 71 |
+ punpckldq mm1, mm3 |
|
| 72 |
+ movq [dstq ], mm0 |
|
| 73 |
+ movq [dstq+16], mm2 |
|
| 74 |
+ movq [dstq+ 8], mm1 |
|
| 75 |
+ add srcq, 8 |
|
| 76 |
+ add dstq, 24 |
|
| 77 |
+ sub lend, 2 |
|
| 78 |
+ jg .loop |
|
| 79 |
+ emms |
|
| 80 |
+ RET |
|
| 81 |
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
|
| 82 |
+ |
|
| 83 |
+%define pswapd PSWAPD_SSE |
|
| 84 |
+FLOAT_TO_INT16_INTERLEAVE6 sse |
|
| 85 |
+%define cvtps2pi pf2id |
|
| 86 |
+%define pswapd PSWAPD_3DN1 |
|
| 87 |
+FLOAT_TO_INT16_INTERLEAVE6 3dnow |
|
| 88 |
+%undef pswapd |
|
| 89 |
+FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
|
| 90 |
+%undef cvtps2pi |
| 0 | 91 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,266 @@ |
| 0 |
+/* |
|
| 1 |
+ * Format Conversion Utils |
|
| 2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
| 3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
| 4 |
+ * |
|
| 5 |
+ * This file is part of FFmpeg. |
|
| 6 |
+ * |
|
| 7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
| 8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 9 |
+ * License as published by the Free Software Foundation; either |
|
| 10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 11 |
+ * |
|
| 12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
| 13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 15 |
+ * Lesser General Public License for more details. |
|
| 16 |
+ * |
|
| 17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
| 19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 20 |
+ * |
|
| 21 |
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
| 22 |
+ */ |
|
| 23 |
+ |
|
| 24 |
+#include "libavutil/cpu.h" |
|
| 25 |
+#include "libavutil/x86_cpu.h" |
|
| 26 |
+#include "libavcodec/fmtconvert.h" |
|
| 27 |
+ |
|
| 28 |
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) |
|
| 29 |
+{
|
|
| 30 |
+ x86_reg i = -4*len; |
|
| 31 |
+ __asm__ volatile( |
|
| 32 |
+ "movss %3, %%xmm4 \n" |
|
| 33 |
+ "shufps $0, %%xmm4, %%xmm4 \n" |
|
| 34 |
+ "1: \n" |
|
| 35 |
+ "cvtpi2ps (%2,%0), %%xmm0 \n" |
|
| 36 |
+ "cvtpi2ps 8(%2,%0), %%xmm1 \n" |
|
| 37 |
+ "cvtpi2ps 16(%2,%0), %%xmm2 \n" |
|
| 38 |
+ "cvtpi2ps 24(%2,%0), %%xmm3 \n" |
|
| 39 |
+ "movlhps %%xmm1, %%xmm0 \n" |
|
| 40 |
+ "movlhps %%xmm3, %%xmm2 \n" |
|
| 41 |
+ "mulps %%xmm4, %%xmm0 \n" |
|
| 42 |
+ "mulps %%xmm4, %%xmm2 \n" |
|
| 43 |
+ "movaps %%xmm0, (%1,%0) \n" |
|
| 44 |
+ "movaps %%xmm2, 16(%1,%0) \n" |
|
| 45 |
+ "add $32, %0 \n" |
|
| 46 |
+ "jl 1b \n" |
|
| 47 |
+ :"+r"(i) |
|
| 48 |
+ :"r"(dst+len), "r"(src+len), "m"(mul) |
|
| 49 |
+ ); |
|
| 50 |
+} |
|
| 51 |
+ |
|
| 52 |
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) |
|
| 53 |
+{
|
|
| 54 |
+ x86_reg i = -4*len; |
|
| 55 |
+ __asm__ volatile( |
|
| 56 |
+ "movss %3, %%xmm4 \n" |
|
| 57 |
+ "shufps $0, %%xmm4, %%xmm4 \n" |
|
| 58 |
+ "1: \n" |
|
| 59 |
+ "cvtdq2ps (%2,%0), %%xmm0 \n" |
|
| 60 |
+ "cvtdq2ps 16(%2,%0), %%xmm1 \n" |
|
| 61 |
+ "mulps %%xmm4, %%xmm0 \n" |
|
| 62 |
+ "mulps %%xmm4, %%xmm1 \n" |
|
| 63 |
+ "movaps %%xmm0, (%1,%0) \n" |
|
| 64 |
+ "movaps %%xmm1, 16(%1,%0) \n" |
|
| 65 |
+ "add $32, %0 \n" |
|
| 66 |
+ "jl 1b \n" |
|
| 67 |
+ :"+r"(i) |
|
| 68 |
+ :"r"(dst+len), "r"(src+len), "m"(mul) |
|
| 69 |
+ ); |
|
| 70 |
+} |
|
| 71 |
+ |
|
| 72 |
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
|
| 73 |
+ x86_reg reglen = len; |
|
| 74 |
+ // not bit-exact: pf2id uses different rounding than C and SSE |
|
| 75 |
+ __asm__ volatile( |
|
| 76 |
+ "add %0 , %0 \n\t" |
|
| 77 |
+ "lea (%2,%0,2) , %2 \n\t" |
|
| 78 |
+ "add %0 , %1 \n\t" |
|
| 79 |
+ "neg %0 \n\t" |
|
| 80 |
+ "1: \n\t" |
|
| 81 |
+ "pf2id (%2,%0,2) , %%mm0 \n\t" |
|
| 82 |
+ "pf2id 8(%2,%0,2) , %%mm1 \n\t" |
|
| 83 |
+ "pf2id 16(%2,%0,2) , %%mm2 \n\t" |
|
| 84 |
+ "pf2id 24(%2,%0,2) , %%mm3 \n\t" |
|
| 85 |
+ "packssdw %%mm1 , %%mm0 \n\t" |
|
| 86 |
+ "packssdw %%mm3 , %%mm2 \n\t" |
|
| 87 |
+ "movq %%mm0 , (%1,%0) \n\t" |
|
| 88 |
+ "movq %%mm2 , 8(%1,%0) \n\t" |
|
| 89 |
+ "add $16 , %0 \n\t" |
|
| 90 |
+ " js 1b \n\t" |
|
| 91 |
+ "femms \n\t" |
|
| 92 |
+ :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 93 |
+ ); |
|
| 94 |
+} |
|
| 95 |
+ |
|
| 96 |
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
|
| 97 |
+ x86_reg reglen = len; |
|
| 98 |
+ __asm__ volatile( |
|
| 99 |
+ "add %0 , %0 \n\t" |
|
| 100 |
+ "lea (%2,%0,2) , %2 \n\t" |
|
| 101 |
+ "add %0 , %1 \n\t" |
|
| 102 |
+ "neg %0 \n\t" |
|
| 103 |
+ "1: \n\t" |
|
| 104 |
+ "cvtps2pi (%2,%0,2) , %%mm0 \n\t" |
|
| 105 |
+ "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" |
|
| 106 |
+ "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" |
|
| 107 |
+ "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" |
|
| 108 |
+ "packssdw %%mm1 , %%mm0 \n\t" |
|
| 109 |
+ "packssdw %%mm3 , %%mm2 \n\t" |
|
| 110 |
+ "movq %%mm0 , (%1,%0) \n\t" |
|
| 111 |
+ "movq %%mm2 , 8(%1,%0) \n\t" |
|
| 112 |
+ "add $16 , %0 \n\t" |
|
| 113 |
+ " js 1b \n\t" |
|
| 114 |
+ "emms \n\t" |
|
| 115 |
+ :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 116 |
+ ); |
|
| 117 |
+} |
|
| 118 |
+ |
|
| 119 |
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
|
| 120 |
+ x86_reg reglen = len; |
|
| 121 |
+ __asm__ volatile( |
|
| 122 |
+ "add %0 , %0 \n\t" |
|
| 123 |
+ "lea (%2,%0,2) , %2 \n\t" |
|
| 124 |
+ "add %0 , %1 \n\t" |
|
| 125 |
+ "neg %0 \n\t" |
|
| 126 |
+ "1: \n\t" |
|
| 127 |
+ "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" |
|
| 128 |
+ "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" |
|
| 129 |
+ "packssdw %%xmm1 , %%xmm0 \n\t" |
|
| 130 |
+ "movdqa %%xmm0 , (%1,%0) \n\t" |
|
| 131 |
+ "add $16 , %0 \n\t" |
|
| 132 |
+ " js 1b \n\t" |
|
| 133 |
+ :"+r"(reglen), "+r"(dst), "+r"(src) |
|
| 134 |
+ ); |
|
| 135 |
+} |
|
| 136 |
+ |
|
| 137 |
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); |
|
| 138 |
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); |
|
| 139 |
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); |
|
| 140 |
+ |
|
| 141 |
+#if !HAVE_YASM |
|
| 142 |
+#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) |
|
| 143 |
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
| 144 |
+#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) |
|
| 145 |
+#endif |
|
| 146 |
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse |
|
| 147 |
+ |
|
| 148 |
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ |
|
| 149 |
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ |
|
| 150 |
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
| 151 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[len];\ |
|
| 152 |
+ int i,j,c;\ |
|
| 153 |
+ for(c=0; c<channels; c++){\
|
|
| 154 |
+ float_to_int16_##cpu(tmp, src[c], len);\ |
|
| 155 |
+ for(i=0, j=c; i<len; i++, j+=channels)\ |
|
| 156 |
+ dst[j] = tmp[i];\ |
|
| 157 |
+ }\ |
|
| 158 |
+}\ |
|
| 159 |
+\ |
|
| 160 |
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
|
| 161 |
+ if(channels==1)\ |
|
| 162 |
+ float_to_int16_##cpu(dst, src[0], len);\ |
|
| 163 |
+ else if(channels==2){\
|
|
| 164 |
+ x86_reg reglen = len; \ |
|
| 165 |
+ const float *src0 = src[0];\ |
|
| 166 |
+ const float *src1 = src[1];\ |
|
| 167 |
+ __asm__ volatile(\ |
|
| 168 |
+ "shl $2, %0 \n"\ |
|
| 169 |
+ "add %0, %1 \n"\ |
|
| 170 |
+ "add %0, %2 \n"\ |
|
| 171 |
+ "add %0, %3 \n"\ |
|
| 172 |
+ "neg %0 \n"\ |
|
| 173 |
+ body\ |
|
| 174 |
+ :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ |
|
| 175 |
+ );\ |
|
| 176 |
+ }else if(channels==6){\
|
|
| 177 |
+ ff_float_to_int16_interleave6_##cpu(dst, src, len);\ |
|
| 178 |
+ }else\ |
|
| 179 |
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ |
|
| 180 |
+} |
|
| 181 |
+ |
|
| 182 |
+FLOAT_TO_INT16_INTERLEAVE(3dnow, |
|
| 183 |
+ "1: \n" |
|
| 184 |
+ "pf2id (%2,%0), %%mm0 \n" |
|
| 185 |
+ "pf2id 8(%2,%0), %%mm1 \n" |
|
| 186 |
+ "pf2id (%3,%0), %%mm2 \n" |
|
| 187 |
+ "pf2id 8(%3,%0), %%mm3 \n" |
|
| 188 |
+ "packssdw %%mm1, %%mm0 \n" |
|
| 189 |
+ "packssdw %%mm3, %%mm2 \n" |
|
| 190 |
+ "movq %%mm0, %%mm1 \n" |
|
| 191 |
+ "punpcklwd %%mm2, %%mm0 \n" |
|
| 192 |
+ "punpckhwd %%mm2, %%mm1 \n" |
|
| 193 |
+ "movq %%mm0, (%1,%0)\n" |
|
| 194 |
+ "movq %%mm1, 8(%1,%0)\n" |
|
| 195 |
+ "add $16, %0 \n" |
|
| 196 |
+ "js 1b \n" |
|
| 197 |
+ "femms \n" |
|
| 198 |
+) |
|
| 199 |
+ |
|
| 200 |
+FLOAT_TO_INT16_INTERLEAVE(sse, |
|
| 201 |
+ "1: \n" |
|
| 202 |
+ "cvtps2pi (%2,%0), %%mm0 \n" |
|
| 203 |
+ "cvtps2pi 8(%2,%0), %%mm1 \n" |
|
| 204 |
+ "cvtps2pi (%3,%0), %%mm2 \n" |
|
| 205 |
+ "cvtps2pi 8(%3,%0), %%mm3 \n" |
|
| 206 |
+ "packssdw %%mm1, %%mm0 \n" |
|
| 207 |
+ "packssdw %%mm3, %%mm2 \n" |
|
| 208 |
+ "movq %%mm0, %%mm1 \n" |
|
| 209 |
+ "punpcklwd %%mm2, %%mm0 \n" |
|
| 210 |
+ "punpckhwd %%mm2, %%mm1 \n" |
|
| 211 |
+ "movq %%mm0, (%1,%0)\n" |
|
| 212 |
+ "movq %%mm1, 8(%1,%0)\n" |
|
| 213 |
+ "add $16, %0 \n" |
|
| 214 |
+ "js 1b \n" |
|
| 215 |
+ "emms \n" |
|
| 216 |
+) |
|
| 217 |
+ |
|
| 218 |
+FLOAT_TO_INT16_INTERLEAVE(sse2, |
|
| 219 |
+ "1: \n" |
|
| 220 |
+ "cvtps2dq (%2,%0), %%xmm0 \n" |
|
| 221 |
+ "cvtps2dq (%3,%0), %%xmm1 \n" |
|
| 222 |
+ "packssdw %%xmm1, %%xmm0 \n" |
|
| 223 |
+ "movhlps %%xmm0, %%xmm1 \n" |
|
| 224 |
+ "punpcklwd %%xmm1, %%xmm0 \n" |
|
| 225 |
+ "movdqa %%xmm0, (%1,%0) \n" |
|
| 226 |
+ "add $16, %0 \n" |
|
| 227 |
+ "js 1b \n" |
|
| 228 |
+) |
|
| 229 |
+ |
|
| 230 |
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
|
|
| 231 |
+ if(channels==6) |
|
| 232 |
+ ff_float_to_int16_interleave6_3dn2(dst, src, len); |
|
| 233 |
+ else |
|
| 234 |
+ float_to_int16_interleave_3dnow(dst, src, len, channels); |
|
| 235 |
+} |
|
| 236 |
+ |
|
| 237 |
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) |
|
| 238 |
+{
|
|
| 239 |
+ int mm_flags = av_get_cpu_flags(); |
|
| 240 |
+ |
|
| 241 |
+ if (mm_flags & AV_CPU_FLAG_MMX) {
|
|
| 242 |
+ |
|
| 243 |
+ if(mm_flags & AV_CPU_FLAG_3DNOW){
|
|
| 244 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
| 245 |
+ c->float_to_int16 = float_to_int16_3dnow; |
|
| 246 |
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
|
| 247 |
+ } |
|
| 248 |
+ } |
|
| 249 |
+ if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
|
|
| 250 |
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
|
| 251 |
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2; |
|
| 252 |
+ } |
|
| 253 |
+ } |
|
| 254 |
+ if(mm_flags & AV_CPU_FLAG_SSE){
|
|
| 255 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; |
|
| 256 |
+ c->float_to_int16 = float_to_int16_sse; |
|
| 257 |
+ c->float_to_int16_interleave = float_to_int16_interleave_sse; |
|
| 258 |
+ } |
|
| 259 |
+ if(mm_flags & AV_CPU_FLAG_SSE2){
|
|
| 260 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
|
| 261 |
+ c->float_to_int16 = float_to_int16_sse2; |
|
| 262 |
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
|
| 263 |
+ } |
|
| 264 |
+ } |
|
| 265 |
+} |