The buffer holding the coefficients must be padded with zeros so that DSP
functions that may overread can be used on it. Currently, the SSE2/3 versions
are an example, as they process batches of 16 bytes.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -1681,12 +1681,9 @@ unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy) |
1681 | 1681 |
} |
1682 | 1682 |
|
1683 | 1683 |
/** inverse root mean square */ |
1684 |
-int ff_irms(const int16_t *data) |
|
1684 |
+int ff_irms(DSPContext *dsp, const int16_t *data) |
|
1685 | 1685 |
{ |
1686 |
- unsigned int i, sum = 0; |
|
1687 |
- |
|
1688 |
- for (i=0; i < BLOCKSIZE; i++) |
|
1689 |
- sum += data[i] * data[i]; |
|
1686 |
+ unsigned int sum = dsp->scalarproduct_int16(data, data, BLOCKSIZE); |
|
1690 | 1687 |
|
1691 | 1688 |
if (sum == 0) |
1692 | 1689 |
return 0; /* OOPS - division by zero */ |
... | ... |
@@ -1698,14 +1695,13 @@ void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs, |
1698 | 1698 |
int cba_idx, int cb1_idx, int cb2_idx, |
1699 | 1699 |
int gval, int gain) |
1700 | 1700 |
{ |
1701 |
- int16_t buffer_a[BLOCKSIZE]; |
|
1702 | 1701 |
int16_t *block; |
1703 | 1702 |
int m[3]; |
1704 | 1703 |
|
1705 | 1704 |
if (cba_idx) { |
1706 | 1705 |
cba_idx += BLOCKSIZE/2 - 1; |
1707 |
- ff_copy_and_dup(buffer_a, ractx->adapt_cb, cba_idx); |
|
1708 |
- m[0] = (ff_irms(buffer_a) * gval) >> 12; |
|
1706 |
+ ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx); |
|
1707 |
+ m[0] = (ff_irms(&ractx->dsp, ractx->buffer_a) * gval) >> 12; |
|
1709 | 1708 |
} else { |
1710 | 1709 |
m[0] = 0; |
1711 | 1710 |
} |
... | ... |
@@ -1716,7 +1712,7 @@ void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs, |
1716 | 1716 |
|
1717 | 1717 |
block = ractx->adapt_cb + BUFFERSIZE - BLOCKSIZE; |
1718 | 1718 |
|
1719 |
- add_wav(block, gain, cba_idx, m, cba_idx? buffer_a: NULL, |
|
1719 |
+ add_wav(block, gain, cba_idx, m, cba_idx? ractx->buffer_a: NULL, |
|
1720 | 1720 |
ff_cb1_vects[cb1_idx], ff_cb2_vects[cb2_idx]); |
1721 | 1721 |
|
1722 | 1722 |
memcpy(ractx->curr_sblock, ractx->curr_sblock + BLOCKSIZE, |
... | ... |
@@ -25,6 +25,7 @@ |
25 | 25 |
#include <stdint.h> |
26 | 26 |
#include "lpc.h" |
27 | 27 |
#include "audio_frame_queue.h" |
28 |
+#include "dsputil.h" |
|
28 | 29 |
|
29 | 30 |
#define NBLOCKS 4 ///< number of subblocks within a block |
30 | 31 |
#define BLOCKSIZE 40 ///< subblock size in 16-bit words |
... | ... |
@@ -35,6 +36,7 @@ |
35 | 35 |
|
36 | 36 |
typedef struct RA144Context { |
37 | 37 |
AVCodecContext *avctx; |
38 |
+ DSPContext dsp; |
|
38 | 39 |
LPCContext lpc_ctx; |
39 | 40 |
AudioFrameQueue afq; |
40 | 41 |
int last_frame; |
... | ... |
@@ -57,6 +59,8 @@ typedef struct RA144Context { |
57 | 57 |
/** Adaptive codebook, its size is two units bigger to avoid a |
58 | 58 |
* buffer overflow. */ |
59 | 59 |
int16_t adapt_cb[146+2]; |
60 |
+ |
|
61 |
+ DECLARE_ALIGNED(16, int16_t, buffer_a)[FFALIGN(BLOCKSIZE,16)]; |
|
60 | 62 |
} RA144Context; |
61 | 63 |
|
62 | 64 |
void ff_copy_and_dup(int16_t *target, const int16_t *source, int offset); |
... | ... |
@@ -68,7 +72,7 @@ unsigned int ff_rms(const int *data); |
68 | 68 |
int ff_interp(RA144Context *ractx, int16_t *out, int a, int copyold, |
69 | 69 |
int energy); |
70 | 70 |
unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy); |
71 |
-int ff_irms(const int16_t *data); |
|
71 |
+int ff_irms(DSPContext *dsp, const int16_t *data/*align 16*/); |
|
72 | 72 |
void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs, |
73 | 73 |
int cba_idx, int cb1_idx, int cb2_idx, |
74 | 74 |
int gval, int gain); |
... | ... |
@@ -34,10 +34,13 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx) |
34 | 34 |
RA144Context *ractx = avctx->priv_data; |
35 | 35 |
|
36 | 36 |
ractx->avctx = avctx; |
37 |
+ ff_dsputil_init(&ractx->dsp, avctx); |
|
37 | 38 |
|
38 | 39 |
ractx->lpc_coef[0] = ractx->lpc_tables[0]; |
39 | 40 |
ractx->lpc_coef[1] = ractx->lpc_tables[1]; |
40 | 41 |
|
42 |
+ AV_ZERO128(ractx->buffer_a+BLOCKSIZE); |
|
43 |
+ |
|
41 | 44 |
avctx->channels = 1; |
42 | 45 |
avctx->channel_layout = AV_CH_LAYOUT_MONO; |
43 | 46 |
avctx->sample_fmt = AV_SAMPLE_FMT_S16; |
... | ... |
@@ -60,7 +60,9 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx) |
60 | 60 |
ractx = avctx->priv_data; |
61 | 61 |
ractx->lpc_coef[0] = ractx->lpc_tables[0]; |
62 | 62 |
ractx->lpc_coef[1] = ractx->lpc_tables[1]; |
63 |
+ AV_ZERO128(ractx->buffer_a+BLOCKSIZE); |
|
63 | 64 |
ractx->avctx = avctx; |
65 |
+ ff_dsputil_init(&ractx->dsp, avctx); |
|
64 | 66 |
ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER, |
65 | 67 |
FF_LPC_TYPE_LEVINSON); |
66 | 68 |
if (ret < 0) |
... | ... |
@@ -334,7 +336,6 @@ static void ra144_encode_subblock(RA144Context *ractx, |
334 | 334 |
float data[BLOCKSIZE] = { 0 }, work[LPC_ORDER + BLOCKSIZE]; |
335 | 335 |
float coefs[LPC_ORDER]; |
336 | 336 |
float zero[BLOCKSIZE], cba[BLOCKSIZE], cb1[BLOCKSIZE], cb2[BLOCKSIZE]; |
337 |
- int16_t cba_vect[BLOCKSIZE]; |
|
338 | 337 |
int cba_idx, cb1_idx, cb2_idx, gain; |
339 | 338 |
int i, n; |
340 | 339 |
unsigned m[3]; |
... | ... |
@@ -373,8 +374,8 @@ static void ra144_encode_subblock(RA144Context *ractx, |
373 | 373 |
*/ |
374 | 374 |
memcpy(cba, work + LPC_ORDER, sizeof(cba)); |
375 | 375 |
|
376 |
- ff_copy_and_dup(cba_vect, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1); |
|
377 |
- m[0] = (ff_irms(cba_vect) * rms) >> 12; |
|
376 |
+ ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1); |
|
377 |
+ m[0] = (ff_irms(&ractx->dsp, ractx->buffer_a) * rms) >> 12; |
|
378 | 378 |
} |
379 | 379 |
fixed_cb_search(work + LPC_ORDER, coefs, data, cba_idx, &cb1_idx, &cb2_idx); |
380 | 380 |
for (i = 0; i < BLOCKSIZE; i++) { |