diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -1533,7 +1533,6 @@ CONFIG_EXTRA="
     blockdsp
     bswapdsp
     cabac
-    dsputil
     dvprofile
     fdctdsp
     gcrypt
@@ -1552,6 +1551,7 @@ CONFIG_EXTRA="
     intrax8
     lgplv3
     lpc
+    me_cmp
     mpeg_er
     mpegaudio
     mpegaudiodsp
@@ -1707,24 +1707,24 @@ threads_if_any="$THREADS_LIST"
 
 # subsystems
 dct_select="rdft"
-dsputil_select="fdctdsp idctdsp pixblockdsp"
-error_resilience_select="dsputil"
+error_resilience_select="me_cmp"
 intrax8_select="error_resilience"
 mdct_select="fft"
 rdft_select="fft"
+me_cmp_select="fdctdsp idctdsp pixblockdsp"
 mpeg_er_select="error_resilience"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
-mpegvideo_select="blockdsp dsputil hpeldsp idctdsp videodsp"
-mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp"
+mpegvideo_select="blockdsp hpeldsp idctdsp me_cmp videodsp"
+mpegvideoenc_select="me_cmp mpegvideo pixblockdsp qpeldsp"
 
 # decoders / encoders
 aac_decoder_select="mdct sinewin"
 aac_encoder_select="audio_frame_queue mdct sinewin"
 aac_latm_decoder_select="aac_decoder aac_latm_parser"
 ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct"
-ac3_encoder_select="ac3dsp audiodsp dsputil mdct"
-ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct"
+ac3_encoder_select="ac3dsp audiodsp mdct me_cmp"
+ac3_fixed_encoder_select="ac3dsp audiodsp mdct me_cmp"
 aic_decoder_select="golomb idctdsp"
 alac_encoder_select="lpc"
 als_decoder_select="bswapdsp"
@@ -1752,7 +1752,7 @@ dca_decoder_select="mdct"
 dnxhd_decoder_select="blockdsp idctdsp"
 dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp"
 dvvideo_decoder_select="dvprofile idctdsp"
-dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp"
+dvvideo_encoder_select="dvprofile fdctdsp me_cmp pixblockdsp"
 dxa_decoder_deps="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
@@ -1856,7 +1856,7 @@ shorten_decoder_select="golomb"
 sipr_decoder_select="lsp"
 sp5x_decoder_select="mjpeg_decoder"
 svq1_decoder_select="hpeldsp"
-svq1_encoder_select="aandcttables dsputil hpeldsp mpegvideoenc"
+svq1_encoder_select="aandcttables hpeldsp me_cmp mpegvideoenc"
 svq3_decoder_select="h264_decoder hpeldsp tpeldsp"
 svq3_decoder_suggest="zlib"
 tak_decoder_select="audiodsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -35,7 +35,6 @@ OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
-OBJS-$(CONFIG_DSPUTIL)                 += dsputil.o
 OBJS-$(CONFIG_DXVA2)                   += dxva2.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
 OBJS-$(CONFIG_FDCTDSP)                 += fdctdsp.o faandct.o \
@@ -60,6 +59,7 @@ OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o mpegaudiodata.o \
                                           mpegaudiodecheader.o
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -36,6 +36,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
@@ -379,7 +380,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
             exp_strategy[blk] = EXP_NEW;
             continue;
         }
-        exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
+        exp_diff = s->mecc.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
         exp_strategy[blk] = EXP_REUSE;
         if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS))
             exp_strategy[blk] = EXP_NEW;
@@ -2482,7 +2483,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
         goto init_fail;
 
     ff_audiodsp_init(&s->adsp);
-    ff_dsputil_init(&s->dsp, avctx);
+    ff_me_cmp_init(&s->mecc, avctx);
    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     dprint_options(s);
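Note: the sad[0] pointer used above keeps the dsputil-era signature after the rename. A minimal C model of its semantics, matching the scalar pix_abs16_c from the deleted dsputil.c at the end of this patch (sad16_model is a hypothetical name, not a function in the tree):

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a 16-pixel-wide block of height h.
 * The first argument is an optional MpegEncContext; the AC-3 encoder
 * passes NULL, as in compute_exp_strategy() above. */
static int sad16_model(void *ctx, uint8_t *pix1, uint8_t *pix2,
                       int line_size, int h)
{
    int s = 0, x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;   /* advance one row in each block */
        pix2 += line_size;
    }
    return s;
}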
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -35,9 +35,9 @@
 #include "ac3.h"
 #include "ac3dsp.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "fft.h"
 #include "mathops.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 
@@ -162,9 +162,9 @@ typedef struct AC3EncodeContext {
     AC3EncOptions options;                  ///< encoding options
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
-    DSPContext dsp;
     AudioDSPContext adsp;
     AVFloatDSPContext fdsp;
+    MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
     const SampleType *mdct_window;          ///< MDCT window function array
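Note: taken together, the ac3enc.c and ac3enc.h hunks show the consumer-side pattern this split introduces: embed an MECmpContext where a DSPContext used to sit, initialize it once, then call through its function pointers. A condensed sketch of that pattern (the SomeEncoderContext names are hypothetical):

#include "avcodec.h"
#include "me_cmp.h"

typedef struct SomeEncoderContext {
    MECmpContext mecc;                 /* was: DSPContext dsp */
} SomeEncoderContext;

static int some_encoder_init(SomeEncoderContext *s, AVCodecContext *avctx)
{
    /* was: ff_dsputil_init(&s->dsp, avctx) */
    ff_me_cmp_init(&s->mecc, avctx);
    return 0;
}

static int sad_16xh(SomeEncoderContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    /* sad[0] compares 16-pixel-wide blocks; NULL is accepted for the
     * MpegEncContext argument, as the AC-3 encoder demonstrates */
    return s->mecc.sad[0](NULL, a, b, stride, h);
}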
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                           arm/ac3dsp_arm.o
 OBJS-$(CONFIG_AUDIODSP)                += arm/audiodsp_init_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
-OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o
 OBJS-$(CONFIG_FFT)                     += arm/fft_init_arm.o            \
                                           arm/fft_fixed_init_arm.o
 OBJS-$(CONFIG_H264CHROMA)              += arm/h264chroma_init_arm.o
@@ -19,6 +18,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/idctdsp_arm.o            \
                                           arm/jrevdct_arm.o            \
                                           arm/simple_idct_arm.o
+OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
@@ -53,13 +53,13 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP)        += arm/videodsp_init_armv5te.o   \
 ARMV5TE-OBJS-$(CONFIG_MLP_DECODER)     += arm/mlpdsp_armv5te.o
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_armv6.o
 ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                           arm/hpeldsp_armv6.o
 ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                           arm/idctdsp_armv6.o          \
                                           arm/simple_idct_armv6.o
+ARMV6-OBJS-$(CONFIG_ME_CMP)            += arm/me_cmp_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)       += arm/pixblockdsp_armv6.o
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
deleted file mode 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_pix_abs16_armv6, export=1
-        ldr             r0, [sp]
-        push            {r4-r9, lr}
-        mov             r12, #0
-        mov             lr, #0
-        ldm             r1, {r4-r7}
-        ldr             r8, [r2]
-1:
-        ldr             r9, [r2, #4]
-        pld             [r1, r3]
-        usada8          r12, r4, r8, r12
-        ldr             r8, [r2, #8]
-        pld             [r2, r3]
-        usada8          lr, r5, r9, lr
-        ldr             r9, [r2, #12]
-        usada8          r12, r6, r8, r12
-        subs            r0, r0, #1
-        usada8          lr, r7, r9, lr
-        beq             2f
-        add             r1, r1, r3
-        ldm             r1, {r4-r7}
-        add             r2, r2, r3
-        ldr             r8, [r2]
-        b               1b
-2:
-        add             r0, r12, lr
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_pix_abs16_x2_armv6, export=1
-        ldr             r12, [sp]
-        push            {r4-r11, lr}
-        mov             r0, #0
-        mov             lr, #1
-        orr             lr, lr, lr, lsl #8
-        orr             lr, lr, lr, lsl #16
-1:
-        ldr             r8, [r2]
-        ldr             r9, [r2, #4]
-        lsr             r10, r8, #8
-        ldr             r4, [r1]
-        lsr             r6, r9, #8
-        orr             r10, r10, r9, lsl #24
-        ldr             r5, [r2, #8]
-        eor             r11, r8, r10
-        uhadd8          r7, r8, r10
-        orr             r6, r6, r5, lsl #24
-        and             r11, r11, lr
-        uadd8           r7, r7, r11
-        ldr             r8, [r1, #4]
-        usada8          r0, r4, r7, r0
-        eor             r7, r9, r6
-        lsr             r10, r5, #8
-        and             r7, r7, lr
-        uhadd8          r4, r9, r6
-        ldr             r6, [r2, #12]
-        uadd8           r4, r4, r7
-        pld             [r1, r3]
-        orr             r10, r10, r6, lsl #24
-        usada8          r0, r8, r4, r0
-        ldr             r4, [r1, #8]
-        eor             r11, r5, r10
-        ldrb            r7, [r2, #16]
-        and             r11, r11, lr
-        uhadd8          r8, r5, r10
-        ldr             r5, [r1, #12]
-        uadd8           r8, r8, r11
-        pld             [r2, r3]
-        lsr             r10, r6, #8
-        usada8          r0, r4, r8, r0
-        orr             r10, r10, r7, lsl #24
-        subs            r12, r12, #1
-        eor             r11, r6, r10
-        add             r1, r1, r3
-        uhadd8          r9, r6, r10
-        and             r11, r11, lr
-        uadd8           r9, r9, r11
-        add             r2, r2, r3
-        usada8          r0, r5, r9, r0
-        bgt             1b
-
-        pop             {r4-r11, pc}
-endfunc
-
-.macro  usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
-        ldr             \n0, [r2]
-        eor             \n1, \p0, \n0
-        uhadd8          \p0, \p0, \n0
-        and             \n1, \n1, lr
-        ldr             \n2, [r1]
-        uadd8           \p0, \p0, \n1
-        ldr             \n1, [r2, #4]
-        usada8          r0, \p0, \n2, r0
-        pld             [r1, r3]
-        eor             \n3, \p1, \n1
-        uhadd8          \p1, \p1, \n1
-        and             \n3, \n3, lr
-        ldr             \p0, [r1, #4]
-        uadd8           \p1, \p1, \n3
-        ldr             \n2, [r2, #8]
-        usada8          r0, \p1, \p0, r0
-        pld             [r2, r3]
-        eor             \p0, \p2, \n2
-        uhadd8          \p2, \p2, \n2
-        and             \p0, \p0, lr
-        ldr             \p1, [r1, #8]
-        uadd8           \p2, \p2, \p0
-        ldr             \n3, [r2, #12]
-        usada8          r0, \p2, \p1, r0
-        eor             \p1, \p3, \n3
-        uhadd8          \p3, \p3, \n3
-        and             \p1, \p1, lr
-        ldr             \p0, [r1, #12]
-        uadd8           \p3, \p3, \p1
-        add             r1, r1, r3
-        usada8          r0, \p3, \p0, r0
-        add             r2, r2, r3
-.endm
-
-function ff_pix_abs16_y2_armv6, export=1
-        pld             [r1]
-        pld             [r2]
-        ldr             r12, [sp]
-        push            {r4-r11, lr}
-        mov             r0, #0
-        mov             lr, #1
-        orr             lr, lr, lr, lsl #8
-        orr             lr, lr, lr, lsl #16
-        ldr             r4, [r2]
-        ldr             r5, [r2, #4]
-        ldr             r6, [r2, #8]
-        ldr             r7, [r2, #12]
-        add             r2, r2, r3
-1:
-        usad_y2         r4, r5, r6, r7, r8, r9, r10, r11
-        subs            r12, r12, #2
-        usad_y2         r8, r9, r10, r11, r4, r5, r6, r7
-        bgt             1b
-
-        pop             {r4-r11, pc}
-endfunc
-
-function ff_pix_abs8_armv6, export=1
-        pld             [r2, r3]
-        ldr             r12, [sp]
-        push            {r4-r9, lr}
-        mov             r0, #0
-        mov             lr, #0
-        ldrd_post       r4, r5, r1, r3
-1:
-        subs            r12, r12, #2
-        ldr             r7, [r2, #4]
-        ldr_post        r6, r2, r3
-        ldrd_post       r8, r9, r1, r3
-        usada8          r0, r4, r6, r0
-        pld             [r2, r3]
-        usada8          lr, r5, r7, lr
-        ldr             r7, [r2, #4]
-        ldr_post        r6, r2, r3
-        beq             2f
-        ldrd_post       r4, r5, r1, r3
-        usada8          r0, r8, r6, r0
-        pld             [r2, r3]
-        usada8          lr, r9, r7, lr
-        b               1b
-2:
-        usada8          r0, r8, r6, r0
-        usada8          lr, r9, r7, lr
-        add             r0, r0, lr
-        pop             {r4-r9, pc}
-endfunc
-
-function ff_sse16_armv6, export=1
-        ldr             r12, [sp]
-        push            {r4-r9, lr}
-        mov             r0, #0
-1:
-        ldrd            r4, r5, [r1]
-        ldr             r8, [r2]
-        uxtb16          lr, r4
-        uxtb16          r4, r4, ror #8
-        uxtb16          r9, r8
-        uxtb16          r8, r8, ror #8
-        ldr             r7, [r2, #4]
-        usub16          lr, lr, r9
-        usub16          r4, r4, r8
-        smlad           r0, lr, lr, r0
-        uxtb16          r6, r5
-        uxtb16          lr, r5, ror #8
-        uxtb16          r8, r7
-        uxtb16          r9, r7, ror #8
-        smlad           r0, r4, r4, r0
-        ldrd            r4, r5, [r1, #8]
-        usub16          r6, r6, r8
-        usub16          r8, lr, r9
-        ldr             r7, [r2, #8]
-        smlad           r0, r6, r6, r0
-        uxtb16          lr, r4
-        uxtb16          r4, r4, ror #8
-        uxtb16          r9, r7
-        uxtb16          r7, r7, ror #8
-        smlad           r0, r8, r8, r0
-        ldr             r8, [r2, #12]
-        usub16          lr, lr, r9
-        usub16          r4, r4, r7
-        smlad           r0, lr, lr, r0
-        uxtb16          r6, r5
-        uxtb16          r5, r5, ror #8
-        uxtb16          r9, r8
-        uxtb16          r8, r8, ror #8
-        smlad           r0, r4, r4, r0
-        usub16          r6, r6, r9
-        usub16          r5, r5, r8
-        smlad           r0, r6, r6, r0
-        add             r1, r1, r3
-        add             r2, r2, r3
-        subs            r12, r12, #1
-        smlad           r0, r5, r5, r0
-        bgt             1b
-
-        pop             {r4-r9, pc}
-endfunc
diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c
deleted file mode 100644
--- a/libavcodec/arm/dsputil_init_arm.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-
-int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                       int line_size, int h);
-int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-
-int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                      int line_size, int h);
-
-int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                   int line_size, int h);
-
-
-av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_armv6(cpu_flags)) {
-        c->pix_abs[0][0] = ff_pix_abs16_armv6;
-        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
-        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
-
-        c->pix_abs[1][0] = ff_pix_abs8_armv6;
-
-        c->sad[0] = ff_pix_abs16_armv6;
-        c->sad[1] = ff_pix_abs8_armv6;
-
-        c->sse[0] = ff_sse16_armv6;
-    }
-}
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
new file mode 100644
--- /dev/null
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_pix_abs16_armv6, export=1
+        ldr             r0, [sp]
+        push            {r4-r9, lr}
+        mov             r12, #0
+        mov             lr, #0
+        ldm             r1, {r4-r7}
+        ldr             r8, [r2]
+1:
+        ldr             r9, [r2, #4]
+        pld             [r1, r3]
+        usada8          r12, r4, r8, r12
+        ldr             r8, [r2, #8]
+        pld             [r2, r3]
+        usada8          lr, r5, r9, lr
+        ldr             r9, [r2, #12]
+        usada8          r12, r6, r8, r12
+        subs            r0, r0, #1
+        usada8          lr, r7, r9, lr
+        beq             2f
+        add             r1, r1, r3
+        ldm             r1, {r4-r7}
+        add             r2, r2, r3
+        ldr             r8, [r2]
+        b               1b
+2:
+        add             r0, r12, lr
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_pix_abs16_x2_armv6, export=1
+        ldr             r12, [sp]
+        push            {r4-r11, lr}
+        mov             r0, #0
+        mov             lr, #1
+        orr             lr, lr, lr, lsl #8
+        orr             lr, lr, lr, lsl #16
+1:
+        ldr             r8, [r2]
+        ldr             r9, [r2, #4]
+        lsr             r10, r8, #8
+        ldr             r4, [r1]
+        lsr             r6, r9, #8
+        orr             r10, r10, r9, lsl #24
+        ldr             r5, [r2, #8]
+        eor             r11, r8, r10
+        uhadd8          r7, r8, r10
+        orr             r6, r6, r5, lsl #24
+        and             r11, r11, lr
+        uadd8           r7, r7, r11
+        ldr             r8, [r1, #4]
+        usada8          r0, r4, r7, r0
+        eor             r7, r9, r6
+        lsr             r10, r5, #8
+        and             r7, r7, lr
+        uhadd8          r4, r9, r6
+        ldr             r6, [r2, #12]
+        uadd8           r4, r4, r7
+        pld             [r1, r3]
+        orr             r10, r10, r6, lsl #24
+        usada8          r0, r8, r4, r0
+        ldr             r4, [r1, #8]
+        eor             r11, r5, r10
+        ldrb            r7, [r2, #16]
+        and             r11, r11, lr
+        uhadd8          r8, r5, r10
+        ldr             r5, [r1, #12]
+        uadd8           r8, r8, r11
+        pld             [r2, r3]
+        lsr             r10, r6, #8
+        usada8          r0, r4, r8, r0
+        orr             r10, r10, r7, lsl #24
+        subs            r12, r12, #1
+        eor             r11, r6, r10
+        add             r1, r1, r3
+        uhadd8          r9, r6, r10
+        and             r11, r11, lr
+        uadd8           r9, r9, r11
+        add             r2, r2, r3
+        usada8          r0, r5, r9, r0
+        bgt             1b
+
+        pop             {r4-r11, pc}
+endfunc
+
+.macro  usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
+        ldr             \n0, [r2]
+        eor             \n1, \p0, \n0
+        uhadd8          \p0, \p0, \n0
+        and             \n1, \n1, lr
+        ldr             \n2, [r1]
+        uadd8           \p0, \p0, \n1
+        ldr             \n1, [r2, #4]
+        usada8          r0, \p0, \n2, r0
+        pld             [r1, r3]
+        eor             \n3, \p1, \n1
+        uhadd8          \p1, \p1, \n1
+        and             \n3, \n3, lr
+        ldr             \p0, [r1, #4]
+        uadd8           \p1, \p1, \n3
+        ldr             \n2, [r2, #8]
+        usada8          r0, \p1, \p0, r0
+        pld             [r2, r3]
+        eor             \p0, \p2, \n2
+        uhadd8          \p2, \p2, \n2
+        and             \p0, \p0, lr
+        ldr             \p1, [r1, #8]
+        uadd8           \p2, \p2, \p0
+        ldr             \n3, [r2, #12]
+        usada8          r0, \p2, \p1, r0
+        eor             \p1, \p3, \n3
+        uhadd8          \p3, \p3, \n3
+        and             \p1, \p1, lr
+        ldr             \p0, [r1, #12]
+        uadd8           \p3, \p3, \p1
+        add             r1, r1, r3
+        usada8          r0, \p3, \p0, r0
+        add             r2, r2, r3
+.endm
+
+function ff_pix_abs16_y2_armv6, export=1
+        pld             [r1]
+        pld             [r2]
+        ldr             r12, [sp]
+        push            {r4-r11, lr}
+        mov             r0, #0
+        mov             lr, #1
+        orr             lr, lr, lr, lsl #8
+        orr             lr, lr, lr, lsl #16
+        ldr             r4, [r2]
+        ldr             r5, [r2, #4]
+        ldr             r6, [r2, #8]
+        ldr             r7, [r2, #12]
+        add             r2, r2, r3
+1:
+        usad_y2         r4, r5, r6, r7, r8, r9, r10, r11
+        subs            r12, r12, #2
+        usad_y2         r8, r9, r10, r11, r4, r5, r6, r7
+        bgt             1b
+
+        pop             {r4-r11, pc}
+endfunc
+
+function ff_pix_abs8_armv6, export=1
+        pld             [r2, r3]
+        ldr             r12, [sp]
+        push            {r4-r9, lr}
+        mov             r0, #0
+        mov             lr, #0
+        ldrd_post       r4, r5, r1, r3
+1:
+        subs            r12, r12, #2
+        ldr             r7, [r2, #4]
+        ldr_post        r6, r2, r3
+        ldrd_post       r8, r9, r1, r3
+        usada8          r0, r4, r6, r0
+        pld             [r2, r3]
+        usada8          lr, r5, r7, lr
+        ldr             r7, [r2, #4]
+        ldr_post        r6, r2, r3
+        beq             2f
+        ldrd_post       r4, r5, r1, r3
+        usada8          r0, r8, r6, r0
+        pld             [r2, r3]
+        usada8          lr, r9, r7, lr
+        b               1b
+2:
+        usada8          r0, r8, r6, r0
+        usada8          lr, r9, r7, lr
+        add             r0, r0, lr
+        pop             {r4-r9, pc}
+endfunc
+
+function ff_sse16_armv6, export=1
+        ldr             r12, [sp]
+        push            {r4-r9, lr}
+        mov             r0, #0
+1:
+        ldrd            r4, r5, [r1]
+        ldr             r8, [r2]
+        uxtb16          lr, r4
+        uxtb16          r4, r4, ror #8
+        uxtb16          r9, r8
+        uxtb16          r8, r8, ror #8
+        ldr             r7, [r2, #4]
+        usub16          lr, lr, r9
+        usub16          r4, r4, r8
+        smlad           r0, lr, lr, r0
+        uxtb16          r6, r5
+        uxtb16          lr, r5, ror #8
+        uxtb16          r8, r7
+        uxtb16          r9, r7, ror #8
+        smlad           r0, r4, r4, r0
+        ldrd            r4, r5, [r1, #8]
+        usub16          r6, r6, r8
+        usub16          r8, lr, r9
+        ldr             r7, [r2, #8]
+        smlad           r0, r6, r6, r0
+        uxtb16          lr, r4
+        uxtb16          r4, r4, ror #8
+        uxtb16          r9, r7
+        uxtb16          r7, r7, ror #8
+        smlad           r0, r8, r8, r0
+        ldr             r8, [r2, #12]
+        usub16          lr, lr, r9
+        usub16          r4, r4, r7
+        smlad           r0, lr, lr, r0
+        uxtb16          r6, r5
+        uxtb16          r5, r5, ror #8
+        uxtb16          r9, r8
+        uxtb16          r8, r8, ror #8
+        smlad           r0, r4, r4, r0
+        usub16          r6, r6, r9
+        usub16          r5, r5, r8
+        smlad           r0, r6, r6, r0
+        add             r1, r1, r3
+        add             r2, r2, r3
+        subs            r12, r12, #1
+        smlad           r0, r5, r5, r0
+        bgt             1b
+
+        pop             {r4-r9, pc}
+endfunc
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
new file mode 100644
--- /dev/null
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/me_cmp.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       int line_size, int h);
+int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+
+int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      int line_size, int h);
+
+int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                   int line_size, int h);
+
+av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv6(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_armv6;
+        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
+        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
+
+        c->pix_abs[1][0] = ff_pix_abs8_armv6;
+
+        c->sad[0] = ff_pix_abs16_armv6;
+        c->sad[1] = ff_pix_abs8_armv6;
+
+        c->sse[0] = ff_sse16_armv6;
+    }
+}
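Note: for the ARMv6 overrides above to take effect, the generic initializer must chain into the per-arch one. The new me_cmp.c is not part of this excerpt; the sketch below assumes the dispatch pattern Libav uses for its other DSP contexts (scalar C defaults first, then architecture hooks):

av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
{
    /* scalar C defaults, moved over from dsputil.c (see below) ... */
    c->sad[0] = pix_abs16_c;
    c->sad[1] = pix_abs8_c;
    c->sse[0] = sse16_c;
    /* ... then let each architecture replace what it accelerates */
    if (ARCH_ARM)
        ff_me_cmp_init_arm(c, avctx);
}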
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
deleted file mode 100644
--- a/libavcodec/dsputil.c
+++ /dev/null
@@ -1,952 +0,0 @@
-/*
- * DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * DSP utils
- */
-
-#include "libavutil/attributes.h"
-#include "avcodec.h"
-#include "copy_block.h"
-#include "dsputil.h"
-#include "simple_idct.h"
-#include "mpegvideo.h"
-#include "config.h"
-
-uint32_t ff_square_tab[512] = { 0, };
-
-static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  int line_size, int h)
-{
-    int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
-
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[0] - pix2[0]];
-        s += sq[pix1[1] - pix2[1]];
-        s += sq[pix1[2] - pix2[2]];
-        s += sq[pix1[3] - pix2[3]];
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  int line_size, int h)
-{
-    int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
-
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[0] - pix2[0]];
-        s += sq[pix1[1] - pix2[1]];
-        s += sq[pix1[2] - pix2[2]];
-        s += sq[pix1[3] - pix2[3]];
-        s += sq[pix1[4] - pix2[4]];
-        s += sq[pix1[5] - pix2[5]];
-        s += sq[pix1[6] - pix2[6]];
-        s += sq[pix1[7] - pix2[7]];
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                   int line_size, int h)
-{
-    int s = 0, i;
-    uint32_t *sq = ff_square_tab + 256;
-
-    for (i = 0; i < h; i++) {
-        s += sq[pix1[0]  - pix2[0]];
-        s += sq[pix1[1]  - pix2[1]];
-        s += sq[pix1[2]  - pix2[2]];
-        s += sq[pix1[3]  - pix2[3]];
-        s += sq[pix1[4]  - pix2[4]];
-        s += sq[pix1[5]  - pix2[5]];
-        s += sq[pix1[6]  - pix2[6]];
-        s += sq[pix1[7]  - pix2[7]];
-        s += sq[pix1[8]  - pix2[8]];
-        s += sq[pix1[9]  - pix2[9]];
-        s += sq[pix1[10] - pix2[10]];
-        s += sq[pix1[11] - pix2[11]];
-        s += sq[pix1[12] - pix2[12]];
-        s += sq[pix1[13] - pix2[13]];
-        s += sq[pix1[14] - pix2[14]];
-        s += sq[pix1[15] - pix2[15]];
-
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int sum_abs_dctelem_c(int16_t *block)
-{
-    int sum = 0, i;
-
-    for (i = 0; i < 64; i++)
-        sum += FFABS(block[i]);
-    return sum;
-}
-
-#define avg2(a, b) ((a + b + 1) >> 1)
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
-
-static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                              int line_size, int h)
-{
-    int s = 0, i;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0]  - pix2[0]);
-        s += abs(pix1[1]  - pix2[1]);
-        s += abs(pix1[2]  - pix2[2]);
-        s += abs(pix1[3]  - pix2[3]);
-        s += abs(pix1[4]  - pix2[4]);
-        s += abs(pix1[5]  - pix2[5]);
-        s += abs(pix1[6]  - pix2[6]);
-        s += abs(pix1[7]  - pix2[7]);
-        s += abs(pix1[8]  - pix2[8]);
-        s += abs(pix1[9]  - pix2[9]);
-        s += abs(pix1[10] - pix2[10]);
-        s += abs(pix1[11] - pix2[11]);
-        s += abs(pix1[12] - pix2[12]);
-        s += abs(pix1[13] - pix2[13]);
-        s += abs(pix1[14] - pix2[14]);
-        s += abs(pix1[15] - pix2[15]);
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                          int line_size, int h)
-{
-    int s = 0, i;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
-        s += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
-        s += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
-        s += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
-        s += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
-        s += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
-        s += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
-        s += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
-        s += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
-        s += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
-        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
-        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
-        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
-        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
-        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
-        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                          int line_size, int h)
-{
-    int s = 0, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
-        s += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
-        s += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
-        s += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
-        s += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
-        s += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
-        s += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
-        s += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
-        s += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
-        s += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
-        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
-        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
-        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
-        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
-        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
-        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                           int line_size, int h)
-{
-    int s = 0, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
-        s += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
-        s += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
-        s += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
-        s += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
-        s += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
-        s += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
-        s += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
-        s += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
-        s += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
-        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
-        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
-        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
-        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
-        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
-        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                             int line_size, int h)
-{
-    int s = 0, i;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0] - pix2[0]);
-        s += abs(pix1[1] - pix2[1]);
-        s += abs(pix1[2] - pix2[2]);
-        s += abs(pix1[3] - pix2[3]);
-        s += abs(pix1[4] - pix2[4]);
-        s += abs(pix1[5] - pix2[5]);
-        s += abs(pix1[6] - pix2[6]);
-        s += abs(pix1[7] - pix2[7]);
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         int line_size, int h)
-{
-    int s = 0, i;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
-        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
-        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
-        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
-        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
-        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
-        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
-        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         int line_size, int h)
-{
-    int s = 0, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
-        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
-        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
-        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
-        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
-        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
-        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
-        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                          int line_size, int h)
-{
-    int s = 0, i;
-    uint8_t *pix3 = pix2 + line_size;
-
-    for (i = 0; i < h; i++) {
-        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
-        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
-        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
-        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
-        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
-        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
-        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
-        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
-        pix1 += line_size;
-        pix2 += line_size;
-        pix3 += line_size;
-    }
-    return s;
-}
-
-static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
-{
-    int score1 = 0, score2 = 0, x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
-        if (y + 1 < h) {
-            for (x = 0; x < 15; x++)
-                score2 += FFABS(s1[x] - s1[x + stride] -
-                                s1[x + 1] + s1[x + stride + 1]) -
-                          FFABS(s2[x] - s2[x + stride] -
-                                s2[x + 1] + s2[x + stride + 1]);
-        }
-        s1 += stride;
-        s2 += stride;
-    }
-
-    if (c)
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
-    else
-        return score1 + FFABS(score2) * 8;
-}
-
-static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
-{
-    int score1 = 0, score2 = 0, x, y;
-
-    for (y = 0; y < h; y++) {
-        for (x = 0; x < 8; x++)
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
-        if (y + 1 < h) {
-            for (x = 0; x < 7; x++)
-                score2 += FFABS(s1[x] - s1[x + stride] -
-                                s1[x + 1] + s1[x + stride + 1]) -
-                          FFABS(s2[x] - s2[x + stride] -
-                                s2[x + 1] + s2[x + stride + 1]);
-        }
-        s1 += stride;
-        s2 += stride;
-    }
-
-    if (c)
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
-    else
-        return score1 + FFABS(score2) * 8;
-}
-
-static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
-                    int stride, int h)
-{
-    return 0;
-}
-
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
-{
-    int i;
-
-    memset(cmp, 0, sizeof(void *) * 6);
-
-    for (i = 0; i < 6; i++) {
-        switch (type & 0xFF) {
-        case FF_CMP_SAD:
-            cmp[i] = c->sad[i];
-            break;
-        case FF_CMP_SATD:
-            cmp[i] = c->hadamard8_diff[i];
-            break;
-        case FF_CMP_SSE:
-            cmp[i] = c->sse[i];
-            break;
-        case FF_CMP_DCT:
-            cmp[i] = c->dct_sad[i];
-            break;
-        case FF_CMP_DCT264:
-            cmp[i] = c->dct264_sad[i];
-            break;
-        case FF_CMP_DCTMAX:
-            cmp[i] = c->dct_max[i];
-            break;
-        case FF_CMP_PSNR:
-            cmp[i] = c->quant_psnr[i];
-            break;
-        case FF_CMP_BIT:
-            cmp[i] = c->bit[i];
-            break;
-        case FF_CMP_RD:
-            cmp[i] = c->rd[i];
-            break;
-        case FF_CMP_VSAD:
-            cmp[i] = c->vsad[i];
-            break;
-        case FF_CMP_VSSE:
-            cmp[i] = c->vsse[i];
-            break;
-        case FF_CMP_ZERO:
-            cmp[i] = zero_cmp;
-            break;
-        case FF_CMP_NSSE:
-            cmp[i] = c->nsse[i];
-            break;
-        default:
-            av_log(NULL, AV_LOG_ERROR,
-                   "internal error in cmp function selection\n");
-        }
-    }
-}
-
-#define BUTTERFLY2(o1, o2, i1, i2) \
-    o1 = (i1) + (i2);              \
-    o2 = (i1) - (i2);
-
-#define BUTTERFLY1(x, y)  \
-    {                     \
-        int a, b;         \
-        a = x;            \
-        b = y;            \
-        x = a + b;        \
-        y = a - b;        \
-    }
-
-#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
-
-static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
-                               uint8_t *src, int stride, int h)
-{
-    int i, temp[64], sum = 0;
-
-    assert(h == 8);
-
-    for (i = 0; i < 8; i++) {
-        // FIXME: try pointer walks
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
-                   src[stride * i + 0] - dst[stride * i + 0],
-                   src[stride * i + 1] - dst[stride * i + 1]);
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
-                   src[stride * i + 2] - dst[stride * i + 2],
-                   src[stride * i + 3] - dst[stride * i + 3]);
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
-                   src[stride * i + 4] - dst[stride * i + 4],
-                   src[stride * i + 5] - dst[stride * i + 5]);
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
-                   src[stride * i + 6] - dst[stride * i + 6],
-                   src[stride * i + 7] - dst[stride * i + 7]);
-
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
-
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
-    }
-
-    for (i = 0; i < 8; i++) {
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
-
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
-
-        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
-               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
-               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
-               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
-    }
-    return sum;
-}
-
-static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
-                                uint8_t *dummy, int stride, int h)
-{
-    int i, temp[64], sum = 0;
-
-    assert(h == 8);
-
-    for (i = 0; i < 8; i++) {
-        // FIXME: try pointer walks
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
-                   src[stride * i + 0], src[stride * i + 1]);
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
-                   src[stride * i + 2], src[stride * i + 3]);
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
-                   src[stride * i + 4], src[stride * i + 5]);
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
-                   src[stride * i + 6], src[stride * i + 7]);
-
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
-
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
-    }
-
-    for (i = 0; i < 8; i++) {
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
-
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
-
-        sum +=
-            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
-            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
-            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
-            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
-    }
-
-    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
-
-    return sum;
-}
-
-static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
-                        uint8_t *src2, int stride, int h)
-{
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
-
-    assert(h == 8);
-
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
-    s->fdsp.fdct(temp);
-    return s->dsp.sum_abs_dctelem(temp);
-}
-
-#if CONFIG_GPL
-#define DCT8_1D                                         \
-    {                                                   \
-        const int s07 = SRC(0) + SRC(7);                \
-        const int s16 = SRC(1) + SRC(6);                \
-        const int s25 = SRC(2) + SRC(5);                \
-        const int s34 = SRC(3) + SRC(4);                \
-        const int a0  = s07 + s34;                      \
-        const int a1  = s16 + s25;                      \
-        const int a2  = s07 - s34;                      \
-        const int a3  = s16 - s25;                      \
-        const int d07 = SRC(0) - SRC(7);                \
-        const int d16 = SRC(1) - SRC(6);                \
-        const int d25 = SRC(2) - SRC(5);                \
-        const int d34 = SRC(3) - SRC(4);                \
-        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
-        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
-        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
-        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
-        DST(0, a0 + a1);                                \
-        DST(1, a4 + (a7 >> 2));                         \
-        DST(2, a2 + (a3 >> 1));                         \
-        DST(3, a5 + (a6 >> 2));                         \
-        DST(4, a0 - a1);                                \
-        DST(5, a6 - (a5 >> 2));                         \
-        DST(6, (a2 >> 1) - a3);                         \
-        DST(7, (a4 >> 2) - a7);                         \
-    }
-
-static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
-                           uint8_t *src2, int stride, int h)
-{
-    int16_t dct[8][8];
-    int i, sum = 0;
-
-    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
-
-#define SRC(x) dct[i][x]
-#define DST(x, v) dct[i][x] = v
-    for (i = 0; i < 8; i++)
-        DCT8_1D
-#undef SRC
-#undef DST
-
-#define SRC(x) dct[x][i]
-#define DST(x, v) sum += FFABS(v)
-    for (i = 0; i < 8; i++)
-        DCT8_1D
-#undef SRC
-#undef DST
-    return sum;
-}
-#endif
-
-static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
-                        uint8_t *src2, int stride, int h)
-{
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
-    int sum = 0, i;
-
-    assert(h == 8);
-
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
-    s->fdsp.fdct(temp);
-
-    for (i = 0; i < 64; i++)
-        sum = FFMAX(sum, FFABS(temp[i]));
-
-    return sum;
-}
-
-static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
-                           uint8_t *src2, int stride, int h)
-{
-    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
-    int16_t *const bak = temp + 64;
-    int sum = 0, i;
-
-    assert(h == 8);
-    s->mb_intra = 0;
-
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
-
-    memcpy(bak, temp, 64 * sizeof(int16_t));
-
-    s->block_last_index[0 /* FIXME */] =
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
-    s->dct_unquantize_inter(s, temp, 0, s->qscale);
-    ff_simple_idct_8(temp); // FIXME
-
-    for (i = 0; i < 64; i++)
-        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
-
-    return sum;
-}
-
-static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
-                   int stride, int h)
-{
-    const uint8_t *scantable = s->intra_scantable.permutated;
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
-    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
-    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
-    int i, last, run, bits, level, distortion, start_i;
-    const int esc_length = s->ac_esc_length;
-    uint8_t *length, *last_length;
-
-    assert(h == 8);
-
-    copy_block8(lsrc1, src1, 8, stride, 8);
-    copy_block8(lsrc2, src2, 8, stride, 8);
-
-    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
-
-    s->block_last_index[0 /* FIXME */] =
-    last                               =
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
-
-    bits = 0;
-
-    if (s->mb_intra) {
-        start_i     = 1;
-        length      = s->intra_ac_vlc_length;
-        last_length = s->intra_ac_vlc_last_length;
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
-    } else {
-        start_i     = 0;
-        length      = s->inter_ac_vlc_length;
-        last_length = s->inter_ac_vlc_last_length;
-    }
-
-    if (last >= start_i) {
-        run = 0;
-        for (i = start_i; i < last; i++) {
-            int j = scantable[i];
-            level = temp[j];
-
-            if (level) {
-                level += 64;
-                if ((level & (~127)) == 0)
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
-                else
-                    bits += esc_length;
-                run = 0;
-            } else
-                run++;
-        }
-        i = scantable[last];
-
-        level = temp[i] + 64;
-
-        assert(level - 64);
-
-        if ((level & (~127)) == 0) {
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
-        } else
-            bits += esc_length;
-    }
-
-    if (last >= 0) {
-        if (s->mb_intra)
-            s->dct_unquantize_intra(s, temp, 0, s->qscale);
-        else
-            s->dct_unquantize_inter(s, temp, 0, s->qscale);
-    }
-
-    s->idsp.idct_add(lsrc2, 8, temp);
-
-    distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
-
-    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
-}
-
-static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
-                    int stride, int h)
-{
-    const uint8_t *scantable = s->intra_scantable.permutated;
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
-    int i, last, run, bits, level, start_i;
-    const int esc_length = s->ac_esc_length;
-    uint8_t *length, *last_length;
-
-    assert(h == 8);
-
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
-
-    s->block_last_index[0 /* FIXME */] =
-    last                               =
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
-
-    bits = 0;
-
-    if (s->mb_intra) {
-        start_i     = 1;
-        length      = s->intra_ac_vlc_length;
-        last_length = s->intra_ac_vlc_last_length;
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
-    } else {
-        start_i     = 0;
-        length      = s->inter_ac_vlc_length;
-        last_length = s->inter_ac_vlc_last_length;
-    }
-
-    if (last >= start_i) {
-        run = 0;
-        for (i = start_i; i < last; i++) {
-            int j = scantable[i];
-            level = temp[j];
-
-            if (level) {
-                level += 64;
-                if ((level & (~127)) == 0)
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
-                else
-                    bits += esc_length;
-                run = 0;
-            } else
-                run++;
-        }
-        i = scantable[last];
-
-        level = temp[i] + 64;
-
-        assert(level - 64);
-
-        if ((level & (~127)) == 0)
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
-        else
-            bits += esc_length;
-    }
-
-    return bits;
-}
-
-#define VSAD_INTRA(size)                                      \
-static int vsad_intra ## size ## _c(MpegEncContext *c,        \
-                                    uint8_t *s, uint8_t *dummy, \
-                                    int stride, int h)        \
-{                                                             \
-    int score = 0, x, y;                                      \
-                                                              \
-    for (y = 1; y < h; y++) {                                 \
-        for (x = 0; x < size; x += 4) {                       \
-            score += FFABS(s[x]     - s[x + stride])     +    \
-                     FFABS(s[x + 1] - s[x + stride + 1]) +    \
-                     FFABS(s[x + 2] - s[x + 2 + stride]) +    \
-                     FFABS(s[x + 3] - s[x + 3 + stride]);     \
-        }                                                     \
-        s += stride;                                          \
-    }                                                         \
-                                                              \
-    return score;                                             \
-}
-VSAD_INTRA(8)
-VSAD_INTRA(16)
-
-static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    int stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
-}
-
-#define SQ(a) ((a) * (a))
-#define VSSE_INTRA(size)                                      \
-static int vsse_intra ## size ## _c(MpegEncContext *c,        \
-                                    uint8_t *s, uint8_t *dummy, \
-                                    int stride, int h)        \
-{                                                             \
-    int score = 0, x, y;                                      \
-                                                              \
-    for (y = 1; y < h; y++) {                                 \
-        for (x = 0; x < size; x += 4) {                       \
-            score += SQ(s[x]     - s[x + stride])     +       \
-                     SQ(s[x + 1] - s[x + stride + 1]) +       \
-                     SQ(s[x + 2] - s[x + stride + 2]) +       \
-                     SQ(s[x + 3] - s[x + stride + 3]);        \
-        }                                                     \
-        s += stride;                                          \
-    }                                                         \
-                                                              \
-    return score;                                             \
-}
-VSSE_INTRA(8)
-VSSE_INTRA(16)
-
-static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    int stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
-}
-
-#define WRAPPER8_16_SQ(name8, name16)                         \
-static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
-                  int stride, int h)                          \
-{                                                             \
-    int score = 0;                                            \
-                                                              \
-    score += name8(s, dst, src, stride, 8);                   \
-    score += name8(s, dst + 8, src + 8, stride, 8);           \
-    if (h == 16) {                                            \
-        dst   += 8 * stride;                                  \
-        src   += 8 * stride;                                  \
-        score += name8(s, dst, src, stride, 8);               \
-        score += name8(s, dst + 8, src + 8, stride, 8);       \
-    }                                                         \
-    return score;                                             \
-}
-
-WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
-WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
-WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
-#if CONFIG_GPL
-WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
-#endif
-WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
-WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
-WRAPPER8_16_SQ(rd8x8_c, rd16_c)
-WRAPPER8_16_SQ(bit8x8_c, bit16_c)
-
-/* init static data */
-av_cold void ff_dsputil_static_init(void)
-{
-    int i;
-
-    for (i = 0; i < 512; i++)
-        ff_square_tab[i] = (i - 256) * (i - 256);
-}
-
-av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
|
904 |
-{ |
|
905 |
- c->sum_abs_dctelem = sum_abs_dctelem_c; |
|
906 |
- |
|
907 |
- /* TODO [0] 16 [1] 8 */ |
|
908 |
- c->pix_abs[0][0] = pix_abs16_c; |
|
909 |
- c->pix_abs[0][1] = pix_abs16_x2_c; |
|
910 |
- c->pix_abs[0][2] = pix_abs16_y2_c; |
|
911 |
- c->pix_abs[0][3] = pix_abs16_xy2_c; |
|
912 |
- c->pix_abs[1][0] = pix_abs8_c; |
|
913 |
- c->pix_abs[1][1] = pix_abs8_x2_c; |
|
914 |
- c->pix_abs[1][2] = pix_abs8_y2_c; |
|
915 |
- c->pix_abs[1][3] = pix_abs8_xy2_c; |
|
916 |
- |
|
917 |
-#define SET_CMP_FUNC(name) \ |
|
918 |
- c->name[0] = name ## 16_c; \ |
|
919 |
- c->name[1] = name ## 8x8_c; |
|
920 |
- |
|
921 |
- SET_CMP_FUNC(hadamard8_diff) |
|
922 |
- c->hadamard8_diff[4] = hadamard8_intra16_c; |
|
923 |
- c->hadamard8_diff[5] = hadamard8_intra8x8_c; |
|
924 |
- SET_CMP_FUNC(dct_sad) |
|
925 |
- SET_CMP_FUNC(dct_max) |
|
926 |
-#if CONFIG_GPL |
|
927 |
- SET_CMP_FUNC(dct264_sad) |
|
928 |
-#endif |
|
929 |
- c->sad[0] = pix_abs16_c; |
|
930 |
- c->sad[1] = pix_abs8_c; |
|
931 |
- c->sse[0] = sse16_c; |
|
932 |
- c->sse[1] = sse8_c; |
|
933 |
- c->sse[2] = sse4_c; |
|
934 |
- SET_CMP_FUNC(quant_psnr) |
|
935 |
- SET_CMP_FUNC(rd) |
|
936 |
- SET_CMP_FUNC(bit) |
|
937 |
- c->vsad[0] = vsad16_c; |
|
938 |
- c->vsad[4] = vsad_intra16_c; |
|
939 |
- c->vsad[5] = vsad_intra8_c; |
|
940 |
- c->vsse[0] = vsse16_c; |
|
941 |
- c->vsse[4] = vsse_intra16_c; |
|
942 |
- c->vsse[5] = vsse_intra8_c; |
|
943 |
- c->nsse[0] = nsse16_c; |
|
944 |
- c->nsse[1] = nsse8_c; |
|
945 |
- |
|
946 |
- if (ARCH_ARM) |
|
947 |
- ff_dsputil_init_arm(c, avctx); |
|
948 |
- if (ARCH_PPC) |
|
949 |
- ff_dsputil_init_ppc(c, avctx); |
|
950 |
- if (ARCH_X86) |
|
951 |
- ff_dsputil_init_x86(c, avctx); |
|
952 |
-} |
953 | 1 |
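
End of the deleted dsputil.c. For readers comparing the two copies of rd8x8_c in this patch: the value it returns is plain SSE distortion plus a bit cost weighted by qscale squared times 109/128. A minimal freestanding sketch of that arithmetic, with made-up inputs rather than anything taken from the codec:

#include <stdio.h>

/* Same combine step as the tail of rd8x8_c: the +64 and >>7 form a
 * rounded-down divide by 128. */
static int rd_score(int distortion, int bits, int qscale)
{
    return distortion + ((bits * qscale * qscale * 109 + 64) >> 7);
}

int main(void)
{
    /* 500 units of SSE, 40 coded bits, qscale 4:
     * 500 + (40 * 16 * 109 + 64) / 128 = 500 + 545 */
    printf("%d\n", rd_score(500, 40, 4));
    return 0;
}
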
deleted file mode 100644 |
... | ... |
@@ -1,85 +0,0 @@ |
1 |
-/* |
|
2 |
- * DSP utils |
|
3 |
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard |
|
4 |
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
- * |
|
6 |
- * This file is part of Libav. |
|
7 |
- * |
|
8 |
- * Libav is free software; you can redistribute it and/or |
|
9 |
- * modify it under the terms of the GNU Lesser General Public |
|
10 |
- * License as published by the Free Software Foundation; either |
|
11 |
- * version 2.1 of the License, or (at your option) any later version. |
|
12 |
- * |
|
13 |
- * Libav is distributed in the hope that it will be useful, |
|
14 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
- * Lesser General Public License for more details. |
|
17 |
- * |
|
18 |
- * You should have received a copy of the GNU Lesser General Public |
|
19 |
- * License along with Libav; if not, write to the Free Software |
|
20 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
- */ |
|
22 |
- |
|
23 |
-/** |
|
24 |
- * @file |
|
25 |
- * DSP utils. |
|
26 |
- * Note, many functions in here may use MMX which trashes the FPU state, it is |
|
27 |
- * absolutely necessary to call emms_c() between DSP & float/double code. |
|
28 |
- */ |
|
29 |
- |
|
30 |
-#ifndef AVCODEC_DSPUTIL_H |
|
31 |
-#define AVCODEC_DSPUTIL_H |
|
32 |
- |
|
33 |
-#include "avcodec.h" |
|
34 |
- |
|
35 |
-extern uint32_t ff_square_tab[512]; |
|
36 |
- |
|
37 |
-struct MpegEncContext; |
|
38 |
-/* Motion estimation: |
|
39 |
- * h is limited to { width / 2, width, 2 * width }, |
|
40 |
- * but never larger than 16 and never smaller than 2. |
|
41 |
- * Although currently h < 4 is not used as functions with |
|
42 |
- * width < 8 are neither used nor implemented. */ |
|
43 |
-typedef int (*me_cmp_func)(struct MpegEncContext *c, |
|
44 |
- uint8_t *blk1 /* align width (8 or 16) */, |
|
45 |
- uint8_t *blk2 /* align 1 */, int line_size, int h); |
|
46 |
- |
|
47 |
-/** |
|
48 |
- * DSPContext. |
|
49 |
- */ |
|
50 |
-typedef struct DSPContext { |
|
51 |
- int (*sum_abs_dctelem)(int16_t *block /* align 16 */); |
|
52 |
- |
|
53 |
- me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ |
|
54 |
- me_cmp_func sse[6]; |
|
55 |
- me_cmp_func hadamard8_diff[6]; |
|
56 |
- me_cmp_func dct_sad[6]; |
|
57 |
- me_cmp_func quant_psnr[6]; |
|
58 |
- me_cmp_func bit[6]; |
|
59 |
- me_cmp_func rd[6]; |
|
60 |
- me_cmp_func vsad[6]; |
|
61 |
- me_cmp_func vsse[6]; |
|
62 |
- me_cmp_func nsse[6]; |
|
63 |
- me_cmp_func dct_max[6]; |
|
64 |
- me_cmp_func dct264_sad[6]; |
|
65 |
- |
|
66 |
- me_cmp_func me_pre_cmp[6]; |
|
67 |
- me_cmp_func me_cmp[6]; |
|
68 |
- me_cmp_func me_sub_cmp[6]; |
|
69 |
- me_cmp_func mb_cmp[6]; |
|
70 |
- me_cmp_func ildct_cmp[6]; // only width 16 used |
|
71 |
- me_cmp_func frame_skip_cmp[6]; // only width 8 used |
|
72 |
- |
|
73 |
- me_cmp_func pix_abs[2][4]; |
|
74 |
-} DSPContext; |
|
75 |
- |
|
76 |
-void ff_dsputil_static_init(void); |
|
77 |
-void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx); |
|
78 |
- |
|
79 |
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type); |
|
80 |
- |
|
81 |
-void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx); |
|
82 |
-void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx); |
|
83 |
-void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx); |
|
84 |
- |
|
85 |
-#endif /* AVCODEC_DSPUTIL_H */ |
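
The DSPContext removed here and the MECmpContext added below share the same table layout: for each metric, slot [0] is the 16-pixel-wide comparator, slot [1] the 8-pixel-wide one, and slots [4]/[5] hold the intra variants where they exist (see the assignments in ff_me_cmp_init further down). A toy model of that indexing convention, with hypothetical helper names, just to make it concrete:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef int (*cmp_fn)(const uint8_t *a, const uint8_t *b,
                      int line_size, int h);

/* Width-parameterized SAD; sad16/sad8 mirror the pix_abs16_c /
 * pix_abs8_c split in the real tables. */
static int sad_w(const uint8_t *a, const uint8_t *b,
                 int line_size, int h, int w)
{
    int s = 0;
    for (int y = 0; y < h; y++, a += line_size, b += line_size)
        for (int x = 0; x < w; x++)
            s += abs(a[x] - b[x]);
    return s;
}

static int sad16(const uint8_t *a, const uint8_t *b, int ls, int h)
{
    return sad_w(a, b, ls, h, 16);
}

static int sad8(const uint8_t *a, const uint8_t *b, int ls, int h)
{
    return sad_w(a, b, ls, h, 8);
}

int main(void)
{
    cmp_fn sad[6] = { sad16, sad8 };    /* [0] = 16 px, [1] = 8 px */
    uint8_t cur[16 * 16] = { 0 }, ref[16 * 16];

    for (int i = 0; i < 16 * 16; i++)
        ref[i] = 1;
    printf("%d %d\n", sad[0](cur, ref, 16, 16),  /* 256 */
                      sad[1](cur, ref, 16, 8));  /* 64  */
    return 0;
}
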
... | ... |
@@ -28,9 +28,9 @@ |
28 | 28 |
#include "libavutil/pixdesc.h" |
29 | 29 |
#include "config.h" |
30 | 30 |
#include "avcodec.h" |
31 |
-#include "dsputil.h" |
|
32 | 31 |
#include "fdctdsp.h" |
33 | 32 |
#include "internal.h" |
33 |
+#include "me_cmp.h" |
|
34 | 34 |
#include "pixblockdsp.h" |
35 | 35 |
#include "put_bits.h" |
36 | 36 |
#include "dv.h" |
... | ... |
@@ -40,8 +40,8 @@ |
40 | 40 |
static av_cold int dvvideo_encode_init(AVCodecContext *avctx) |
41 | 41 |
{ |
42 | 42 |
DVVideoContext *s = avctx->priv_data; |
43 |
- DSPContext dsp; |
|
44 | 43 |
FDCTDSPContext fdsp; |
44 |
+ MECmpContext mecc; |
|
45 | 45 |
PixblockDSPContext pdsp; |
46 | 46 |
int ret; |
47 | 47 |
|
... | ... |
@@ -65,13 +65,13 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx) |
65 | 65 |
|
66 | 66 |
dv_vlc_map_tableinit(); |
67 | 67 |
|
68 |
- ff_dsputil_init(&dsp, avctx); |
|
69 | 68 |
ff_fdctdsp_init(&fdsp, avctx); |
69 |
+ ff_me_cmp_init(&mecc, avctx); |
|
70 | 70 |
ff_pixblockdsp_init(&pdsp, avctx); |
71 |
- ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp); |
|
71 |
+ ff_set_cmp(&mecc, mecc.ildct_cmp, avctx->ildct_cmp); |
|
72 | 72 |
|
73 | 73 |
s->get_pixels = pdsp.get_pixels; |
74 |
- s->ildct_cmp = dsp.ildct_cmp[5]; |
|
74 |
+ s->ildct_cmp = mecc.ildct_cmp[5]; |
|
75 | 75 |
|
76 | 76 |
s->fdct[0] = fdsp.fdct; |
77 | 77 |
s->fdct[1] = fdsp.fdct248; |
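
The dvenc.c change above is the canonical consumer-side pattern: init the context, let ff_set_cmp() translate the user-selected comparison id into a filled table, then pick the size slot needed ([5] here, the 8x8 intra variant). A toy model of that dispatch, with illustrative ids and comparators rather than lavc's FF_CMP_* constants:

#include <stdio.h>

enum { CMP_SAD, CMP_SSE };               /* stand-ins for FF_CMP_* ids */

typedef int (*cmp_fn)(int a, int b);

static int cmp_sad(int a, int b) { int d = a - b; return d < 0 ? -d : d; }
static int cmp_sse(int a, int b) { int d = a - b; return d * d; }

/* Like ff_set_cmp(): one type id fills every size slot of the
 * destination table from the matching comparator family. */
static void set_cmp(cmp_fn dst[6], int type)
{
    for (int i = 0; i < 6; i++)
        dst[i] = type == CMP_SSE ? cmp_sse : cmp_sad;
}

int main(void)
{
    cmp_fn ildct_cmp[6];

    set_cmp(ildct_cmp, CMP_SSE);
    printf("%d\n", ildct_cmp[5](3, 7));  /* slot 5, as dvenc uses: 16 */
    return 0;
}
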
... | ... |
@@ -715,11 +715,11 @@ FF_ENABLE_DEPRECATION_WARNINGS |
715 | 715 |
} else { |
716 | 716 |
ff_thread_await_progress(s->last_pic.tf, mb_y, 0); |
717 | 717 |
} |
718 |
- is_intra_likely += s->dsp->sad[0](NULL, last_mb_ptr, mb_ptr, |
|
719 |
- linesize[0], 16); |
|
720 |
- is_intra_likely -= s->dsp->sad[0](NULL, last_mb_ptr, |
|
721 |
- last_mb_ptr + linesize[0] * 16, |
|
722 |
- linesize[0], 16); |
|
718 |
+ is_intra_likely += s->mecc->sad[0](NULL, last_mb_ptr, mb_ptr, |
|
719 |
+ linesize[0], 16); |
|
720 |
+ is_intra_likely -= s->mecc->sad[0](NULL, last_mb_ptr, |
|
721 |
+ last_mb_ptr + linesize[0] * 16, |
|
722 |
+ linesize[0], 16); |
|
723 | 723 |
} else { |
724 | 724 |
if (IS_INTRA(s->cur_pic.mb_type[mb_xy])) |
725 | 725 |
is_intra_likely++; |
... | ... |
@@ -23,7 +23,7 @@ |
23 | 23 |
#include <stdint.h> |
24 | 24 |
|
25 | 25 |
#include "avcodec.h" |
26 |
-#include "dsputil.h" |
|
26 |
+#include "me_cmp.h" |
|
27 | 27 |
#include "thread.h" |
28 | 28 |
|
29 | 29 |
///< current MB is the first after a resync marker |
... | ... |
@@ -52,7 +52,7 @@ typedef struct ERPicture { |
52 | 52 |
|
53 | 53 |
typedef struct ERContext { |
54 | 54 |
AVCodecContext *avctx; |
55 |
- DSPContext *dsp; |
|
55 |
+ MECmpContext *mecc; |
|
56 | 56 |
|
57 | 57 |
int *mb_index2xy; |
58 | 58 |
int mb_num; |
... | ... |
@@ -33,7 +33,6 @@ |
33 | 33 |
#include "internal.h" |
34 | 34 |
#include "cabac.h" |
35 | 35 |
#include "cabac_functions.h" |
36 |
-#include "dsputil.h" |
|
37 | 36 |
#include "error_resilience.h" |
38 | 37 |
#include "avcodec.h" |
39 | 38 |
#include "h264.h" |
... | ... |
@@ -42,6 +41,7 @@ |
42 | 42 |
#include "h264_mvpred.h" |
43 | 43 |
#include "golomb.h" |
44 | 44 |
#include "mathops.h" |
45 |
+#include "me_cmp.h" |
|
45 | 46 |
#include "mpegutils.h" |
46 | 47 |
#include "rectangle.h" |
47 | 48 |
#include "svq3.h" |
... | ... |
@@ -490,7 +490,7 @@ int ff_h264_context_init(H264Context *h) |
490 | 490 |
if (CONFIG_ERROR_RESILIENCE) { |
491 | 491 |
/* init ER */ |
492 | 492 |
er->avctx = h->avctx; |
493 |
- er->dsp = &h->dsp; |
|
493 |
+ er->mecc = &h->mecc; |
|
494 | 494 |
er->decode_mb = h264_er_decode_mb; |
495 | 495 |
er->opaque = h; |
496 | 496 |
er->quarter_sample = 1; |
... | ... |
@@ -620,7 +620,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx) |
620 | 620 |
|
621 | 621 |
/* needed so that IDCT permutation is known early */ |
622 | 622 |
if (CONFIG_ERROR_RESILIENCE) |
623 |
- ff_dsputil_init(&h->dsp, h->avctx); |
|
623 |
+ ff_me_cmp_init(&h->mecc, h->avctx); |
|
624 | 624 |
ff_videodsp_init(&h->vdsp, 8); |
625 | 625 |
|
626 | 626 |
memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t)); |
... | ... |
@@ -1234,7 +1234,7 @@ int ff_h264_set_parameter_from_sps(H264Context *h) |
1234 | 1234 |
ff_h264_pred_init(&h->hpc, h->avctx->codec_id, h->sps.bit_depth_luma, |
1235 | 1235 |
h->sps.chroma_format_idc); |
1236 | 1236 |
if (CONFIG_ERROR_RESILIENCE) |
1237 |
- ff_dsputil_init(&h->dsp, h->avctx); |
|
1237 |
+ ff_me_cmp_init(&h->mecc, h->avctx); |
|
1238 | 1238 |
ff_videodsp_init(&h->vdsp, h->sps.bit_depth_luma); |
1239 | 1239 |
} else { |
1240 | 1240 |
av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n", |
... | ... |
@@ -30,13 +30,13 @@ |
30 | 30 |
|
31 | 31 |
#include "libavutil/intreadwrite.h" |
32 | 32 |
#include "cabac.h" |
33 |
-#include "dsputil.h" |
|
34 | 33 |
#include "error_resilience.h" |
35 | 34 |
#include "get_bits.h" |
36 | 35 |
#include "h264chroma.h" |
37 | 36 |
#include "h264dsp.h" |
38 | 37 |
#include "h264pred.h" |
39 | 38 |
#include "h264qpel.h" |
39 |
+#include "me_cmp.h" |
|
40 | 40 |
#include "mpegutils.h" |
41 | 41 |
#include "parser.h" |
42 | 42 |
#include "qpeldsp.h" |
... | ... |
@@ -302,7 +302,7 @@ typedef struct H264Picture { |
302 | 302 |
*/ |
303 | 303 |
typedef struct H264Context { |
304 | 304 |
AVCodecContext *avctx; |
305 |
- DSPContext dsp; |
|
305 |
+ MECmpContext mecc; |
|
306 | 306 |
VideoDSPContext vdsp; |
307 | 307 |
H264DSPContext h264dsp; |
308 | 308 |
H264ChromaContext h264chroma; |
... | ... |
@@ -31,7 +31,6 @@ |
31 | 31 |
#include "internal.h" |
32 | 32 |
#include "cabac.h" |
33 | 33 |
#include "cabac_functions.h" |
34 |
-#include "dsputil.h" |
|
35 | 34 |
#include "error_resilience.h" |
36 | 35 |
#include "avcodec.h" |
37 | 36 |
#include "h264.h" |
... | ... |
@@ -1119,7 +1118,7 @@ static int h264_slice_header_init(H264Context *h, int reinit) |
1119 | 1119 |
if (!c) |
1120 | 1120 |
return AVERROR(ENOMEM); |
1121 | 1121 |
c->avctx = h->avctx; |
1122 |
- c->dsp = h->dsp; |
|
1122 |
+ c->mecc = h->mecc; |
|
1123 | 1123 |
c->vdsp = h->vdsp; |
1124 | 1124 |
c->h264dsp = h->h264dsp; |
1125 | 1125 |
c->h264qpel = h->h264qpel; |
1126 | 1126 |
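
me_cmp.c below is the moved dsputil.c code, byte for byte where possible. One primitive worth restating on its own is the intra vertical SAD: it sums absolute differences between vertically adjacent pixels of a single block, so smooth content scores low and interlaced combing scores high — the sort of measure an ildct comparison wants. A freestanding sketch of the 8-wide variant:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Freestanding restatement of vsad_intra8_c from me_cmp.c below:
 * sum |p(x, y) - p(x, y + 1)| over the block. Note the loop covers
 * h - 1 row pairs (y starts at 1). */
static int vsad_intra8(const uint8_t *s, int stride, int h)
{
    int score = 0;

    for (int y = 1; y < h; y++) {
        for (int x = 0; x < 8; x++)
            score += abs(s[x] - s[x + stride]);
        s += stride;
    }
    return score;
}

int main(void)
{
    uint8_t blk[8 * 8];

    /* Vertical ramp: each row is brighter by 3. */
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            blk[y * 8 + x] = (uint8_t)(3 * y);
    printf("%d\n", vsad_intra8(blk, 8, 8)); /* 7 * 8 * 3 = 168 */
    return 0;
}
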
new file mode 100644 |
... | ... |
@@ -0,0 +1,942 @@ |
0 |
+/* |
|
1 |
+ * This file is part of Libav. |
|
2 |
+ * |
|
3 |
+ * Libav is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * Libav is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with Libav; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#include "libavutil/attributes.h" |
|
19 |
+#include "avcodec.h" |
|
20 |
+#include "copy_block.h" |
|
21 |
+#include "simple_idct.h" |
|
22 |
+#include "me_cmp.h" |
|
23 |
+#include "mpegvideo.h" |
|
24 |
+#include "config.h" |
|
25 |
+ |
|
26 |
+uint32_t ff_square_tab[512] = { 0, }; |
|
27 |
+ |
|
28 |
+static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
29 |
+ int line_size, int h) |
|
30 |
+{ |
|
31 |
+ int s = 0, i; |
|
32 |
+ uint32_t *sq = ff_square_tab + 256; |
|
33 |
+ |
|
34 |
+ for (i = 0; i < h; i++) { |
|
35 |
+ s += sq[pix1[0] - pix2[0]]; |
|
36 |
+ s += sq[pix1[1] - pix2[1]]; |
|
37 |
+ s += sq[pix1[2] - pix2[2]]; |
|
38 |
+ s += sq[pix1[3] - pix2[3]]; |
|
39 |
+ pix1 += line_size; |
|
40 |
+ pix2 += line_size; |
|
41 |
+ } |
|
42 |
+ return s; |
|
43 |
+} |
|
44 |
+ |
|
45 |
+static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
46 |
+ int line_size, int h) |
|
47 |
+{ |
|
48 |
+ int s = 0, i; |
|
49 |
+ uint32_t *sq = ff_square_tab + 256; |
|
50 |
+ |
|
51 |
+ for (i = 0; i < h; i++) { |
|
52 |
+ s += sq[pix1[0] - pix2[0]]; |
|
53 |
+ s += sq[pix1[1] - pix2[1]]; |
|
54 |
+ s += sq[pix1[2] - pix2[2]]; |
|
55 |
+ s += sq[pix1[3] - pix2[3]]; |
|
56 |
+ s += sq[pix1[4] - pix2[4]]; |
|
57 |
+ s += sq[pix1[5] - pix2[5]]; |
|
58 |
+ s += sq[pix1[6] - pix2[6]]; |
|
59 |
+ s += sq[pix1[7] - pix2[7]]; |
|
60 |
+ pix1 += line_size; |
|
61 |
+ pix2 += line_size; |
|
62 |
+ } |
|
63 |
+ return s; |
|
64 |
+} |
|
65 |
+ |
|
66 |
+static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
67 |
+ int line_size, int h) |
|
68 |
+{ |
|
69 |
+ int s = 0, i; |
|
70 |
+ uint32_t *sq = ff_square_tab + 256; |
|
71 |
+ |
|
72 |
+ for (i = 0; i < h; i++) { |
|
73 |
+ s += sq[pix1[0] - pix2[0]]; |
|
74 |
+ s += sq[pix1[1] - pix2[1]]; |
|
75 |
+ s += sq[pix1[2] - pix2[2]]; |
|
76 |
+ s += sq[pix1[3] - pix2[3]]; |
|
77 |
+ s += sq[pix1[4] - pix2[4]]; |
|
78 |
+ s += sq[pix1[5] - pix2[5]]; |
|
79 |
+ s += sq[pix1[6] - pix2[6]]; |
|
80 |
+ s += sq[pix1[7] - pix2[7]]; |
|
81 |
+ s += sq[pix1[8] - pix2[8]]; |
|
82 |
+ s += sq[pix1[9] - pix2[9]]; |
|
83 |
+ s += sq[pix1[10] - pix2[10]]; |
|
84 |
+ s += sq[pix1[11] - pix2[11]]; |
|
85 |
+ s += sq[pix1[12] - pix2[12]]; |
|
86 |
+ s += sq[pix1[13] - pix2[13]]; |
|
87 |
+ s += sq[pix1[14] - pix2[14]]; |
|
88 |
+ s += sq[pix1[15] - pix2[15]]; |
|
89 |
+ |
|
90 |
+ pix1 += line_size; |
|
91 |
+ pix2 += line_size; |
|
92 |
+ } |
|
93 |
+ return s; |
|
94 |
+} |
|
95 |
+ |
|
96 |
+static int sum_abs_dctelem_c(int16_t *block) |
|
97 |
+{ |
|
98 |
+ int sum = 0, i; |
|
99 |
+ |
|
100 |
+ for (i = 0; i < 64; i++) |
|
101 |
+ sum += FFABS(block[i]); |
|
102 |
+ return sum; |
|
103 |
+} |
|
104 |
+ |
|
105 |
+#define avg2(a, b) ((a + b + 1) >> 1) |
|
106 |
+#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) |
|
107 |
+ |
|
108 |
+static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
109 |
+ int line_size, int h) |
|
110 |
+{ |
|
111 |
+ int s = 0, i; |
|
112 |
+ |
|
113 |
+ for (i = 0; i < h; i++) { |
|
114 |
+ s += abs(pix1[0] - pix2[0]); |
|
115 |
+ s += abs(pix1[1] - pix2[1]); |
|
116 |
+ s += abs(pix1[2] - pix2[2]); |
|
117 |
+ s += abs(pix1[3] - pix2[3]); |
|
118 |
+ s += abs(pix1[4] - pix2[4]); |
|
119 |
+ s += abs(pix1[5] - pix2[5]); |
|
120 |
+ s += abs(pix1[6] - pix2[6]); |
|
121 |
+ s += abs(pix1[7] - pix2[7]); |
|
122 |
+ s += abs(pix1[8] - pix2[8]); |
|
123 |
+ s += abs(pix1[9] - pix2[9]); |
|
124 |
+ s += abs(pix1[10] - pix2[10]); |
|
125 |
+ s += abs(pix1[11] - pix2[11]); |
|
126 |
+ s += abs(pix1[12] - pix2[12]); |
|
127 |
+ s += abs(pix1[13] - pix2[13]); |
|
128 |
+ s += abs(pix1[14] - pix2[14]); |
|
129 |
+ s += abs(pix1[15] - pix2[15]); |
|
130 |
+ pix1 += line_size; |
|
131 |
+ pix2 += line_size; |
|
132 |
+ } |
|
133 |
+ return s; |
|
134 |
+} |
|
135 |
+ |
|
136 |
+static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
137 |
+ int line_size, int h) |
|
138 |
+{ |
|
139 |
+ int s = 0, i; |
|
140 |
+ |
|
141 |
+ for (i = 0; i < h; i++) { |
|
142 |
+ s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
143 |
+ s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
144 |
+ s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
145 |
+ s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
146 |
+ s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
147 |
+ s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
148 |
+ s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
149 |
+ s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
150 |
+ s += abs(pix1[8] - avg2(pix2[8], pix2[9])); |
|
151 |
+ s += abs(pix1[9] - avg2(pix2[9], pix2[10])); |
|
152 |
+ s += abs(pix1[10] - avg2(pix2[10], pix2[11])); |
|
153 |
+ s += abs(pix1[11] - avg2(pix2[11], pix2[12])); |
|
154 |
+ s += abs(pix1[12] - avg2(pix2[12], pix2[13])); |
|
155 |
+ s += abs(pix1[13] - avg2(pix2[13], pix2[14])); |
|
156 |
+ s += abs(pix1[14] - avg2(pix2[14], pix2[15])); |
|
157 |
+ s += abs(pix1[15] - avg2(pix2[15], pix2[16])); |
|
158 |
+ pix1 += line_size; |
|
159 |
+ pix2 += line_size; |
|
160 |
+ } |
|
161 |
+ return s; |
|
162 |
+} |
|
163 |
+ |
|
164 |
+static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
165 |
+ int line_size, int h) |
|
166 |
+{ |
|
167 |
+ int s = 0, i; |
|
168 |
+ uint8_t *pix3 = pix2 + line_size; |
|
169 |
+ |
|
170 |
+ for (i = 0; i < h; i++) { |
|
171 |
+ s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
172 |
+ s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
173 |
+ s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
174 |
+ s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
175 |
+ s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
176 |
+ s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
177 |
+ s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
178 |
+ s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
179 |
+ s += abs(pix1[8] - avg2(pix2[8], pix3[8])); |
|
180 |
+ s += abs(pix1[9] - avg2(pix2[9], pix3[9])); |
|
181 |
+ s += abs(pix1[10] - avg2(pix2[10], pix3[10])); |
|
182 |
+ s += abs(pix1[11] - avg2(pix2[11], pix3[11])); |
|
183 |
+ s += abs(pix1[12] - avg2(pix2[12], pix3[12])); |
|
184 |
+ s += abs(pix1[13] - avg2(pix2[13], pix3[13])); |
|
185 |
+ s += abs(pix1[14] - avg2(pix2[14], pix3[14])); |
|
186 |
+ s += abs(pix1[15] - avg2(pix2[15], pix3[15])); |
|
187 |
+ pix1 += line_size; |
|
188 |
+ pix2 += line_size; |
|
189 |
+ pix3 += line_size; |
|
190 |
+ } |
|
191 |
+ return s; |
|
192 |
+} |
|
193 |
+ |
|
194 |
+static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
195 |
+ int line_size, int h) |
|
196 |
+{ |
|
197 |
+ int s = 0, i; |
|
198 |
+ uint8_t *pix3 = pix2 + line_size; |
|
199 |
+ |
|
200 |
+ for (i = 0; i < h; i++) { |
|
201 |
+ s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
202 |
+ s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
203 |
+ s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
204 |
+ s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
205 |
+ s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
206 |
+ s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
207 |
+ s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
208 |
+ s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
209 |
+ s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); |
|
210 |
+ s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); |
|
211 |
+ s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); |
|
212 |
+ s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); |
|
213 |
+ s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); |
|
214 |
+ s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); |
|
215 |
+ s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); |
|
216 |
+ s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); |
|
217 |
+ pix1 += line_size; |
|
218 |
+ pix2 += line_size; |
|
219 |
+ pix3 += line_size; |
|
220 |
+ } |
|
221 |
+ return s; |
|
222 |
+} |
|
223 |
+ |
|
224 |
+static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
225 |
+ int line_size, int h) |
|
226 |
+{ |
|
227 |
+ int s = 0, i; |
|
228 |
+ |
|
229 |
+ for (i = 0; i < h; i++) { |
|
230 |
+ s += abs(pix1[0] - pix2[0]); |
|
231 |
+ s += abs(pix1[1] - pix2[1]); |
|
232 |
+ s += abs(pix1[2] - pix2[2]); |
|
233 |
+ s += abs(pix1[3] - pix2[3]); |
|
234 |
+ s += abs(pix1[4] - pix2[4]); |
|
235 |
+ s += abs(pix1[5] - pix2[5]); |
|
236 |
+ s += abs(pix1[6] - pix2[6]); |
|
237 |
+ s += abs(pix1[7] - pix2[7]); |
|
238 |
+ pix1 += line_size; |
|
239 |
+ pix2 += line_size; |
|
240 |
+ } |
|
241 |
+ return s; |
|
242 |
+} |
|
243 |
+ |
|
244 |
+static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
245 |
+ int line_size, int h) |
|
246 |
+{ |
|
247 |
+ int s = 0, i; |
|
248 |
+ |
|
249 |
+ for (i = 0; i < h; i++) { |
|
250 |
+ s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
251 |
+ s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
252 |
+ s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
253 |
+ s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
254 |
+ s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
255 |
+ s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
256 |
+ s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
257 |
+ s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
258 |
+ pix1 += line_size; |
|
259 |
+ pix2 += line_size; |
|
260 |
+ } |
|
261 |
+ return s; |
|
262 |
+} |
|
263 |
+ |
|
264 |
+static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
265 |
+ int line_size, int h) |
|
266 |
+{ |
|
267 |
+ int s = 0, i; |
|
268 |
+ uint8_t *pix3 = pix2 + line_size; |
|
269 |
+ |
|
270 |
+ for (i = 0; i < h; i++) { |
|
271 |
+ s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
272 |
+ s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
273 |
+ s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
274 |
+ s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
275 |
+ s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
276 |
+ s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
277 |
+ s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
278 |
+ s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
279 |
+ pix1 += line_size; |
|
280 |
+ pix2 += line_size; |
|
281 |
+ pix3 += line_size; |
|
282 |
+ } |
|
283 |
+ return s; |
|
284 |
+} |
|
285 |
+ |
|
286 |
+static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
287 |
+ int line_size, int h) |
|
288 |
+{ |
|
289 |
+ int s = 0, i; |
|
290 |
+ uint8_t *pix3 = pix2 + line_size; |
|
291 |
+ |
|
292 |
+ for (i = 0; i < h; i++) { |
|
293 |
+ s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
294 |
+ s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
295 |
+ s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
296 |
+ s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
297 |
+ s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
298 |
+ s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
299 |
+ s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
300 |
+ s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
301 |
+ pix1 += line_size; |
|
302 |
+ pix2 += line_size; |
|
303 |
+ pix3 += line_size; |
|
304 |
+ } |
|
305 |
+ return s; |
|
306 |
+} |
|
307 |
+ |
|
308 |
+static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
309 |
+{ |
|
310 |
+ int score1 = 0, score2 = 0, x, y; |
|
311 |
+ |
|
312 |
+ for (y = 0; y < h; y++) { |
|
313 |
+ for (x = 0; x < 16; x++) |
|
314 |
+ score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
315 |
+ if (y + 1 < h) { |
|
316 |
+ for (x = 0; x < 15; x++) |
|
317 |
+ score2 += FFABS(s1[x] - s1[x + stride] - |
|
318 |
+ s1[x + 1] + s1[x + stride + 1]) - |
|
319 |
+ FFABS(s2[x] - s2[x + stride] - |
|
320 |
+ s2[x + 1] + s2[x + stride + 1]); |
|
321 |
+ } |
|
322 |
+ s1 += stride; |
|
323 |
+ s2 += stride; |
|
324 |
+ } |
|
325 |
+ |
|
326 |
+ if (c) |
|
327 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
328 |
+ else |
|
329 |
+ return score1 + FFABS(score2) * 8; |
|
330 |
+} |
|
331 |
+ |
|
332 |
+static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
333 |
+{ |
|
334 |
+ int score1 = 0, score2 = 0, x, y; |
|
335 |
+ |
|
336 |
+ for (y = 0; y < h; y++) { |
|
337 |
+ for (x = 0; x < 8; x++) |
|
338 |
+ score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
339 |
+ if (y + 1 < h) { |
|
340 |
+ for (x = 0; x < 7; x++) |
|
341 |
+ score2 += FFABS(s1[x] - s1[x + stride] - |
|
342 |
+ s1[x + 1] + s1[x + stride + 1]) - |
|
343 |
+ FFABS(s2[x] - s2[x + stride] - |
|
344 |
+ s2[x + 1] + s2[x + stride + 1]); |
|
345 |
+ } |
|
346 |
+ s1 += stride; |
|
347 |
+ s2 += stride; |
|
348 |
+ } |
|
349 |
+ |
|
350 |
+ if (c) |
|
351 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
352 |
+ else |
|
353 |
+ return score1 + FFABS(score2) * 8; |
|
354 |
+} |
|
355 |
+ |
|
356 |
+static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, |
|
357 |
+ int stride, int h) |
|
358 |
+{ |
|
359 |
+ return 0; |
|
360 |
+} |
|
361 |
+ |
|
362 |
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type) |
|
363 |
+{ |
|
364 |
+ int i; |
|
365 |
+ |
|
366 |
+ memset(cmp, 0, sizeof(void *) * 6); |
|
367 |
+ |
|
368 |
+ for (i = 0; i < 6; i++) { |
|
369 |
+ switch (type & 0xFF) { |
|
370 |
+ case FF_CMP_SAD: |
|
371 |
+ cmp[i] = c->sad[i]; |
|
372 |
+ break; |
|
373 |
+ case FF_CMP_SATD: |
|
374 |
+ cmp[i] = c->hadamard8_diff[i]; |
|
375 |
+ break; |
|
376 |
+ case FF_CMP_SSE: |
|
377 |
+ cmp[i] = c->sse[i]; |
|
378 |
+ break; |
|
379 |
+ case FF_CMP_DCT: |
|
380 |
+ cmp[i] = c->dct_sad[i]; |
|
381 |
+ break; |
|
382 |
+ case FF_CMP_DCT264: |
|
383 |
+ cmp[i] = c->dct264_sad[i]; |
|
384 |
+ break; |
|
385 |
+ case FF_CMP_DCTMAX: |
|
386 |
+ cmp[i] = c->dct_max[i]; |
|
387 |
+ break; |
|
388 |
+ case FF_CMP_PSNR: |
|
389 |
+ cmp[i] = c->quant_psnr[i]; |
|
390 |
+ break; |
|
391 |
+ case FF_CMP_BIT: |
|
392 |
+ cmp[i] = c->bit[i]; |
|
393 |
+ break; |
|
394 |
+ case FF_CMP_RD: |
|
395 |
+ cmp[i] = c->rd[i]; |
|
396 |
+ break; |
|
397 |
+ case FF_CMP_VSAD: |
|
398 |
+ cmp[i] = c->vsad[i]; |
|
399 |
+ break; |
|
400 |
+ case FF_CMP_VSSE: |
|
401 |
+ cmp[i] = c->vsse[i]; |
|
402 |
+ break; |
|
403 |
+ case FF_CMP_ZERO: |
|
404 |
+ cmp[i] = zero_cmp; |
|
405 |
+ break; |
|
406 |
+ case FF_CMP_NSSE: |
|
407 |
+ cmp[i] = c->nsse[i]; |
|
408 |
+ break; |
|
409 |
+ default: |
|
410 |
+ av_log(NULL, AV_LOG_ERROR, |
|
411 |
+ "internal error in cmp function selection\n"); |
|
412 |
+ } |
|
413 |
+ } |
|
414 |
+} |
|
415 |
+ |
|
416 |
+#define BUTTERFLY2(o1, o2, i1, i2) \ |
|
417 |
+ o1 = (i1) + (i2); \ |
|
418 |
+ o2 = (i1) - (i2); |
|
419 |
+ |
|
420 |
+#define BUTTERFLY1(x, y) \ |
|
421 |
+ { \ |
|
422 |
+ int a, b; \ |
|
423 |
+ a = x; \ |
|
424 |
+ b = y; \ |
|
425 |
+ x = a + b; \ |
|
426 |
+ y = a - b; \ |
|
427 |
+ } |
|
428 |
+ |
|
429 |
+#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y))) |
|
430 |
+ |
|
431 |
+static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, |
|
432 |
+ uint8_t *src, int stride, int h) |
|
433 |
+{ |
|
434 |
+ int i, temp[64], sum = 0; |
|
435 |
+ |
|
436 |
+ assert(h == 8); |
|
437 |
+ |
|
438 |
+ for (i = 0; i < 8; i++) { |
|
439 |
+ // FIXME: try pointer walks |
|
440 |
+ BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
441 |
+ src[stride * i + 0] - dst[stride * i + 0], |
|
442 |
+ src[stride * i + 1] - dst[stride * i + 1]); |
|
443 |
+ BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
444 |
+ src[stride * i + 2] - dst[stride * i + 2], |
|
445 |
+ src[stride * i + 3] - dst[stride * i + 3]); |
|
446 |
+ BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
447 |
+ src[stride * i + 4] - dst[stride * i + 4], |
|
448 |
+ src[stride * i + 5] - dst[stride * i + 5]); |
|
449 |
+ BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
450 |
+ src[stride * i + 6] - dst[stride * i + 6], |
|
451 |
+ src[stride * i + 7] - dst[stride * i + 7]); |
|
452 |
+ |
|
453 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
454 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
455 |
+ BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
456 |
+ BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
457 |
+ |
|
458 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
459 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
460 |
+ BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
461 |
+ BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
462 |
+ } |
|
463 |
+ |
|
464 |
+ for (i = 0; i < 8; i++) { |
|
465 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
466 |
+ BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
467 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
468 |
+ BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
469 |
+ |
|
470 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
471 |
+ BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
472 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
473 |
+ BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
474 |
+ |
|
475 |
+ sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + |
|
476 |
+ BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + |
|
477 |
+ BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + |
|
478 |
+ BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
479 |
+ } |
|
480 |
+ return sum; |
|
481 |
+} |
|
482 |
+ |
|
483 |
+static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, |
|
484 |
+ uint8_t *dummy, int stride, int h) |
|
485 |
+{ |
|
486 |
+ int i, temp[64], sum = 0; |
|
487 |
+ |
|
488 |
+ assert(h == 8); |
|
489 |
+ |
|
490 |
+ for (i = 0; i < 8; i++) { |
|
491 |
+ // FIXME: try pointer walks |
|
492 |
+ BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
493 |
+ src[stride * i + 0], src[stride * i + 1]); |
|
494 |
+ BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
495 |
+ src[stride * i + 2], src[stride * i + 3]); |
|
496 |
+ BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
497 |
+ src[stride * i + 4], src[stride * i + 5]); |
|
498 |
+ BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
499 |
+ src[stride * i + 6], src[stride * i + 7]); |
|
500 |
+ |
|
501 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
502 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
503 |
+ BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
504 |
+ BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
505 |
+ |
|
506 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
507 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
508 |
+ BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
509 |
+ BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
510 |
+ } |
|
511 |
+ |
|
512 |
+ for (i = 0; i < 8; i++) { |
|
513 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
514 |
+ BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
515 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
516 |
+ BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
517 |
+ |
|
518 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
519 |
+ BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
520 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
521 |
+ BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
522 |
+ |
|
523 |
+ sum += |
|
524 |
+ BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) |
|
525 |
+ + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) |
|
526 |
+ + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) |
|
527 |
+ + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
528 |
+ } |
|
529 |
+ |
|
530 |
+ sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean |
|
531 |
+ |
|
532 |
+ return sum; |
|
533 |
+} |
|
534 |
+ |
|
535 |
+static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, |
|
536 |
+ uint8_t *src2, int stride, int h) |
|
537 |
+{ |
|
538 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
539 |
+ |
|
540 |
+ assert(h == 8); |
|
541 |
+ |
|
542 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
543 |
+ s->fdsp.fdct(temp); |
|
544 |
+ return s->mecc.sum_abs_dctelem(temp); |
|
545 |
+} |
|
546 |
+ |
|
547 |
+#if CONFIG_GPL |
|
548 |
+#define DCT8_1D \ |
|
549 |
+ { \ |
|
550 |
+ const int s07 = SRC(0) + SRC(7); \ |
|
551 |
+ const int s16 = SRC(1) + SRC(6); \ |
|
552 |
+ const int s25 = SRC(2) + SRC(5); \ |
|
553 |
+ const int s34 = SRC(3) + SRC(4); \ |
|
554 |
+ const int a0 = s07 + s34; \ |
|
555 |
+ const int a1 = s16 + s25; \ |
|
556 |
+ const int a2 = s07 - s34; \ |
|
557 |
+ const int a3 = s16 - s25; \ |
|
558 |
+ const int d07 = SRC(0) - SRC(7); \ |
|
559 |
+ const int d16 = SRC(1) - SRC(6); \ |
|
560 |
+ const int d25 = SRC(2) - SRC(5); \ |
|
561 |
+ const int d34 = SRC(3) - SRC(4); \ |
|
562 |
+ const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \ |
|
563 |
+ const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \ |
|
564 |
+ const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \ |
|
565 |
+ const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \ |
|
566 |
+ DST(0, a0 + a1); \ |
|
567 |
+ DST(1, a4 + (a7 >> 2)); \ |
|
568 |
+ DST(2, a2 + (a3 >> 1)); \ |
|
569 |
+ DST(3, a5 + (a6 >> 2)); \ |
|
570 |
+ DST(4, a0 - a1); \ |
|
571 |
+ DST(5, a6 - (a5 >> 2)); \ |
|
572 |
+ DST(6, (a2 >> 1) - a3); \ |
|
573 |
+ DST(7, (a4 >> 2) - a7); \ |
|
574 |
+ } |
|
575 |
+ |
|
576 |
+static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1, |
|
577 |
+ uint8_t *src2, int stride, int h) |
|
578 |
+{ |
|
579 |
+ int16_t dct[8][8]; |
|
580 |
+ int i, sum = 0; |
|
581 |
+ |
|
582 |
+ s->pdsp.diff_pixels(dct[0], src1, src2, stride); |
|
583 |
+ |
|
584 |
+#define SRC(x) dct[i][x] |
|
585 |
+#define DST(x, v) dct[i][x] = v |
|
586 |
+ for (i = 0; i < 8; i++) |
|
587 |
+ DCT8_1D |
|
588 |
+#undef SRC |
|
589 |
+#undef DST |
|
590 |
+ |
|
591 |
+#define SRC(x) dct[x][i] |
|
592 |
+#define DST(x, v) sum += FFABS(v) |
|
593 |
+ for (i = 0; i < 8; i++) |
|
594 |
+ DCT8_1D |
|
595 |
+#undef SRC |
|
596 |
+#undef DST |
|
597 |
+ return sum; |
|
598 |
+} |
|
599 |
+#endif |
|
600 |
+ |
|
601 |
+static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, |
|
602 |
+ uint8_t *src2, int stride, int h) |
|
603 |
+{ |
|
604 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
605 |
+ int sum = 0, i; |
|
606 |
+ |
|
607 |
+ assert(h == 8); |
|
608 |
+ |
|
609 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
610 |
+ s->fdsp.fdct(temp); |
|
611 |
+ |
|
612 |
+ for (i = 0; i < 64; i++) |
|
613 |
+ sum = FFMAX(sum, FFABS(temp[i])); |
|
614 |
+ |
|
615 |
+ return sum; |
|
616 |
+} |
|
617 |
+ |
|
618 |
+static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, |
|
619 |
+ uint8_t *src2, int stride, int h) |
|
620 |
+{ |
|
621 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); |
|
622 |
+ int16_t *const bak = temp + 64; |
|
623 |
+ int sum = 0, i; |
|
624 |
+ |
|
625 |
+ assert(h == 8); |
|
626 |
+ s->mb_intra = 0; |
|
627 |
+ |
|
628 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
629 |
+ |
|
630 |
+ memcpy(bak, temp, 64 * sizeof(int16_t)); |
|
631 |
+ |
|
632 |
+ s->block_last_index[0 /* FIXME */] = |
|
633 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
634 |
+ s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
635 |
+ ff_simple_idct_8(temp); // FIXME |
|
636 |
+ |
|
637 |
+ for (i = 0; i < 64; i++) |
|
638 |
+ sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); |
|
639 |
+ |
|
640 |
+ return sum; |
|
641 |
+} |
|
642 |
+ |
|
643 |
+static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
644 |
+ int stride, int h) |
|
645 |
+{ |
|
646 |
+ const uint8_t *scantable = s->intra_scantable.permutated; |
|
647 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
648 |
+ LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); |
|
649 |
+ LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); |
|
650 |
+ int i, last, run, bits, level, distortion, start_i; |
|
651 |
+ const int esc_length = s->ac_esc_length; |
|
652 |
+ uint8_t *length, *last_length; |
|
653 |
+ |
|
654 |
+ assert(h == 8); |
|
655 |
+ |
|
656 |
+ copy_block8(lsrc1, src1, 8, stride, 8); |
|
657 |
+ copy_block8(lsrc2, src2, 8, stride, 8); |
|
658 |
+ |
|
659 |
+ s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); |
|
660 |
+ |
|
661 |
+ s->block_last_index[0 /* FIXME */] = |
|
662 |
+ last = |
|
663 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
664 |
+ |
|
665 |
+ bits = 0; |
|
666 |
+ |
|
667 |
+ if (s->mb_intra) { |
|
668 |
+ start_i = 1; |
|
669 |
+ length = s->intra_ac_vlc_length; |
|
670 |
+ last_length = s->intra_ac_vlc_last_length; |
|
671 |
+ bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
672 |
+ } else { |
|
673 |
+ start_i = 0; |
|
674 |
+ length = s->inter_ac_vlc_length; |
|
675 |
+ last_length = s->inter_ac_vlc_last_length; |
|
676 |
+ } |
|
677 |
+ |
|
678 |
+ if (last >= start_i) { |
|
679 |
+ run = 0; |
|
680 |
+ for (i = start_i; i < last; i++) { |
|
681 |
+ int j = scantable[i]; |
|
682 |
+ level = temp[j]; |
|
683 |
+ |
|
684 |
+ if (level) { |
|
685 |
+ level += 64; |
|
686 |
+ if ((level & (~127)) == 0) |
|
687 |
+ bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
688 |
+ else |
|
689 |
+ bits += esc_length; |
|
690 |
+ run = 0; |
|
691 |
+ } else |
|
692 |
+ run++; |
|
693 |
+ } |
|
694 |
+ i = scantable[last]; |
|
695 |
+ |
|
696 |
+ level = temp[i] + 64; |
|
697 |
+ |
|
698 |
+ assert(level - 64); |
|
699 |
+ |
|
700 |
+ if ((level & (~127)) == 0) { |
|
701 |
+ bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
702 |
+ } else |
|
703 |
+ bits += esc_length; |
|
704 |
+ } |
|
705 |
+ |
|
706 |
+ if (last >= 0) { |
|
707 |
+ if (s->mb_intra) |
|
708 |
+ s->dct_unquantize_intra(s, temp, 0, s->qscale); |
|
709 |
+ else |
|
710 |
+ s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
711 |
+ } |
|
712 |
+ |
|
713 |
+ s->idsp.idct_add(lsrc2, 8, temp); |
|
714 |
+ |
|
715 |
+ distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8); |
|
716 |
+ |
|
717 |
+ return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); |
|
718 |
+} |
|
719 |
+ |
|
720 |
+static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
721 |
+ int stride, int h) |
|
722 |
+{ |
|
723 |
+ const uint8_t *scantable = s->intra_scantable.permutated; |
|
724 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
725 |
+ int i, last, run, bits, level, start_i; |
|
726 |
+ const int esc_length = s->ac_esc_length; |
|
727 |
+ uint8_t *length, *last_length; |
|
728 |
+ |
|
729 |
+ assert(h == 8); |
|
730 |
+ |
|
731 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
732 |
+ |
|
733 |
+ s->block_last_index[0 /* FIXME */] = |
|
734 |
+ last = |
|
735 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
736 |
+ |
|
737 |
+ bits = 0; |
|
738 |
+ |
|
739 |
+ if (s->mb_intra) { |
|
740 |
+ start_i = 1; |
|
741 |
+ length = s->intra_ac_vlc_length; |
|
742 |
+ last_length = s->intra_ac_vlc_last_length; |
|
743 |
+ bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
744 |
+ } else { |
|
745 |
+ start_i = 0; |
|
746 |
+ length = s->inter_ac_vlc_length; |
|
747 |
+ last_length = s->inter_ac_vlc_last_length; |
|
748 |
+ } |
|
749 |
+ |
|
750 |
+ if (last >= start_i) { |
|
751 |
+ run = 0; |
|
752 |
+ for (i = start_i; i < last; i++) { |
|
753 |
+ int j = scantable[i]; |
|
754 |
+ level = temp[j]; |
|
755 |
+ |
|
756 |
+ if (level) { |
|
757 |
+ level += 64; |
|
758 |
+ if ((level & (~127)) == 0) |
|
759 |
+ bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
760 |
+ else |
|
761 |
+ bits += esc_length; |
|
762 |
+ run = 0; |
|
763 |
+ } else |
|
764 |
+ run++; |
|
765 |
+ } |
|
766 |
+ i = scantable[last]; |
|
767 |
+ |
|
768 |
+ level = temp[i] + 64; |
|
769 |
+ |
|
770 |
+ assert(level - 64); |
|
771 |
+ |
|
772 |
+ if ((level & (~127)) == 0) |
|
773 |
+ bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
774 |
+ else |
|
775 |
+ bits += esc_length; |
|
776 |
+ } |
|
777 |
+ |
|
778 |
+ return bits; |
|
779 |
+} |
|
780 |
+ |
|
781 |
+#define VSAD_INTRA(size) \ |
|
782 |
+static int vsad_intra ## size ## _c(MpegEncContext *c, \ |
|
783 |
+ uint8_t *s, uint8_t *dummy, \ |
|
784 |
+ int stride, int h) \ |
|
785 |
+{ \ |
|
786 |
+ int score = 0, x, y; \ |
|
787 |
+ \ |
|
788 |
+ for (y = 1; y < h; y++) { \ |
|
789 |
+ for (x = 0; x < size; x += 4) { \ |
|
790 |
+ score += FFABS(s[x] - s[x + stride]) + \ |
|
791 |
+ FFABS(s[x + 1] - s[x + stride + 1]) + \ |
|
792 |
+ FFABS(s[x + 2] - s[x + 2 + stride]) + \ |
|
793 |
+ FFABS(s[x + 3] - s[x + 3 + stride]); \ |
|
794 |
+ } \ |
|
795 |
+ s += stride; \ |
|
796 |
+ } \ |
|
797 |
+ \ |
|
798 |
+ return score; \ |
|
799 |
+} |
|
800 |
+VSAD_INTRA(8) |
|
801 |
+VSAD_INTRA(16) |
|
802 |
+ |
|
803 |
+static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, |
|
804 |
+ int stride, int h) |
|
805 |
+{ |
|
806 |
+ int score = 0, x, y; |
|
807 |
+ |
|
808 |
+ for (y = 1; y < h; y++) { |
|
809 |
+ for (x = 0; x < 16; x++) |
|
810 |
+ score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); |
|
811 |
+ s1 += stride; |
|
812 |
+ s2 += stride; |
|
813 |
+ } |
|
814 |
+ |
|
815 |
+ return score; |
|
816 |
+} |
|
817 |
+ |
|
818 |
+#define SQ(a) ((a) * (a)) |
|
819 |
+#define VSSE_INTRA(size) \ |
|
820 |
+static int vsse_intra ## size ## _c(MpegEncContext *c, \ |
|
821 |
+ uint8_t *s, uint8_t *dummy, \ |
|
822 |
+ int stride, int h) \ |
|
823 |
+{ \ |
|
824 |
+ int score = 0, x, y; \ |
|
825 |
+ \ |
|
826 |
+ for (y = 1; y < h; y++) { \ |
|
827 |
+ for (x = 0; x < size; x += 4) { \ |
|
828 |
+ score += SQ(s[x] - s[x + stride]) + \ |
|
829 |
+ SQ(s[x + 1] - s[x + stride + 1]) + \ |
|
830 |
+ SQ(s[x + 2] - s[x + stride + 2]) + \ |
|
831 |
+ SQ(s[x + 3] - s[x + stride + 3]); \ |
|
832 |
+ } \ |
|
833 |
+ s += stride; \ |
|
834 |
+ } \ |
|
835 |
+ \ |
|
836 |
+ return score; \ |
|
837 |
+} |
|
838 |
+VSSE_INTRA(8) |
|
839 |
+VSSE_INTRA(16) |
|
840 |
+ |
|
841 |
+static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, |
|
842 |
+ int stride, int h) |
|
843 |
+{ |
|
844 |
+ int score = 0, x, y; |
|
845 |
+ |
|
846 |
+ for (y = 1; y < h; y++) { |
|
847 |
+ for (x = 0; x < 16; x++) |
|
848 |
+ score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); |
|
849 |
+ s1 += stride; |
|
850 |
+ s2 += stride; |
|
851 |
+ } |
|
852 |
+ |
|
853 |
+ return score; |
|
854 |
+} |
|
855 |
+ |
|
856 |
+#define WRAPPER8_16_SQ(name8, name16) \ |
|
857 |
+static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ |
|
858 |
+ int stride, int h) \ |
|
859 |
+{ \ |
|
860 |
+ int score = 0; \ |
|
861 |
+ \ |
|
862 |
+ score += name8(s, dst, src, stride, 8); \ |
|
863 |
+ score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
864 |
+ if (h == 16) { \ |
|
865 |
+ dst += 8 * stride; \ |
|
866 |
+ src += 8 * stride; \ |
|
867 |
+ score += name8(s, dst, src, stride, 8); \ |
|
868 |
+ score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
869 |
+ } \ |
|
870 |
+ return score; \ |
|
871 |
+} |
|
872 |
+ |
|
873 |
+WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
|
874 |
+WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
|
875 |
+WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
|
876 |
+#if CONFIG_GPL |
|
877 |
+WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
|
878 |
+#endif |
|
879 |
+WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
|
880 |
+WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
|
881 |
+WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
|
882 |
+WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
|
883 |
+ |
|
884 |
+av_cold void ff_me_cmp_init_static(void) |
|
885 |
+{ |
|
886 |
+ int i; |
|
887 |
+ |
|
888 |
+ for (i = 0; i < 512; i++) |
|
889 |
+ ff_square_tab[i] = (i - 256) * (i - 256); |
|
890 |
+} |
|
891 |
+ |
|
892 |
+av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx) |
|
893 |
+{ |
|
894 |
+ c->sum_abs_dctelem = sum_abs_dctelem_c; |
|
895 |
+ |
|
896 |
+ /* TODO [0] 16 [1] 8 */ |
|
897 |
+ c->pix_abs[0][0] = pix_abs16_c; |
|
898 |
+ c->pix_abs[0][1] = pix_abs16_x2_c; |
|
899 |
+ c->pix_abs[0][2] = pix_abs16_y2_c; |
|
900 |
+ c->pix_abs[0][3] = pix_abs16_xy2_c; |
|
901 |
+ c->pix_abs[1][0] = pix_abs8_c; |
|
902 |
+ c->pix_abs[1][1] = pix_abs8_x2_c; |
|
903 |
+ c->pix_abs[1][2] = pix_abs8_y2_c; |
|
904 |
+ c->pix_abs[1][3] = pix_abs8_xy2_c; |
|
905 |
+ |
|
906 |
+#define SET_CMP_FUNC(name) \ |
|
907 |
+ c->name[0] = name ## 16_c; \ |
|
908 |
+ c->name[1] = name ## 8x8_c; |
|
909 |
+ |
|
910 |
+ SET_CMP_FUNC(hadamard8_diff) |
|
911 |
+ c->hadamard8_diff[4] = hadamard8_intra16_c; |
|
912 |
+ c->hadamard8_diff[5] = hadamard8_intra8x8_c; |
|
913 |
+ SET_CMP_FUNC(dct_sad) |
|
914 |
+ SET_CMP_FUNC(dct_max) |
|
915 |
+#if CONFIG_GPL |
|
916 |
+ SET_CMP_FUNC(dct264_sad) |
|
917 |
+#endif |
|
918 |
+ c->sad[0] = pix_abs16_c; |
|
919 |
+ c->sad[1] = pix_abs8_c; |
|
920 |
+ c->sse[0] = sse16_c; |
|
921 |
+ c->sse[1] = sse8_c; |
|
922 |
+ c->sse[2] = sse4_c; |
|
923 |
+ SET_CMP_FUNC(quant_psnr) |
|
924 |
+ SET_CMP_FUNC(rd) |
|
925 |
+ SET_CMP_FUNC(bit) |
|
926 |
+ c->vsad[0] = vsad16_c; |
|
927 |
+ c->vsad[4] = vsad_intra16_c; |
|
928 |
+ c->vsad[5] = vsad_intra8_c; |
|
929 |
+ c->vsse[0] = vsse16_c; |
|
930 |
+ c->vsse[4] = vsse_intra16_c; |
|
931 |
+ c->vsse[5] = vsse_intra8_c; |
|
932 |
+ c->nsse[0] = nsse16_c; |
|
933 |
+ c->nsse[1] = nsse8_c; |
|
934 |
+ |
|
935 |
+ if (ARCH_ARM) |
|
936 |
+ ff_me_cmp_init_arm(c, avctx); |
|
937 |
+ if (ARCH_PPC) |
|
938 |
+ ff_me_cmp_init_ppc(c, avctx); |
|
939 |
+ if (ARCH_X86) |
|
940 |
+ ff_me_cmp_init_x86(c, avctx); |
|
941 |
+} |
0 | 942 |
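
End of the new me_cmp.c. The WRAPPER8_16_SQ macro near its tail is worth a second look: every 16-wide comparator in the file is built by tiling the 8x8 kernel over two (h == 8) or four (h == 16) quadrants. A freestanding restatement of that tiling, using a plain SAD kernel in place of the real 8x8 functions:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef int (*cmp8_fn)(const uint8_t *a, const uint8_t *b,
                       int stride, int h);

static int sad8(const uint8_t *a, const uint8_t *b, int stride, int h)
{
    int s = 0;
    for (int y = 0; y < h; y++, a += stride, b += stride)
        for (int x = 0; x < 8; x++)
            s += abs(a[x] - b[x]);
    return s;
}

/* Same shape as WRAPPER8_16_SQ: left and right 8x8 halves first, then
 * the bottom two quadrants when the block is 16 rows tall. */
static int cmp16(cmp8_fn f, const uint8_t *a, const uint8_t *b,
                 int stride, int h)
{
    int score = f(a, b, stride, 8) + f(a + 8, b + 8, stride, 8);

    if (h == 16) {
        a += 8 * stride;
        b += 8 * stride;
        score += f(a, b, stride, 8) + f(a + 8, b + 8, stride, 8);
    }
    return score;
}

int main(void)
{
    uint8_t a[16 * 16] = { 0 }, b[16 * 16];

    for (int i = 0; i < 16 * 16; i++)
        b[i] = 1;
    printf("%d\n", cmp16(sad8, a, b, 16, 16)); /* 256 pixels * 1 */
    return 0;
}
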
new file mode 100644 |
... | ... |
@@ -0,0 +1,73 @@ |
0 |
+/* |
|
1 |
+ * This file is part of Libav. |
|
2 |
+ * |
|
3 |
+ * Libav is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * Libav is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with Libav; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#ifndef AVCODEC_ME_CMP_H |
|
19 |
+#define AVCODEC_ME_CMP_H |
|
20 |
+ |
|
21 |
+#include <stdint.h> |
|
22 |
+ |
|
23 |
+#include "avcodec.h" |
|
24 |
+ |
|
25 |
+extern uint32_t ff_square_tab[512]; |
|
26 |
+ |
|
27 |
+struct MpegEncContext; |
|
28 |
+/* Motion estimation: |
|
29 |
+ * h is limited to { width / 2, width, 2 * width }, |
|
30 |
+ * but never larger than 16 and never smaller than 2. |
|
31 |
+ * Although currently h < 4 is not used as functions with |
|
32 |
+ * width < 8 are neither used nor implemented. */ |
|
33 |
+typedef int (*me_cmp_func)(struct MpegEncContext *c, |
|
34 |
+ uint8_t *blk1 /* align width (8 or 16) */, |
|
35 |
+ uint8_t *blk2 /* align 1 */, int line_size, int h); |
|
36 |
+ |
|
37 |
+typedef struct MECmpContext { |
|
38 |
+ int (*sum_abs_dctelem)(int16_t *block /* align 16 */); |
|
39 |
+ |
|
40 |
+ me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ |
|
41 |
+ me_cmp_func sse[6]; |
|
42 |
+ me_cmp_func hadamard8_diff[6]; |
|
43 |
+ me_cmp_func dct_sad[6]; |
|
44 |
+ me_cmp_func quant_psnr[6]; |
|
45 |
+ me_cmp_func bit[6]; |
|
46 |
+ me_cmp_func rd[6]; |
|
47 |
+ me_cmp_func vsad[6]; |
|
48 |
+ me_cmp_func vsse[6]; |
|
49 |
+ me_cmp_func nsse[6]; |
|
50 |
+ me_cmp_func dct_max[6]; |
|
51 |
+ me_cmp_func dct264_sad[6]; |
|
52 |
+ |
|
53 |
+ me_cmp_func me_pre_cmp[6]; |
|
54 |
+ me_cmp_func me_cmp[6]; |
|
55 |
+ me_cmp_func me_sub_cmp[6]; |
|
56 |
+ me_cmp_func mb_cmp[6]; |
|
57 |
+ me_cmp_func ildct_cmp[6]; // only width 16 used |
|
58 |
+ me_cmp_func frame_skip_cmp[6]; // only width 8 used |
|
59 |
+ |
|
60 |
+ me_cmp_func pix_abs[2][4]; |
|
61 |
+} MECmpContext; |
|
62 |
+ |
|
63 |
+void ff_me_cmp_init_static(void); |
|
64 |
+ |
|
65 |
+void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx); |
|
66 |
+void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx); |
|
67 |
+void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx); |
|
68 |
+void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx); |
|
69 |
+ |
|
70 |
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type); |
|
71 |
+ |
|
72 |
+#endif /* AVCODEC_ME_CMP_H */ |
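
The motion_est.c hunks that follow consume these tables through a small indexing convention: pix_abs[size][(x ? 1 : 0) + (y ? 2 : 0)] in CHECK_SAD_HALF_MV picks the comparator matching the half-pel candidate, per the pix_abs[0][0..3] assignments in ff_me_cmp_init. A trivial standalone printout of that mapping:

#include <stdio.h>

/* Index 0..3 <-> interpolation flavour, matching the pix_abs16_c,
 * pix_abs16_x2_c, pix_abs16_y2_c, pix_abs16_xy2_c assignments. */
static const char *const flavour[4] = {
    "full-pel", "half-pel x", "half-pel y", "half-pel xy"
};

int main(void)
{
    for (int y = 0; y <= 1; y++)
        for (int x = 0; x <= 1; x++) {
            int idx = (x ? 1 : 0) + (y ? 2 : 0);
            printf("x=%d y=%d -> pix_abs[size][%d] (%s)\n",
                   x, y, idx, flavour[idx]);
        }
    return 0;
}
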
... | ... |
@@ -317,10 +317,10 @@ int ff_init_me(MpegEncContext *s){ |
317 | 317 |
av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n"); |
318 | 318 |
} |
319 | 319 |
|
320 |
- ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, c->avctx->me_pre_cmp); |
|
321 |
- ff_set_cmp(&s->dsp, s->dsp.me_cmp, c->avctx->me_cmp); |
|
322 |
- ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, c->avctx->me_sub_cmp); |
|
323 |
- ff_set_cmp(&s->dsp, s->dsp.mb_cmp, c->avctx->mb_cmp); |
|
320 |
+ ff_set_cmp(&s->mecc, s->mecc.me_pre_cmp, c->avctx->me_pre_cmp); |
|
321 |
+ ff_set_cmp(&s->mecc, s->mecc.me_cmp, c->avctx->me_cmp); |
|
322 |
+ ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, c->avctx->me_sub_cmp); |
|
323 |
+ ff_set_cmp(&s->mecc, s->mecc.mb_cmp, c->avctx->mb_cmp); |
|
324 | 324 |
|
325 | 325 |
c->flags = get_flags(c, 0, c->avctx->me_cmp &FF_CMP_CHROMA); |
326 | 326 |
c->sub_flags= get_flags(c, 0, c->avctx->me_sub_cmp&FF_CMP_CHROMA); |
... | ... |
@@ -361,12 +361,10 @@ int ff_init_me(MpegEncContext *s){ |
361 | 361 |
/* 8x8 fullpel search would need a 4x4 chroma compare, which we do |
362 | 362 |
* not have yet, and even if we had, the motion estimation code |
363 | 363 |
* does not expect it. */ |
364 |
- if((c->avctx->me_cmp&FF_CMP_CHROMA)/* && !s->dsp.me_cmp[2]*/){ |
|
365 |
- s->dsp.me_cmp[2]= zero_cmp; |
|
366 |
- } |
|
367 |
- if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){ |
|
368 |
- s->dsp.me_sub_cmp[2]= zero_cmp; |
|
369 |
- } |
|
364 |
+ if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */) |
|
365 |
+ s->mecc.me_cmp[2] = zero_cmp; |
|
366 |
+ if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2]) |
|
367 |
+ s->mecc.me_sub_cmp[2] = zero_cmp; |
|
370 | 368 |
c->hpel_put[2][0]= c->hpel_put[2][1]= |
371 | 369 |
c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel; |
372 | 370 |
|
... | ... |
@@ -379,7 +377,7 @@ int ff_init_me(MpegEncContext *s){ |
379 | 379 |
|
380 | 380 |
#define CHECK_SAD_HALF_MV(suffix, x, y) \ |
381 | 381 |
{\ |
382 |
- d= s->dsp.pix_abs[size][(x?1:0)+(y?2:0)](NULL, pix, ptr+((x)>>1), stride, h);\ |
|
382 |
+ d = s->mecc.pix_abs[size][(x ? 1 : 0) + (y ? 2 : 0)](NULL, pix, ptr + ((x) >> 1), stride, h); \ |
|
383 | 383 |
d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\ |
384 | 384 |
COPY3_IF_LT(dminh, d, dx, x, dy, y)\ |
385 | 385 |
} |
... | ... |
@@ -615,7 +613,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
615 | 615 |
|
616 | 616 |
dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h); |
617 | 617 |
|
618 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
618 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
619 | 619 |
int dxy; |
620 | 620 |
const int offset= ((block&1) + (block>>1)*stride)*8; |
621 | 621 |
uint8_t *dest_y = c->scratchpad + offset; |
... | ... |
@@ -657,8 +655,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
657 | 657 |
if(same) |
658 | 658 |
return INT_MAX; |
659 | 659 |
|
660 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
661 |
- dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*16*stride, c->scratchpad, stride, 16); |
|
660 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
661 |
+ dmin_sum += s->mecc.mb_cmp[0](s, |
|
662 |
+ s->new_picture.f->data[0] + |
|
663 |
+ s->mb_x * 16 + s->mb_y * 16 * stride, |
|
664 |
+ c->scratchpad, stride, 16); |
|
662 | 665 |
} |
663 | 666 |
|
664 | 667 |
if(c->avctx->mb_cmp&FF_CMP_CHROMA){ |
... | ... |
@@ -680,8 +681,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
680 | 680 |
s->hdsp.put_pixels_tab [1][dxy](c->scratchpad + 8, s->last_picture.f->data[2] + offset, s->uvlinesize, 8); |
681 | 681 |
} |
682 | 682 |
|
683 |
- dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad , s->uvlinesize, 8); |
|
684 |
- dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad+8, s->uvlinesize, 8); |
|
683 |
+ dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad, s->uvlinesize, 8); |
|
684 |
+ dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad + 8, s->uvlinesize, 8); |
|
685 | 685 |
} |
686 | 686 |
|
687 | 687 |
c->pred_x= mx; |
... | ... |
@@ -777,7 +778,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, |
777 | 777 |
mv_table[xy][0]= mx_i; |
778 | 778 |
mv_table[xy][1]= my_i; |
779 | 779 |
|
780 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
780 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
781 | 781 |
int dxy; |
782 | 782 |
|
783 | 783 |
//FIXME chroma ME |
... | ... |
@@ -789,7 +790,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, |
789 | 789 |
}else{ |
790 | 790 |
s->hdsp.put_pixels_tab [size][dxy](c->scratchpad, ref , stride, h); |
791 | 791 |
} |
792 |
- dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); |
|
792 |
+ dmin = s->mecc.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); |
|
793 | 793 |
dmin+= (mv_penalty[mx_i-c->pred_x] + mv_penalty[my_i-c->pred_y] + 1)*c->mb_penalty_factor; |
794 | 794 |
}else |
795 | 795 |
dmin+= c->mb_penalty_factor; //field_select bits |
... | ... |
@@ -940,7 +941,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, |
940 | 940 |
/* At this point (mx,my) are full-pel and give the relative displacement */
941 | 941 |
ppix = c->ref[0][0] + (my * s->linesize) + mx; |
942 | 942 |
|
943 |
- vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16); |
|
943 |
+ vard = s->mecc.sse[0](NULL, pix, ppix, s->linesize, 16); |
|
944 | 944 |
|
945 | 945 |
pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8; |
946 | 946 |
c->mc_mb_var_sum_temp += (vard+128)>>8; |
... | ... |
@@ -1037,7 +1038,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, |
1037 | 1037 |
*(uint32_t*)(&c->scratchpad[i*s->linesize+12]) = mean; |
1038 | 1038 |
} |
1039 | 1039 |
|
1040 |
- intra_score= s->dsp.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); |
|
1040 |
+ intra_score= s->mecc.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); |
|
1041 | 1041 |
} |
1042 | 1042 |
intra_score += c->mb_penalty_factor*16; |
1043 | 1043 |
|
... | ... |
@@ -1237,7 +1238,7 @@ static inline int check_bidir_mv(MpegEncContext * s, |
1237 | 1237 |
|
1238 | 1238 |
fbmin = (mv_penalty_f[motion_fx-pred_fx] + mv_penalty_f[motion_fy-pred_fy])*c->mb_penalty_factor |
1239 | 1239 |
+(mv_penalty_b[motion_bx-pred_bx] + mv_penalty_b[motion_by-pred_by])*c->mb_penalty_factor |
1240 |
- + s->dsp.mb_cmp[size](s, src_data[0], dest_y, stride, h); //FIXME new_pic |
|
1240 |
+ + s->mecc.mb_cmp[size](s, src_data[0], dest_y, stride, h); // FIXME new_pic |
|
1241 | 1241 |
|
1242 | 1242 |
if(c->avctx->mb_cmp&FF_CMP_CHROMA){ |
1243 | 1243 |
} |
... | ... |
@@ -63,8 +63,8 @@ static int hpel_motion_search(MpegEncContext * s, |
63 | 63 |
|
64 | 64 |
//FIXME factorize |
65 | 65 |
|
66 |
- cmp_sub= s->dsp.me_sub_cmp[size]; |
|
67 |
- chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; |
|
66 |
+ cmp_sub = s->mecc.me_sub_cmp[size]; |
|
67 |
+ chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; |
|
68 | 68 |
|
69 | 69 |
if(c->skip){ //FIXME move out of hpel? |
70 | 70 |
*mx_ptr = 0; |
... | ... |
@@ -166,7 +166,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, |
166 | 166 |
int src_index, int ref_index, int size, |
167 | 167 |
int h, int add_rate) |
168 | 168 |
{ |
169 |
-// const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp; |
|
170 | 169 |
MotionEstContext * const c= &s->me; |
171 | 170 |
const int penalty_factor= c->mb_penalty_factor; |
172 | 171 |
const int flags= c->mb_flags; |
... | ... |
@@ -179,8 +178,8 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, |
179 | 179 |
|
180 | 180 |
//FIXME factorize |
181 | 181 |
|
182 |
- cmp_sub= s->dsp.mb_cmp[size]; |
|
183 |
- chroma_cmp_sub= s->dsp.mb_cmp[size+1]; |
|
182 |
+ cmp_sub = s->mecc.mb_cmp[size]; |
|
183 |
+ chroma_cmp_sub = s->mecc.mb_cmp[size + 1]; |
|
184 | 184 |
|
185 | 185 |
// assert(!c->skip); |
186 | 186 |
// assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp); |
... | ... |
@@ -226,12 +225,12 @@ static int qpel_motion_search(MpegEncContext * s, |
226 | 226 |
LOAD_COMMON |
227 | 227 |
int flags= c->sub_flags; |
228 | 228 |
|
229 |
- cmpf= s->dsp.me_cmp[size]; |
|
230 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME |
|
229 |
+ cmpf = s->mecc.me_cmp[size]; |
|
230 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; // FIXME: factorize |
|
231 | 231 |
//FIXME factorize |
232 | 232 |
|
233 |
- cmp_sub= s->dsp.me_sub_cmp[size]; |
|
234 |
- chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; |
|
233 |
+ cmp_sub = s->mecc.me_sub_cmp[size]; |
|
234 |
+ chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; |
|
235 | 235 |
|
236 | 236 |
if(c->skip){ //FIXME somehow move up (benchmark) |
237 | 237 |
*mx_ptr = 0; |
... | ... |
@@ -427,8 +426,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best, |
427 | 427 |
LOAD_COMMON2 |
428 | 428 |
unsigned map_generation = c->map_generation; |
429 | 429 |
|
430 |
- cmpf= s->dsp.me_cmp[size]; |
|
431 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
430 |
+ cmpf = s->mecc.me_cmp[size]; |
|
431 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
432 | 432 |
|
433 | 433 |
{ /* ensure that the best point is in the MAP as h/qpel refinement needs it */ |
434 | 434 |
const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation; |
... | ... |
@@ -468,8 +467,8 @@ static int funny_diamond_search(MpegEncContext * s, int *best, int dmin, |
468 | 468 |
LOAD_COMMON2 |
469 | 469 |
unsigned map_generation = c->map_generation; |
470 | 470 |
|
471 |
- cmpf= s->dsp.me_cmp[size]; |
|
472 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
471 |
+ cmpf = s->mecc.me_cmp[size]; |
|
472 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
473 | 473 |
|
474 | 474 |
for(dia_size=1; dia_size<=4; dia_size++){ |
475 | 475 |
int dir; |
... | ... |
@@ -511,8 +510,8 @@ static int hex_search(MpegEncContext * s, int *best, int dmin, |
511 | 511 |
int x,y,d; |
512 | 512 |
const int dec= dia_size & (dia_size-1); |
513 | 513 |
|
514 |
- cmpf= s->dsp.me_cmp[size]; |
|
515 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
514 |
+ cmpf = s->mecc.me_cmp[size]; |
|
515 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
516 | 516 |
|
517 | 517 |
for(;dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){ |
518 | 518 |
do{ |
... | ... |
@@ -548,8 +547,8 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin, |
548 | 548 |
static const int hex[8][2]={{-2, 0}, {-1,-1}, { 0,-2}, { 1,-1}, |
549 | 549 |
{ 2, 0}, { 1, 1}, { 0, 2}, {-1, 1}}; |
550 | 550 |
|
551 |
- cmpf= s->dsp.me_cmp[size]; |
|
552 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
551 |
+ cmpf = s->mecc.me_cmp[size]; |
|
552 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
553 | 553 |
|
554 | 554 |
for(; dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){ |
555 | 555 |
do{ |
... | ... |
@@ -587,8 +586,8 @@ static int umh_search(MpegEncContext * s, int *best, int dmin, |
587 | 587 |
{-2, 3}, { 0, 4}, { 2, 3}, |
588 | 588 |
{-2,-3}, { 0,-4}, { 2,-3},}; |
589 | 589 |
|
590 |
- cmpf= s->dsp.me_cmp[size]; |
|
591 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
590 |
+ cmpf = s->mecc.me_cmp[size]; |
|
591 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
592 | 592 |
|
593 | 593 |
x= best[0]; |
594 | 594 |
y= best[1]; |
... | ... |
@@ -630,8 +629,8 @@ static int full_search(MpegEncContext * s, int *best, int dmin, |
630 | 630 |
int x,y, d; |
631 | 631 |
const int dia_size= c->dia_size&0xFF; |
632 | 632 |
|
633 |
- cmpf= s->dsp.me_cmp[size]; |
|
634 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
633 |
+ cmpf = s->mecc.me_cmp[size]; |
|
634 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
635 | 635 |
|
636 | 636 |
for(y=FFMAX(-dia_size, ymin); y<=FFMIN(dia_size,ymax); y++){ |
637 | 637 |
for(x=FFMAX(-dia_size, xmin); x<=FFMIN(dia_size,xmax); x++){ |
... | ... |
@@ -694,8 +693,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin, |
694 | 694 |
LOAD_COMMON2 |
695 | 695 |
unsigned map_generation = c->map_generation; |
696 | 696 |
|
697 |
- cmpf= s->dsp.me_cmp[size]; |
|
698 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
697 |
+ cmpf = s->mecc.me_cmp[size]; |
|
698 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
699 | 699 |
|
700 | 700 |
/*Note j<MAX_SAB_SIZE is needed if MAX_SAB_SIZE < ME_MAP_SIZE as j can |
701 | 701 |
become larger due to MVs overflowing their ME_MAP_MV_BITS bits space in map |
... | ... |
@@ -779,8 +778,8 @@ static int var_diamond_search(MpegEncContext * s, int *best, int dmin, |
779 | 779 |
LOAD_COMMON2 |
780 | 780 |
unsigned map_generation = c->map_generation; |
781 | 781 |
|
782 |
- cmpf= s->dsp.me_cmp[size]; |
|
783 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
782 |
+ cmpf = s->mecc.me_cmp[size]; |
|
783 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
784 | 784 |
|
785 | 785 |
for(dia_size=1; dia_size<=c->dia_size; dia_size++){ |
786 | 786 |
int dir, start, end; |
... | ... |
@@ -880,12 +879,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int |
880 | 880 |
|
881 | 881 |
if(c->pre_pass){ |
882 | 882 |
penalty_factor= c->pre_penalty_factor; |
883 |
- cmpf= s->dsp.me_pre_cmp[size]; |
|
884 |
- chroma_cmpf= s->dsp.me_pre_cmp[size+1]; |
|
883 |
+ cmpf = s->mecc.me_pre_cmp[size]; |
|
884 |
+ chroma_cmpf = s->mecc.me_pre_cmp[size + 1]; |
|
885 | 885 |
}else{ |
886 | 886 |
penalty_factor= c->penalty_factor; |
887 |
- cmpf= s->dsp.me_cmp[size]; |
|
888 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
887 |
+ cmpf = s->mecc.me_cmp[size]; |
|
888 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
889 | 889 |
} |
890 | 890 |
|
891 | 891 |
map_generation= update_map_generation(c); |
... | ... |
@@ -1009,8 +1008,8 @@ static int epzs_motion_search4(MpegEncContext * s, |
1009 | 1009 |
int flags= c->flags; |
1010 | 1010 |
LOAD_COMMON2 |
1011 | 1011 |
|
1012 |
- cmpf= s->dsp.me_cmp[size]; |
|
1013 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
1012 |
+ cmpf = s->mecc.me_cmp[size]; |
|
1013 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
1014 | 1014 |
|
1015 | 1015 |
map_generation= update_map_generation(c); |
1016 | 1016 |
|
... | ... |
@@ -1068,8 +1067,8 @@ static int epzs_motion_search2(MpegEncContext * s, |
1068 | 1068 |
int flags= c->flags; |
1069 | 1069 |
LOAD_COMMON2 |
1070 | 1070 |
|
1071 |
- cmpf= s->dsp.me_cmp[size]; |
|
1072 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
1071 |
+ cmpf = s->mecc.me_cmp[size]; |
|
1072 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
1073 | 1073 |
|
1074 | 1074 |
map_generation= update_map_generation(c); |
1075 | 1075 |
|
... | ... |
@@ -689,7 +689,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64], |
689 | 689 |
b_pic = pic->f->data[0] + offset; |
690 | 690 |
if (!pic->shared) |
691 | 691 |
b_pic += INPLACE_OFFSET; |
692 |
- diff = s->dsp.sad[0](NULL, p_pic, b_pic, s->linesize, 16); |
|
692 |
+ diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16); |
|
693 | 693 |
if (diff > s->qscale * 70) { // FIXME check that 70 is optimal |
694 | 694 |
s->mb_skipped = 0; |
695 | 695 |
break; |
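On the threshold in this hunk: the 16x16 luma SAD is compared against 70 * qscale, so the tolerance for keeping a macroblock skipped scales with the quantizer. As a worked number, at qscale 4 the limit is 280, i.e. an average absolute difference of about 280 / 256 ≈ 1.1 per pixel over the block.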
... | ... |
@@ -378,9 +378,9 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type, |
378 | 378 |
av_cold int ff_dct_common_init(MpegEncContext *s) |
379 | 379 |
{ |
380 | 380 |
ff_blockdsp_init(&s->bdsp, s->avctx); |
381 |
- ff_dsputil_init(&s->dsp, s->avctx); |
|
382 | 381 |
ff_hpeldsp_init(&s->hdsp, s->avctx->flags); |
383 | 382 |
ff_idctdsp_init(&s->idsp, s->avctx); |
383 |
+ ff_me_cmp_init(&s->mecc, s->avctx); |
|
384 | 384 |
ff_mpegvideodsp_init(&s->mdsp); |
385 | 385 |
ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample); |
386 | 386 |
|
... | ... |
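The ff_me_cmp_init() call added above takes over the comparison-table setup that ff_dsputil_init() used to do. A minimal sketch of the shape such an init function takes (the *_c fallback names and the per-arch entry point are assumptions for illustration; a PPC init of some form must exist, given the ppc/me_cmp.o Makefile rule later in this patch):

    av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
    {
        /* scalar C fallbacks first */
        c->sad[0] = pix_abs16_c;
        c->sad[1] = pix_abs8_c;
        c->sse[0] = sse16_c;
        c->sse[1] = sse8_c;
        /* ... remaining cmp tables filled the same way ... */

        /* then per-architecture overrides, as with the other DSP contexts */
        if (ARCH_PPC)
            ff_me_cmp_init_ppc(c, avctx);
    }
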
@@ -1051,7 +1051,7 @@ static int init_er(MpegEncContext *s) |
1051 | 1051 |
int i; |
1052 | 1052 |
|
1053 | 1053 |
er->avctx = s->avctx; |
1054 |
- er->dsp = &s->dsp; |
|
1054 |
+ er->mecc = &s->mecc; |
|
1055 | 1055 |
|
1056 | 1056 |
er->mb_index2xy = s->mb_index2xy; |
1057 | 1057 |
er->mb_num = s->mb_num; |
... | ... |
@@ -30,13 +30,13 @@ |
30 | 30 |
|
31 | 31 |
#include "avcodec.h" |
32 | 32 |
#include "blockdsp.h" |
33 |
-#include "dsputil.h" |
|
34 | 33 |
#include "error_resilience.h" |
35 | 34 |
#include "fdctdsp.h" |
36 | 35 |
#include "get_bits.h" |
37 | 36 |
#include "h263dsp.h" |
38 | 37 |
#include "hpeldsp.h" |
39 | 38 |
#include "idctdsp.h" |
39 |
+#include "me_cmp.h" |
|
40 | 40 |
#include "mpegvideodsp.h" |
41 | 41 |
#include "mpegvideoencdsp.h" |
42 | 42 |
#include "pixblockdsp.h" |
... | ... |
@@ -356,10 +356,10 @@ typedef struct MpegEncContext { |
356 | 356 |
int h263_long_vectors; ///< use horrible h263v1 long vector mode |
357 | 357 |
|
358 | 358 |
BlockDSPContext bdsp; |
359 |
- DSPContext dsp; ///< pointers for accelerated dsp functions |
|
360 | 359 |
FDCTDSPContext fdsp; |
361 | 360 |
HpelDSPContext hdsp; |
362 | 361 |
IDCTDSPContext idsp; |
362 |
+ MECmpContext mecc; |
|
363 | 363 |
MpegVideoDSPContext mdsp; |
364 | 364 |
MpegvideoEncDSPContext mpvencdsp; |
365 | 365 |
PixblockDSPContext pdsp; |
... | ... |
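For reference, the MECmpContext replacing the old DSPContext field is essentially the comparison slice of that struct. A sketch assembled from the fields this patch actually touches (array sizes assumed to match the old DSPContext layout):

    typedef int (*me_cmp_func)(struct MpegEncContext *c,
                               uint8_t *blk1, uint8_t *blk2,
                               int line_size, int h);

    typedef struct MECmpContext {
        me_cmp_func sad[6], sse[6], nsse[6];
        me_cmp_func me_pre_cmp[6], me_cmp[6], me_sub_cmp[6], mb_cmp[6];
        me_cmp_func ildct_cmp[6], frame_skip_cmp[6];
        me_cmp_func pix_abs[2][4];
    } MECmpContext;
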
@@ -702,6 +702,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) |
702 | 702 |
ff_MPV_encode_init_x86(s); |
703 | 703 |
|
704 | 704 |
ff_fdctdsp_init(&s->fdsp, avctx); |
705 |
+ ff_me_cmp_init(&s->mecc, avctx); |
|
705 | 706 |
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); |
706 | 707 |
ff_pixblockdsp_init(&s->pdsp, avctx); |
707 | 708 |
ff_qpeldsp_init(&s->qdsp); |
... | ... |
@@ -744,8 +745,8 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) |
744 | 744 |
|
745 | 745 |
s->quant_precision = 5; |
746 | 746 |
|
747 |
- ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp); |
|
748 |
- ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp); |
|
747 |
+ ff_set_cmp(&s->mecc, s->mecc.ildct_cmp, s->avctx->ildct_cmp); |
|
748 |
+ ff_set_cmp(&s->mecc, s->mecc.frame_skip_cmp, s->avctx->frame_skip_cmp); |
|
749 | 749 |
|
750 | 750 |
if (CONFIG_H261_ENCODER && s->out_format == FMT_H261) |
751 | 751 |
ff_h261_encode_init(s); |
... | ... |
@@ -895,8 +896,8 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, |
895 | 895 |
for (y = 0; y < h; y += 16) { |
896 | 896 |
for (x = 0; x < w; x += 16) { |
897 | 897 |
int offset = x + y * stride; |
898 |
- int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, |
|
899 |
- 16); |
|
898 |
+ int sad = s->mecc.sad[0](NULL, src + offset, ref + offset, |
|
899 |
+ stride, 16); |
|
900 | 900 |
int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; |
901 | 901 |
int sae = get_sae(src + offset, mean, stride); |
902 | 902 |
|
... | ... |
@@ -1053,7 +1054,7 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref) |
1053 | 1053 |
int off = p->shared ? 0 : 16; |
1054 | 1054 |
uint8_t *dptr = p->f->data[plane] + 8 * (x + y * stride) + off; |
1055 | 1055 |
uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride); |
1056 |
- int v = s->dsp.frame_skip_cmp[1](s, dptr, rptr, stride, 8); |
|
1056 |
+ int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8); |
|
1057 | 1057 |
|
1058 | 1058 |
switch (s->avctx->frame_skip_exp) { |
1059 | 1059 |
case 0: score = FFMAX(score, v); break; |
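The hunk context stops at the first case; the rest of this switch, unchanged by the patch, escalates how strongly large per-block differences v count. From the surrounding file, approximately:

    case 1: score   += FFABS(v);                  break; /* sum of |v|   */
    case 2: score   += v * v;                     break; /* sum of v^2   */
    case 3: score64 += FFABS(v * v * (int64_t)v); break; /* sum of |v|^3 */
    case 4: score64 += v * v * (int64_t)(v * v);  break; /* sum of v^4   */
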
... | ... |
@@ -1923,16 +1924,15 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
1923 | 1923 |
int progressive_score, interlaced_score; |
1924 | 1924 |
|
1925 | 1925 |
s->interlaced_dct = 0; |
1926 |
- progressive_score = s->dsp.ildct_cmp[4](s, ptr_y, |
|
1927 |
- NULL, wrap_y, 8) + |
|
1928 |
- s->dsp.ildct_cmp[4](s, ptr_y + wrap_y * 8, |
|
1929 |
- NULL, wrap_y, 8) - 400; |
|
1926 |
+ progressive_score = s->mecc.ildct_cmp[4](s, ptr_y, NULL, wrap_y, 8) + |
|
1927 |
+ s->mecc.ildct_cmp[4](s, ptr_y + wrap_y * 8, |
|
1928 |
+ NULL, wrap_y, 8) - 400; |
|
1930 | 1929 |
|
1931 | 1930 |
if (progressive_score > 0) { |
1932 |
- interlaced_score = s->dsp.ildct_cmp[4](s, ptr_y, |
|
1933 |
- NULL, wrap_y * 2, 8) + |
|
1934 |
- s->dsp.ildct_cmp[4](s, ptr_y + wrap_y, |
|
1935 |
- NULL, wrap_y * 2, 8); |
|
1931 |
+ interlaced_score = s->mecc.ildct_cmp[4](s, ptr_y, |
|
1932 |
+ NULL, wrap_y * 2, 8) + |
|
1933 |
+ s->mecc.ildct_cmp[4](s, ptr_y + wrap_y, |
|
1934 |
+ NULL, wrap_y * 2, 8); |
|
1936 | 1935 |
if (progressive_score > interlaced_score) { |
1937 | 1936 |
s->interlaced_dct = 1; |
1938 | 1937 |
|
... | ... |
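Both interlaced-DCT hunks in this file follow the same pattern: the configured ildct_cmp is run over 8-line blocks twice at the frame stride (progressive reading) and twice at double the stride (field reading); index [4] with a NULL second operand is the intra variant, while index [0] in the next hunk scores source against reconstruction. A condensed sketch of the decision, with a hypothetical cmp8 helper standing in for the configured metric:

    /* cmp8(p, stride) scores one 8-line block; doubling the stride
     * makes it score a single field of a 16-line macroblock. */
    progressive = cmp8(ptr_y, wrap_y) + cmp8(ptr_y + 8 * wrap_y, wrap_y) - 400;
    interlaced  = cmp8(ptr_y, 2 * wrap_y) + cmp8(ptr_y + wrap_y, 2 * wrap_y);
    if (progressive > 0 && progressive > interlaced)
        s->interlaced_dct = 1;  /* the -400 biases near-ties to progressive */
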
@@ -1996,23 +1996,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
1996 | 1996 |
int progressive_score, interlaced_score; |
1997 | 1997 |
|
1998 | 1998 |
s->interlaced_dct = 0; |
1999 |
- progressive_score = s->dsp.ildct_cmp[0](s, dest_y, |
|
2000 |
- ptr_y, wrap_y, |
|
2001 |
- 8) + |
|
2002 |
- s->dsp.ildct_cmp[0](s, dest_y + wrap_y * 8, |
|
2003 |
- ptr_y + wrap_y * 8, wrap_y, |
|
2004 |
- 8) - 400; |
|
1999 |
+ progressive_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, wrap_y, 8) + |
|
2000 |
+ s->mecc.ildct_cmp[0](s, dest_y + wrap_y * 8, |
|
2001 |
+ ptr_y + wrap_y * 8, |
|
2002 |
+ wrap_y, 8) - 400; |
|
2005 | 2003 |
|
2006 | 2004 |
if (s->avctx->ildct_cmp == FF_CMP_VSSE) |
2007 | 2005 |
progressive_score -= 400; |
2008 | 2006 |
|
2009 | 2007 |
if (progressive_score > 0) { |
2010 |
- interlaced_score = s->dsp.ildct_cmp[0](s, dest_y, |
|
2011 |
- ptr_y, |
|
2012 |
- wrap_y * 2, 8) + |
|
2013 |
- s->dsp.ildct_cmp[0](s, dest_y + wrap_y, |
|
2014 |
- ptr_y + wrap_y, |
|
2015 |
- wrap_y * 2, 8); |
|
2008 |
+ interlaced_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, |
|
2009 |
+ wrap_y * 2, 8) + |
|
2010 |
+ s->mecc.ildct_cmp[0](s, dest_y + wrap_y, |
|
2011 |
+ ptr_y + wrap_y, |
|
2012 |
+ wrap_y * 2, 8); |
|
2016 | 2013 |
|
2017 | 2014 |
if (progressive_score > interlaced_score) { |
2018 | 2015 |
s->interlaced_dct = 1; |
... | ... |
@@ -2049,33 +2046,28 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
2049 | 2049 |
if (s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] < |
2050 | 2050 |
2 * s->qscale * s->qscale) { |
2051 | 2051 |
// FIXME optimize |
2052 |
- if (s->dsp.sad[1](NULL, ptr_y , dest_y, |
|
2053 |
- wrap_y, 8) < 20 * s->qscale) |
|
2052 |
+ if (s->mecc.sad[1](NULL, ptr_y, dest_y, wrap_y, 8) < 20 * s->qscale) |
|
2054 | 2053 |
skip_dct[0] = 1; |
2055 |
- if (s->dsp.sad[1](NULL, ptr_y + 8, |
|
2056 |
- dest_y + 8, wrap_y, 8) < 20 * s->qscale) |
|
2054 |
+ if (s->mecc.sad[1](NULL, ptr_y + 8, dest_y + 8, wrap_y, 8) < 20 * s->qscale) |
|
2057 | 2055 |
skip_dct[1] = 1; |
2058 |
- if (s->dsp.sad[1](NULL, ptr_y + dct_offset, |
|
2059 |
- dest_y + dct_offset, wrap_y, 8) < 20 * s->qscale) |
|
2056 |
+ if (s->mecc.sad[1](NULL, ptr_y + dct_offset, dest_y + dct_offset, |
|
2057 |
+ wrap_y, 8) < 20 * s->qscale) |
|
2060 | 2058 |
skip_dct[2] = 1; |
2061 |
- if (s->dsp.sad[1](NULL, ptr_y + dct_offset + 8, |
|
2062 |
- dest_y + dct_offset + 8, |
|
2063 |
- wrap_y, 8) < 20 * s->qscale) |
|
2059 |
+ if (s->mecc.sad[1](NULL, ptr_y + dct_offset + 8, dest_y + dct_offset + 8, |
|
2060 |
+ wrap_y, 8) < 20 * s->qscale) |
|
2064 | 2061 |
skip_dct[3] = 1; |
2065 |
- if (s->dsp.sad[1](NULL, ptr_cb, dest_cb, |
|
2066 |
- wrap_c, 8) < 20 * s->qscale) |
|
2062 |
+ if (s->mecc.sad[1](NULL, ptr_cb, dest_cb, wrap_c, 8) < 20 * s->qscale) |
|
2067 | 2063 |
skip_dct[4] = 1; |
2068 |
- if (s->dsp.sad[1](NULL, ptr_cr, dest_cr, |
|
2069 |
- wrap_c, 8) < 20 * s->qscale) |
|
2064 |
+ if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale) |
|
2070 | 2065 |
skip_dct[5] = 1; |
2071 | 2066 |
if (!s->chroma_y_shift) { /* 422 */ |
2072 |
- if (s->dsp.sad[1](NULL, ptr_cb + (dct_offset >> 1), |
|
2073 |
- dest_cb + (dct_offset >> 1), |
|
2074 |
- wrap_c, 8) < 20 * s->qscale) |
|
2067 |
+ if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1), |
|
2068 |
+ dest_cb + (dct_offset >> 1), |
|
2069 |
+ wrap_c, 8) < 20 * s->qscale) |
|
2075 | 2070 |
skip_dct[6] = 1; |
2076 |
- if (s->dsp.sad[1](NULL, ptr_cr + (dct_offset >> 1), |
|
2077 |
- dest_cr + (dct_offset >> 1), |
|
2078 |
- wrap_c, 8) < 20 * s->qscale) |
|
2071 |
+ if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1), |
|
2072 |
+ dest_cr + (dct_offset >> 1), |
|
2073 |
+ wrap_c, 8) < 20 * s->qscale) |
|
2079 | 2074 |
skip_dct[7] = 1; |
2080 | 2075 |
} |
2081 | 2076 |
} |
... | ... |
@@ -2340,9 +2332,9 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in |
2340 | 2340 |
int x,y; |
2341 | 2341 |
|
2342 | 2342 |
if(w==16 && h==16) |
2343 |
- return s->dsp.sse[0](NULL, src1, src2, stride, 16); |
|
2343 |
+ return s->mecc.sse[0](NULL, src1, src2, stride, 16); |
|
2344 | 2344 |
else if(w==8 && h==8) |
2345 |
- return s->dsp.sse[1](NULL, src1, src2, stride, 8); |
|
2345 |
+ return s->mecc.sse[1](NULL, src1, src2, stride, 8); |
|
2346 | 2346 |
|
2347 | 2347 |
for(y=0; y<h; y++){ |
2348 | 2348 |
for(x=0; x<w; x++){ |
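The scalar tail of sse(), truncated by the hunk context above, accumulates squared pixel differences; an equivalent form of the loop, assuming plain arithmetic where the real code may use a square lookup table:

    int acc = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            const int d = src1[x] - src2[x];
            acc += d * d;
        }
        src1 += stride;
        src2 += stride;
    }
    return acc;
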
... | ... |
@@ -2364,13 +2356,13 @@ static int sse_mb(MpegEncContext *s){ |
2364 | 2364 |
|
2365 | 2365 |
if(w==16 && h==16) |
2366 | 2366 |
if(s->avctx->mb_cmp == FF_CMP_NSSE){ |
2367 |
- return s->dsp.nsse[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) |
|
2368 |
- +s->dsp.nsse[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) |
|
2369 |
- +s->dsp.nsse[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); |
|
2367 |
+ return s->mecc.nsse[0](s, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + |
|
2368 |
+ s->mecc.nsse[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + |
|
2369 |
+ s->mecc.nsse[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); |
|
2370 | 2370 |
}else{ |
2371 |
- return s->dsp.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) |
|
2372 |
- +s->dsp.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) |
|
2373 |
- +s->dsp.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); |
|
2371 |
+ return s->mecc.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + |
|
2372 |
+ s->mecc.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + |
|
2373 |
+ s->mecc.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); |
|
2374 | 2374 |
} |
2375 | 2375 |
else |
2376 | 2376 |
return sse(s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize) |
... | ... |
@@ -24,8 +24,8 @@ |
24 | 24 |
#include "libavutil/attributes.h" |
25 | 25 |
#include "libavutil/imgutils.h" |
26 | 26 |
#include "avcodec.h" |
27 |
-#include "dsputil.h" |
|
28 | 27 |
#include "imgconvert.h" |
28 |
+#include "me_cmp.h" |
|
29 | 29 |
#include "mpegvideoencdsp.h" |
30 | 30 |
|
31 | 31 |
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], |
... | ... |
@@ -2,7 +2,6 @@ OBJS += ppc/fmtconvert_altivec.o \ |
2 | 2 |
|
3 | 3 |
OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o |
4 | 4 |
OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o |
5 |
-OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_altivec.o |
|
6 | 5 |
OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o |
7 | 6 |
OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o |
8 | 7 |
OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o |
... | ... |
@@ -11,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o |
11 | 11 |
OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o |
12 | 12 |
OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o |
13 | 13 |
OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o |
14 |
+OBJS-$(CONFIG_ME_CMP) += ppc/me_cmp.o |
|
14 | 15 |
OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o |
15 | 16 |
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ |
16 | 17 |
ppc/mpegvideodsp.o |
17 | 18 |
deleted file mode 100644 |
... | ... |
@@ -1,767 +0,0 @@ |
1 |
-/* |
|
2 |
- * Copyright (c) 2002 Brian Foley |
|
3 |
- * Copyright (c) 2002 Dieter Shirley |
|
4 |
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
|
5 |
- * |
|
6 |
- * This file is part of Libav. |
|
7 |
- * |
|
8 |
- * Libav is free software; you can redistribute it and/or |
|
9 |
- * modify it under the terms of the GNU Lesser General Public |
|
10 |
- * License as published by the Free Software Foundation; either |
|
11 |
- * version 2.1 of the License, or (at your option) any later version. |
|
12 |
- * |
|
13 |
- * Libav is distributed in the hope that it will be useful, |
|
14 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
- * Lesser General Public License for more details. |
|
17 |
- * |
|
18 |
- * You should have received a copy of the GNU Lesser General Public |
|
19 |
- * License along with Libav; if not, write to the Free Software |
|
20 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
- */ |
|
22 |
- |
|
23 |
-#include "config.h" |
|
24 |
-#if HAVE_ALTIVEC_H |
|
25 |
-#include <altivec.h> |
|
26 |
-#endif |
|
27 |
- |
|
28 |
-#include "libavutil/attributes.h" |
|
29 |
-#include "libavutil/cpu.h" |
|
30 |
-#include "libavutil/ppc/cpu.h" |
|
31 |
-#include "libavutil/ppc/types_altivec.h" |
|
32 |
-#include "libavutil/ppc/util_altivec.h" |
|
33 |
-#include "libavcodec/avcodec.h" |
|
34 |
-#include "libavcodec/dsputil.h" |
|
35 |
-#include "libavcodec/mpegvideo.h" |
|
36 |
- |
|
37 |
-#if HAVE_ALTIVEC |
|
38 |
-static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
39 |
- int line_size, int h) |
|
40 |
-{ |
|
41 |
- int i, s = 0; |
|
42 |
- const vector unsigned char zero = |
|
43 |
- (const vector unsigned char) vec_splat_u8(0); |
|
44 |
- vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
45 |
- vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
46 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
47 |
- vector signed int sumdiffs; |
|
48 |
- |
|
49 |
- for (i = 0; i < h; i++) { |
|
50 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
51 |
- * pix1v: pix1[0] - pix1[15] |
|
52 |
- * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ |
|
53 |
- vector unsigned char pix1v = vec_ld(0, pix1); |
|
54 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
55 |
- vector unsigned char pix2r = vec_ld(16, pix2); |
|
56 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
57 |
- vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
58 |
- |
|
59 |
- /* Calculate the average vector. */ |
|
60 |
- vector unsigned char avgv = vec_avg(pix2v, pix2iv); |
|
61 |
- |
|
62 |
- /* Calculate a sum of abs differences vector. */ |
|
63 |
- vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), |
|
64 |
- vec_min(pix1v, avgv)); |
|
65 |
- |
|
66 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
67 |
- sad = vec_sum4s(t5, sad); |
|
68 |
- |
|
69 |
- pix1 += line_size; |
|
70 |
- pix2 += line_size; |
|
71 |
- } |
|
72 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
73 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
74 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
75 |
- vec_ste(sumdiffs, 0, &s); |
|
76 |
- |
|
77 |
- return s; |
|
78 |
-} |
|
79 |
- |
|
80 |
-static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
81 |
- int line_size, int h) |
|
82 |
-{ |
|
83 |
- int i, s = 0; |
|
84 |
- const vector unsigned char zero = |
|
85 |
- (const vector unsigned char) vec_splat_u8(0); |
|
86 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
87 |
- vector unsigned char pix1v, pix3v, avgv, t5; |
|
88 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
89 |
- vector signed int sumdiffs; |
|
90 |
- uint8_t *pix3 = pix2 + line_size; |
|
91 |
- |
|
92 |
- /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
93 |
- * iteration becomes pix2 in the next iteration. We can use this |
|
94 |
- * fact to avoid a potentially expensive unaligned read, each |
|
95 |
- * time around the loop. |
|
96 |
- * Read unaligned pixels into our vectors. The vectors are as follows: |
|
97 |
- * pix2v: pix2[0] - pix2[15] |
|
98 |
- * Split the pixel vectors into shorts. */ |
|
99 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
100 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
101 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); |
|
102 |
- |
|
103 |
- for (i = 0; i < h; i++) { |
|
104 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
105 |
- * pix1v: pix1[0] - pix1[15] |
|
106 |
- * pix3v: pix3[0] - pix3[15] */ |
|
107 |
- pix1v = vec_ld(0, pix1); |
|
108 |
- |
|
109 |
- pix2l = vec_ld(0, pix3); |
|
110 |
- pix2r = vec_ld(15, pix3); |
|
111 |
- pix3v = vec_perm(pix2l, pix2r, perm); |
|
112 |
- |
|
113 |
- /* Calculate the average vector. */ |
|
114 |
- avgv = vec_avg(pix2v, pix3v); |
|
115 |
- |
|
116 |
- /* Calculate a sum of abs differences vector. */ |
|
117 |
- t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
118 |
- |
|
119 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
120 |
- sad = vec_sum4s(t5, sad); |
|
121 |
- |
|
122 |
- pix1 += line_size; |
|
123 |
- pix2v = pix3v; |
|
124 |
- pix3 += line_size; |
|
125 |
- } |
|
126 |
- |
|
127 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
128 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
129 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
130 |
- vec_ste(sumdiffs, 0, &s); |
|
131 |
- return s; |
|
132 |
-} |
|
133 |
- |
|
134 |
-static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
135 |
- int line_size, int h) |
|
136 |
-{ |
|
137 |
- int i, s = 0; |
|
138 |
- uint8_t *pix3 = pix2 + line_size; |
|
139 |
- const vector unsigned char zero = |
|
140 |
- (const vector unsigned char) vec_splat_u8(0); |
|
141 |
- const vector unsigned short two = |
|
142 |
- (const vector unsigned short) vec_splat_u16(2); |
|
143 |
- vector unsigned char avgv, t5; |
|
144 |
- vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
145 |
- vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
146 |
- vector unsigned char pix1v, pix3v, pix3iv; |
|
147 |
- vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
|
148 |
- vector unsigned short avghv, avglv; |
|
149 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
150 |
- vector signed int sumdiffs; |
|
151 |
- |
|
152 |
- /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
153 |
- * iteration becomes pix2 in the next iteration. We can use this |
|
154 |
- * fact to avoid a potentially expensive unaligned read, as well |
|
155 |
- * as some splitting, and vector addition each time around the loop. |
|
156 |
- * Read unaligned pixels into our vectors. The vectors are as follows: |
|
157 |
- * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] |
|
158 |
- * Split the pixel vectors into shorts. */ |
|
159 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
160 |
- vector unsigned char pix2r = vec_ld(16, pix2); |
|
161 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
162 |
- vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
163 |
- |
|
164 |
- vector unsigned short pix2hv = |
|
165 |
- (vector unsigned short) vec_mergeh(zero, pix2v); |
|
166 |
- vector unsigned short pix2lv = |
|
167 |
- (vector unsigned short) vec_mergel(zero, pix2v); |
|
168 |
- vector unsigned short pix2ihv = |
|
169 |
- (vector unsigned short) vec_mergeh(zero, pix2iv); |
|
170 |
- vector unsigned short pix2ilv = |
|
171 |
- (vector unsigned short) vec_mergel(zero, pix2iv); |
|
172 |
- vector unsigned short t1 = vec_add(pix2hv, pix2ihv); |
|
173 |
- vector unsigned short t2 = vec_add(pix2lv, pix2ilv); |
|
174 |
- vector unsigned short t3, t4; |
|
175 |
- |
|
176 |
- for (i = 0; i < h; i++) { |
|
177 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
178 |
- * pix1v: pix1[0] - pix1[15] |
|
179 |
- * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ |
|
180 |
- pix1v = vec_ld(0, pix1); |
|
181 |
- |
|
182 |
- pix2l = vec_ld(0, pix3); |
|
183 |
- pix2r = vec_ld(16, pix3); |
|
184 |
- pix3v = vec_perm(pix2l, pix2r, perm1); |
|
185 |
- pix3iv = vec_perm(pix2l, pix2r, perm2); |
|
186 |
- |
|
187 |
- /* Note that AltiVec does have vec_avg, but this works on vector pairs |
|
188 |
- * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the |
|
189 |
- * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when |
|
190 |
- * it should be 1. Instead, we have to split the pixel vectors into |
|
191 |
- * vectors of shorts and do the averaging by hand. */ |
|
192 |
- |
|
193 |
- /* Split the pixel vectors into shorts. */ |
|
194 |
- pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
|
195 |
- pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
|
196 |
- pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
|
197 |
- pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
|
198 |
- |
|
199 |
- /* Do the averaging on them. */ |
|
200 |
- t3 = vec_add(pix3hv, pix3ihv); |
|
201 |
- t4 = vec_add(pix3lv, pix3ilv); |
|
202 |
- |
|
203 |
- avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
|
204 |
- avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); |
|
205 |
- |
|
206 |
- /* Pack the shorts back into a result. */ |
|
207 |
- avgv = vec_pack(avghv, avglv); |
|
208 |
- |
|
209 |
- /* Calculate a sum of abs differences vector. */ |
|
210 |
- t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
211 |
- |
|
212 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
213 |
- sad = vec_sum4s(t5, sad); |
|
214 |
- |
|
215 |
- pix1 += line_size; |
|
216 |
- pix3 += line_size; |
|
217 |
- /* Transfer the calculated values for pix3 into pix2. */ |
|
218 |
- t1 = t3; |
|
219 |
- t2 = t4; |
|
220 |
- } |
|
221 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
222 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
223 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
224 |
- vec_ste(sumdiffs, 0, &s); |
|
225 |
- |
|
226 |
- return s; |
|
227 |
-} |
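Worked through, the rounding problem described in the comment inside this function: vec_avg rounds up, so pairwise averaging of the four pixels (3, 0, 0, 1) gives

    avg(3, 0)     = (3 + 0 + 1) >> 1 = 2
    avg(0, 1)     = (0 + 1 + 1) >> 1 = 1
    avg(2, 1)     = (2 + 1 + 1) >> 1 = 2    (rounded up three times)
    correct value = (3 + 0 + 0 + 1 + 2) >> 2 = 1

hence the detour through 16-bit shorts with a single +2 rounding term.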
|
228 |
- |
|
229 |
-static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
230 |
- int line_size, int h) |
|
231 |
-{ |
|
232 |
- int i, s; |
|
233 |
- const vector unsigned int zero = |
|
234 |
- (const vector unsigned int) vec_splat_u32(0); |
|
235 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
236 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
237 |
- vector signed int sumdiffs; |
|
238 |
- |
|
239 |
- for (i = 0; i < h; i++) { |
|
240 |
- /* Read potentially unaligned pixels into t1 and t2. */ |
|
241 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
242 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
243 |
- vector unsigned char t1 = vec_ld(0, pix1); |
|
244 |
- vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
245 |
- |
|
246 |
- /* Calculate a sum of abs differences vector. */ |
|
247 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
248 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
249 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
250 |
- |
|
251 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
252 |
- sad = vec_sum4s(t5, sad); |
|
253 |
- |
|
254 |
- pix1 += line_size; |
|
255 |
- pix2 += line_size; |
|
256 |
- } |
|
257 |
- |
|
258 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
259 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
260 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
261 |
- vec_ste(sumdiffs, 0, &s); |
|
262 |
- |
|
263 |
- return s; |
|
264 |
-} |
|
265 |
- |
|
266 |
-static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
267 |
- int line_size, int h) |
|
268 |
-{ |
|
269 |
- int i, s; |
|
270 |
- const vector unsigned int zero = |
|
271 |
- (const vector unsigned int) vec_splat_u32(0); |
|
272 |
- const vector unsigned char permclear = |
|
273 |
- (vector unsigned char) |
|
274 |
- { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
275 |
- vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
276 |
- vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
277 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
278 |
- vector signed int sumdiffs; |
|
279 |
- |
|
280 |
- for (i = 0; i < h; i++) { |
|
281 |
- /* Read potentially unaligned pixels into t1 and t2. |
|
282 |
- * Since we're reading 16 pixels, and actually only want 8, |
|
283 |
- * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
284 |
- vector unsigned char pix1l = vec_ld(0, pix1); |
|
285 |
- vector unsigned char pix1r = vec_ld(7, pix1); |
|
286 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
287 |
- vector unsigned char pix2r = vec_ld(7, pix2); |
|
288 |
- vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
289 |
- permclear); |
|
290 |
- vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
291 |
- permclear); |
|
292 |
- |
|
293 |
- /* Calculate a sum of abs differences vector. */ |
|
294 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
295 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
296 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
297 |
- |
|
298 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
299 |
- sad = vec_sum4s(t5, sad); |
|
300 |
- |
|
301 |
- pix1 += line_size; |
|
302 |
- pix2 += line_size; |
|
303 |
- } |
|
304 |
- |
|
305 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
306 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
307 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
308 |
- vec_ste(sumdiffs, 0, &s); |
|
309 |
- |
|
310 |
- return s; |
|
311 |
-} |
|
312 |
- |
|
313 |
-/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. |
|
314 |
- * It's the sad8_altivec code above w/ squaring added. */ |
|
315 |
-static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
316 |
- int line_size, int h) |
|
317 |
-{ |
|
318 |
- int i, s; |
|
319 |
- const vector unsigned int zero = |
|
320 |
- (const vector unsigned int) vec_splat_u32(0); |
|
321 |
- const vector unsigned char permclear = |
|
322 |
- (vector unsigned char) |
|
323 |
- { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
324 |
- vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
325 |
- vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
326 |
- vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
327 |
- vector signed int sumsqr; |
|
328 |
- |
|
329 |
- for (i = 0; i < h; i++) { |
|
330 |
- /* Read potentially unaligned pixels into t1 and t2. |
|
331 |
- * Since we're reading 16 pixels, and actually only want 8, |
|
332 |
- * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
333 |
- vector unsigned char pix1l = vec_ld(0, pix1); |
|
334 |
- vector unsigned char pix1r = vec_ld(7, pix1); |
|
335 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
336 |
- vector unsigned char pix2r = vec_ld(7, pix2); |
|
337 |
- vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
338 |
- permclear); |
|
339 |
- vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
340 |
- permclear); |
|
341 |
- |
|
342 |
- /* Since we want to use unsigned chars, we can take advantage |
|
343 |
- * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
344 |
- |
|
345 |
- /* Calculate abs differences vector. */ |
|
346 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
347 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
348 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
349 |
- |
|
350 |
- /* Square the values and add them to our sum. */ |
|
351 |
- sum = vec_msum(t5, t5, sum); |
|
352 |
- |
|
353 |
- pix1 += line_size; |
|
354 |
- pix2 += line_size; |
|
355 |
- } |
|
356 |
- |
|
357 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
358 |
- sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
359 |
- sumsqr = vec_splat(sumsqr, 3); |
|
360 |
- vec_ste(sumsqr, 0, &s); |
|
361 |
- |
|
362 |
- return s; |
|
363 |
-} |
|
364 |
- |
|
365 |
-/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. |
|
366 |
- * It's the sad16_altivec code above w/ squaring added. */ |
|
367 |
-static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
368 |
- int line_size, int h) |
|
369 |
-{ |
|
370 |
- int i, s; |
|
371 |
- const vector unsigned int zero = |
|
372 |
- (const vector unsigned int) vec_splat_u32(0); |
|
373 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
374 |
- vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
375 |
- vector signed int sumsqr; |
|
376 |
- |
|
377 |
- for (i = 0; i < h; i++) { |
|
378 |
- /* Read potentially unaligned pixels into t1 and t2. */ |
|
379 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
380 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
381 |
- vector unsigned char t1 = vec_ld(0, pix1); |
|
382 |
- vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
383 |
- |
|
384 |
- /* Since we want to use unsigned chars, we can take advantage |
|
385 |
- * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
386 |
- |
|
387 |
- /* Calculate abs differences vector. */ |
|
388 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
389 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
390 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
391 |
- |
|
392 |
- /* Square the values and add them to our sum. */ |
|
393 |
- sum = vec_msum(t5, t5, sum); |
|
394 |
- |
|
395 |
- pix1 += line_size; |
|
396 |
- pix2 += line_size; |
|
397 |
- } |
|
398 |
- |
|
399 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
400 |
- sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
401 |
- sumsqr = vec_splat(sumsqr, 3); |
|
402 |
- vec_ste(sumsqr, 0, &s); |
|
403 |
- |
|
404 |
- return s; |
|
405 |
-} |
|
406 |
- |
|
407 |
-static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
408 |
- uint8_t *src, int stride, int h) |
|
409 |
-{ |
|
410 |
- int sum; |
|
411 |
- register const vector unsigned char vzero = |
|
412 |
- (const vector unsigned char) vec_splat_u8(0); |
|
413 |
- register vector signed short temp0, temp1, temp2, temp3, temp4, |
|
414 |
- temp5, temp6, temp7; |
|
415 |
- { |
|
416 |
- register const vector signed short vprod1 = |
|
417 |
- (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
418 |
- register const vector signed short vprod2 = |
|
419 |
- (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
420 |
- register const vector signed short vprod3 = |
|
421 |
- (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
422 |
- register const vector unsigned char perm1 = |
|
423 |
- (const vector unsigned char) |
|
424 |
- { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
425 |
- 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
426 |
- register const vector unsigned char perm2 = |
|
427 |
- (const vector unsigned char) |
|
428 |
- { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
429 |
- 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
430 |
- register const vector unsigned char perm3 = |
|
431 |
- (const vector unsigned char) |
|
432 |
- { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
433 |
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
434 |
- |
|
435 |
-#define ONEITERBUTTERFLY(i, res) \ |
|
436 |
- { \ |
|
437 |
- register vector unsigned char src1 = vec_ld(stride * i, src); \ |
|
438 |
- register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ |
|
439 |
- register vector unsigned char srcO = \ |
|
440 |
- vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
441 |
- register vector unsigned char dst1 = vec_ld(stride * i, dst); \ |
|
442 |
- register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ |
|
443 |
- register vector unsigned char dstO = \ |
|
444 |
- vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
445 |
- \ |
|
446 |
- /* Promote the unsigned chars to signed shorts. */ \ |
|
447 |
- /* We're in the 8x8 function, we only care for the first 8. */ \ |
|
448 |
- register vector signed short srcV = \ |
|
449 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
450 |
- (vector signed char) srcO); \ |
|
451 |
- register vector signed short dstV = \ |
|
452 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
453 |
- (vector signed char) dstO); \ |
|
454 |
- \ |
|
455 |
- /* subtractions inside the first butterfly */ \ |
|
456 |
- register vector signed short but0 = vec_sub(srcV, dstV); \ |
|
457 |
- register vector signed short op1 = vec_perm(but0, but0, perm1); \ |
|
458 |
- register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ |
|
459 |
- register vector signed short op2 = vec_perm(but1, but1, perm2); \ |
|
460 |
- register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ |
|
461 |
- register vector signed short op3 = vec_perm(but2, but2, perm3); \ |
|
462 |
- res = vec_mladd(but2, vprod3, op3); \ |
|
463 |
- } |
|
464 |
- ONEITERBUTTERFLY(0, temp0); |
|
465 |
- ONEITERBUTTERFLY(1, temp1); |
|
466 |
- ONEITERBUTTERFLY(2, temp2); |
|
467 |
- ONEITERBUTTERFLY(3, temp3); |
|
468 |
- ONEITERBUTTERFLY(4, temp4); |
|
469 |
- ONEITERBUTTERFLY(5, temp5); |
|
470 |
- ONEITERBUTTERFLY(6, temp6); |
|
471 |
- ONEITERBUTTERFLY(7, temp7); |
|
472 |
- } |
|
473 |
-#undef ONEITERBUTTERFLY |
|
474 |
- { |
|
475 |
- register vector signed int vsum; |
|
476 |
- register vector signed short line0 = vec_add(temp0, temp1); |
|
477 |
- register vector signed short line1 = vec_sub(temp0, temp1); |
|
478 |
- register vector signed short line2 = vec_add(temp2, temp3); |
|
479 |
- register vector signed short line3 = vec_sub(temp2, temp3); |
|
480 |
- register vector signed short line4 = vec_add(temp4, temp5); |
|
481 |
- register vector signed short line5 = vec_sub(temp4, temp5); |
|
482 |
- register vector signed short line6 = vec_add(temp6, temp7); |
|
483 |
- register vector signed short line7 = vec_sub(temp6, temp7); |
|
484 |
- |
|
485 |
- register vector signed short line0B = vec_add(line0, line2); |
|
486 |
- register vector signed short line2B = vec_sub(line0, line2); |
|
487 |
- register vector signed short line1B = vec_add(line1, line3); |
|
488 |
- register vector signed short line3B = vec_sub(line1, line3); |
|
489 |
- register vector signed short line4B = vec_add(line4, line6); |
|
490 |
- register vector signed short line6B = vec_sub(line4, line6); |
|
491 |
- register vector signed short line5B = vec_add(line5, line7); |
|
492 |
- register vector signed short line7B = vec_sub(line5, line7); |
|
493 |
- |
|
494 |
- register vector signed short line0C = vec_add(line0B, line4B); |
|
495 |
- register vector signed short line4C = vec_sub(line0B, line4B); |
|
496 |
- register vector signed short line1C = vec_add(line1B, line5B); |
|
497 |
- register vector signed short line5C = vec_sub(line1B, line5B); |
|
498 |
- register vector signed short line2C = vec_add(line2B, line6B); |
|
499 |
- register vector signed short line6C = vec_sub(line2B, line6B); |
|
500 |
- register vector signed short line3C = vec_add(line3B, line7B); |
|
501 |
- register vector signed short line7C = vec_sub(line3B, line7B); |
|
502 |
- |
|
503 |
- vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
|
504 |
- vsum = vec_sum4s(vec_abs(line1C), vsum); |
|
505 |
- vsum = vec_sum4s(vec_abs(line2C), vsum); |
|
506 |
- vsum = vec_sum4s(vec_abs(line3C), vsum); |
|
507 |
- vsum = vec_sum4s(vec_abs(line4C), vsum); |
|
508 |
- vsum = vec_sum4s(vec_abs(line5C), vsum); |
|
509 |
- vsum = vec_sum4s(vec_abs(line6C), vsum); |
|
510 |
- vsum = vec_sum4s(vec_abs(line7C), vsum); |
|
511 |
- vsum = vec_sums(vsum, (vector signed int) vzero); |
|
512 |
- vsum = vec_splat(vsum, 3); |
|
513 |
- vec_ste(vsum, 0, &sum); |
|
514 |
- } |
|
515 |
- return sum; |
|
516 |
-} |
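For readers without the AltiVec vocabulary: the function above computes a sum of absolute transformed differences (SATD); the 8x8 src-dst difference block is pushed through a row/column Walsh-Hadamard transform and the absolute coefficients are summed. A plain-C reference of the same metric (a sketch, not the project's actual C fallback):

    #include <stdint.h>
    #include <stdlib.h>

    /* In-place 8-point Hadamard butterfly (unnormalized). */
    static void wht8(int16_t x[8])
    {
        int i, j, k, a, b;
        for (k = 1; k < 8; k <<= 1)        /* pairs, quads, halves */
            for (i = 0; i < 8; i += k << 1)
                for (j = i; j < i + k; j++) {
                    a = x[j];
                    b = x[j + k];
                    x[j]     = a + b;
                    x[j + k] = a - b;
                }
    }

    static int satd8x8_ref(const uint8_t *dst, const uint8_t *src, int stride)
    {
        int16_t d[8][8], col[8];
        int i, j, sum = 0;

        for (i = 0; i < 8; i++)            /* difference block */
            for (j = 0; j < 8; j++)
                d[i][j] = src[i * stride + j] - dst[i * stride + j];

        for (i = 0; i < 8; i++)            /* transform rows */
            wht8(d[i]);
        for (j = 0; j < 8; j++) {          /* transform columns, accumulate */
            for (i = 0; i < 8; i++)
                col[i] = d[i][j];
            wht8(col);
            for (i = 0; i < 8; i++)
                sum += abs(col[i]);
        }
        return sum;
    }
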
|
517 |
- |
|
518 |
-/* |
|
519 |
- * 16x8 works with 16 elements; it allows to avoid replicating loads, and |
|
520 |
- * gives the compiler more room for scheduling. It's only used from |
|
521 |
- * inside hadamard8_diff16_altivec. |
|
522 |
- * |
|
523 |
- * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has |
|
524 |
- * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in |
|
525 |
- * registers by itself. The following code includes hand-made register |
|
526 |
- * allocation. It's not clean, but on a 7450 the resulting code is much faster |
|
527 |
- * (best case falls from 700+ cycles to 550). |
|
528 |
- * |
|
529 |
- * xlc doesn't add spill code, but it doesn't know how to schedule for the |
|
530 |
- * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses |
|
531 |
- * 25% fewer instructions...) |
|
532 |
- * |
|
533 |
- * On the 970, the hand-made RA is still a win (around 690 vs. around 780), |
|
534 |
- * but xlc goes to around 660 on the regular C code... |
|
535 |
- */ |
|
536 |
-static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
537 |
- uint8_t *src, int stride, int h) |
|
538 |
-{ |
|
539 |
- int sum; |
|
540 |
- register vector signed short |
|
541 |
- temp0 __asm__ ("v0"), |
|
542 |
- temp1 __asm__ ("v1"), |
|
543 |
- temp2 __asm__ ("v2"), |
|
544 |
- temp3 __asm__ ("v3"), |
|
545 |
- temp4 __asm__ ("v4"), |
|
546 |
- temp5 __asm__ ("v5"), |
|
547 |
- temp6 __asm__ ("v6"), |
|
548 |
- temp7 __asm__ ("v7"); |
|
549 |
- register vector signed short |
|
550 |
- temp0S __asm__ ("v8"), |
|
551 |
- temp1S __asm__ ("v9"), |
|
552 |
- temp2S __asm__ ("v10"), |
|
553 |
- temp3S __asm__ ("v11"), |
|
554 |
- temp4S __asm__ ("v12"), |
|
555 |
- temp5S __asm__ ("v13"), |
|
556 |
- temp6S __asm__ ("v14"), |
|
557 |
- temp7S __asm__ ("v15"); |
|
558 |
- register const vector unsigned char vzero __asm__ ("v31") = |
|
559 |
- (const vector unsigned char) vec_splat_u8(0); |
|
560 |
- { |
|
561 |
- register const vector signed short vprod1 __asm__ ("v16") = |
|
562 |
- (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
563 |
- |
|
564 |
- register const vector signed short vprod2 __asm__ ("v17") = |
|
565 |
- (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
566 |
- |
|
567 |
- register const vector signed short vprod3 __asm__ ("v18") = |
|
568 |
- (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
569 |
- |
|
570 |
- register const vector unsigned char perm1 __asm__ ("v19") = |
|
571 |
- (const vector unsigned char) |
|
572 |
- { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
573 |
- 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
574 |
- |
|
575 |
- register const vector unsigned char perm2 __asm__ ("v20") = |
|
576 |
- (const vector unsigned char) |
|
577 |
- { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
578 |
- 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
579 |
- |
|
580 |
- register const vector unsigned char perm3 __asm__ ("v21") = |
|
581 |
- (const vector unsigned char) |
|
582 |
- { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
583 |
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
584 |
- |
|
585 |
-#define ONEITERBUTTERFLY(i, res1, res2) \ |
|
586 |
- { \ |
|
587 |
- register vector unsigned char src1 __asm__ ("v22") = \ |
|
588 |
- vec_ld(stride * i, src); \ |
|
589 |
- register vector unsigned char src2 __asm__ ("v23") = \ |
|
590 |
- vec_ld(stride * i + 16, src); \ |
|
591 |
- register vector unsigned char srcO __asm__ ("v22") = \ |
|
592 |
- vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
593 |
- register vector unsigned char dst1 __asm__ ("v24") = \ |
|
594 |
- vec_ld(stride * i, dst); \ |
|
595 |
- register vector unsigned char dst2 __asm__ ("v25") = \ |
|
596 |
- vec_ld(stride * i + 16, dst); \ |
|
597 |
- register vector unsigned char dstO __asm__ ("v23") = \ |
|
598 |
- vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
599 |
- \ |
|
600 |
- /* Promote the unsigned chars to signed shorts. */ \ |
|
601 |
- register vector signed short srcV __asm__ ("v24") = \ |
|
602 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
603 |
- (vector signed char) srcO); \ |
|
604 |
- register vector signed short dstV __asm__ ("v25") = \ |
|
605 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
606 |
- (vector signed char) dstO); \ |
|
607 |
- register vector signed short srcW __asm__ ("v26") = \ |
|
608 |
- (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
609 |
- (vector signed char) srcO); \ |
|
610 |
- register vector signed short dstW __asm__ ("v27") = \ |
|
611 |
- (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
612 |
- (vector signed char) dstO); \ |
|
613 |
- \ |
|
614 |
- /* subtractions inside the first butterfly */ \ |
|
615 |
- register vector signed short but0 __asm__ ("v28") = \ |
|
616 |
- vec_sub(srcV, dstV); \ |
|
617 |
- register vector signed short but0S __asm__ ("v29") = \ |
|
618 |
- vec_sub(srcW, dstW); \ |
|
619 |
- register vector signed short op1 __asm__ ("v30") = \ |
|
620 |
- vec_perm(but0, but0, perm1); \ |
|
621 |
- register vector signed short but1 __asm__ ("v22") = \ |
|
622 |
- vec_mladd(but0, vprod1, op1); \ |
|
623 |
- register vector signed short op1S __asm__ ("v23") = \ |
|
624 |
- vec_perm(but0S, but0S, perm1); \ |
|
625 |
-        register vector signed short but1S __asm__ ("v24") =           \
-            vec_mladd(but0S, vprod1, op1S);                             \
-        register vector signed short op2 __asm__ ("v25") =              \
-            vec_perm(but1, but1, perm2);                                \
-        register vector signed short but2 __asm__ ("v26") =             \
-            vec_mladd(but1, vprod2, op2);                               \
-        register vector signed short op2S __asm__ ("v27") =             \
-            vec_perm(but1S, but1S, perm2);                              \
-        register vector signed short but2S __asm__ ("v28") =            \
-            vec_mladd(but1S, vprod2, op2S);                             \
-        register vector signed short op3 __asm__ ("v29") =              \
-            vec_perm(but2, but2, perm3);                                \
-        register vector signed short op3S __asm__ ("v30") =             \
-            vec_perm(but2S, but2S, perm3);                              \
-        res1 = vec_mladd(but2, vprod3, op3);                            \
-        res2 = vec_mladd(but2S, vprod3, op3S);                          \
-    }
-        ONEITERBUTTERFLY(0, temp0, temp0S);
-        ONEITERBUTTERFLY(1, temp1, temp1S);
-        ONEITERBUTTERFLY(2, temp2, temp2S);
-        ONEITERBUTTERFLY(3, temp3, temp3S);
-        ONEITERBUTTERFLY(4, temp4, temp4S);
-        ONEITERBUTTERFLY(5, temp5, temp5S);
-        ONEITERBUTTERFLY(6, temp6, temp6S);
-        ONEITERBUTTERFLY(7, temp7, temp7S);
-    }
-#undef ONEITERBUTTERFLY
-    {
-        register vector signed int vsum;
-
-        register vector signed short line0 = vec_add(temp0, temp1);
-        register vector signed short line1 = vec_sub(temp0, temp1);
-        register vector signed short line2 = vec_add(temp2, temp3);
-        register vector signed short line3 = vec_sub(temp2, temp3);
-        register vector signed short line4 = vec_add(temp4, temp5);
-        register vector signed short line5 = vec_sub(temp4, temp5);
-        register vector signed short line6 = vec_add(temp6, temp7);
-        register vector signed short line7 = vec_sub(temp6, temp7);
-
-        register vector signed short line0B = vec_add(line0, line2);
-        register vector signed short line2B = vec_sub(line0, line2);
-        register vector signed short line1B = vec_add(line1, line3);
-        register vector signed short line3B = vec_sub(line1, line3);
-        register vector signed short line4B = vec_add(line4, line6);
-        register vector signed short line6B = vec_sub(line4, line6);
-        register vector signed short line5B = vec_add(line5, line7);
-        register vector signed short line7B = vec_sub(line5, line7);
-
-        register vector signed short line0C = vec_add(line0B, line4B);
-        register vector signed short line4C = vec_sub(line0B, line4B);
-        register vector signed short line1C = vec_add(line1B, line5B);
-        register vector signed short line5C = vec_sub(line1B, line5B);
-        register vector signed short line2C = vec_add(line2B, line6B);
-        register vector signed short line6C = vec_sub(line2B, line6B);
-        register vector signed short line3C = vec_add(line3B, line7B);
-        register vector signed short line7C = vec_sub(line3B, line7B);
-
-        register vector signed short line0S = vec_add(temp0S, temp1S);
-        register vector signed short line1S = vec_sub(temp0S, temp1S);
-        register vector signed short line2S = vec_add(temp2S, temp3S);
-        register vector signed short line3S = vec_sub(temp2S, temp3S);
-        register vector signed short line4S = vec_add(temp4S, temp5S);
-        register vector signed short line5S = vec_sub(temp4S, temp5S);
-        register vector signed short line6S = vec_add(temp6S, temp7S);
-        register vector signed short line7S = vec_sub(temp6S, temp7S);
-
-        register vector signed short line0BS = vec_add(line0S, line2S);
-        register vector signed short line2BS = vec_sub(line0S, line2S);
-        register vector signed short line1BS = vec_add(line1S, line3S);
-        register vector signed short line3BS = vec_sub(line1S, line3S);
-        register vector signed short line4BS = vec_add(line4S, line6S);
-        register vector signed short line6BS = vec_sub(line4S, line6S);
-        register vector signed short line5BS = vec_add(line5S, line7S);
-        register vector signed short line7BS = vec_sub(line5S, line7S);
-
-        register vector signed short line0CS = vec_add(line0BS, line4BS);
-        register vector signed short line4CS = vec_sub(line0BS, line4BS);
-        register vector signed short line1CS = vec_add(line1BS, line5BS);
-        register vector signed short line5CS = vec_sub(line1BS, line5BS);
-        register vector signed short line2CS = vec_add(line2BS, line6BS);
-        register vector signed short line6CS = vec_sub(line2BS, line6BS);
-        register vector signed short line3CS = vec_add(line3BS, line7BS);
-        register vector signed short line7CS = vec_sub(line3BS, line7BS);
-
-        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
-        vsum = vec_sum4s(vec_abs(line1C), vsum);
-        vsum = vec_sum4s(vec_abs(line2C), vsum);
-        vsum = vec_sum4s(vec_abs(line3C), vsum);
-        vsum = vec_sum4s(vec_abs(line4C), vsum);
-        vsum = vec_sum4s(vec_abs(line5C), vsum);
-        vsum = vec_sum4s(vec_abs(line6C), vsum);
-        vsum = vec_sum4s(vec_abs(line7C), vsum);
-
-        vsum = vec_sum4s(vec_abs(line0CS), vsum);
-        vsum = vec_sum4s(vec_abs(line1CS), vsum);
-        vsum = vec_sum4s(vec_abs(line2CS), vsum);
-        vsum = vec_sum4s(vec_abs(line3CS), vsum);
-        vsum = vec_sum4s(vec_abs(line4CS), vsum);
-        vsum = vec_sum4s(vec_abs(line5CS), vsum);
-        vsum = vec_sum4s(vec_abs(line6CS), vsum);
-        vsum = vec_sum4s(vec_abs(line7CS), vsum);
-        vsum = vec_sums(vsum, (vector signed int) vzero);
-        vsum = vec_splat(vsum, 3);
-        vec_ste(vsum, 0, &sum);
-    }
-    return sum;
-}
-
-static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
-                                    uint8_t *src, int stride, int h)
-{
-    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
-
-    if (h == 16) {
-        dst += 8 * stride;
-        src += 8 * stride;
-        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
-    }
-    return score;
-}
-#endif /* HAVE_ALTIVEC */
-
-av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
-{
-#if HAVE_ALTIVEC
-    if (!PPC_ALTIVEC(av_get_cpu_flags()))
-        return;
-
-    c->pix_abs[0][1] = sad16_x2_altivec;
-    c->pix_abs[0][2] = sad16_y2_altivec;
-    c->pix_abs[0][3] = sad16_xy2_altivec;
-    c->pix_abs[0][0] = sad16_altivec;
-    c->pix_abs[1][0] = sad8_altivec;
-
-    c->sad[0] = sad16_altivec;
-    c->sad[1] = sad8_altivec;
-    c->sse[0] = sse16_altivec;
-    c->sse[1] = sse8_altivec;
-
-    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
-    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
-#endif /* HAVE_ALTIVEC */
-}
new file mode 100644
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/me_cmp.h"
+
+#if HAVE_ALTIVEC
+static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                            int line_size, int h)
+{
+    int i, s = 0;
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
+         * pix1v: pix1[0] - pix1[15]
+         * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16] */
+        vector unsigned char pix1v  = vec_ld(0,  pix1);
+        vector unsigned char pix2l  = vec_ld(0,  pix2);
+        vector unsigned char pix2r  = vec_ld(16, pix2);
+        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
+        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+
+        /* Calculate the average vector. */
+        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
+
+        /* Calculate a sum of abs differences vector. */
+        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
+                                          vec_min(pix1v, avgv));
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                            int line_size, int h)
+{
+    int i, s = 0;
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+    vector unsigned char perm = vec_lvsl(0, pix2);
+    vector unsigned char pix1v, pix3v, avgv, t5;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+    uint8_t *pix3 = pix2 + line_size;
+
+    /* Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
+     * pix2 in the next iteration. We can use this fact to avoid a
+     * potentially expensive unaligned read each time around the loop.
+     * Read unaligned pixels into our vectors. The vectors are as follows:
+     * pix2v: pix2[0] - pix2[15] */
+    vector unsigned char pix2l = vec_ld(0,  pix2);
+    vector unsigned char pix2r = vec_ld(15, pix2);
+    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
+
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
+         * pix1v: pix1[0] - pix1[15]
+         * pix3v: pix3[0] - pix3[15] */
+        pix1v = vec_ld(0, pix1);
+
+        pix2l = vec_ld(0,  pix3);
+        pix2r = vec_ld(15, pix3);
+        pix3v = vec_perm(pix2l, pix2r, perm);
+
+        /* Calculate the average vector. */
+        avgv = vec_avg(pix2v, pix3v);
+
+        /* Calculate a sum of abs differences vector. */
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2v = pix3v;
+        pix3 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    return s;
+}
+
+static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             int line_size, int h)
+{
+    int i, s = 0;
+    uint8_t *pix3 = pix2 + line_size;
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+    const vector unsigned short two =
+        (const vector unsigned short) vec_splat_u16(2);
+    vector unsigned char avgv, t5;
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
+    vector unsigned char pix1v, pix3v, pix3iv;
+    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
+    vector unsigned short avghv, avglv;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    /* Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
+     * pix2 in the next iteration. We can use this fact to avoid a
+     * potentially expensive unaligned read, as well as some splitting and
+     * vector addition, each time around the loop.
+     * Read unaligned pixels into our vectors. The vectors are as follows:
+     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
+     * Split the pixel vectors into shorts. */
+    vector unsigned char pix2l  = vec_ld(0,  pix2);
+    vector unsigned char pix2r  = vec_ld(16, pix2);
+    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
+    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+
+    vector unsigned short pix2hv  =
+        (vector unsigned short) vec_mergeh(zero, pix2v);
+    vector unsigned short pix2lv  =
+        (vector unsigned short) vec_mergel(zero, pix2v);
+    vector unsigned short pix2ihv =
+        (vector unsigned short) vec_mergeh(zero, pix2iv);
+    vector unsigned short pix2ilv =
+        (vector unsigned short) vec_mergel(zero, pix2iv);
+    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
+    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
+    vector unsigned short t3, t4;
+
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
+         * pix1v: pix1[0] - pix1[15]
+         * pix3v: pix3[0] - pix3[15]  pix3iv: pix3[1] - pix3[16] */
+        pix1v = vec_ld(0, pix1);
+
+        pix2l  = vec_ld(0,  pix3);
+        pix2r  = vec_ld(16, pix3);
+        pix3v  = vec_perm(pix2l, pix2r, perm1);
+        pix3iv = vec_perm(pix2l, pix2r, perm2);
+
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
+         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
+         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
+         * it should be 1. Instead, we have to split the pixel vectors into
+         * vectors of shorts and do the averaging by hand. */
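+        /* The rounded average of four neighbouring pixels is therefore
+         * computed as (a + b + c + d + 2) >> 2 in 16-bit lanes, where the
+         * intermediate sums cannot overflow: t1/t3 hold the pairwise sums
+         * of the two rows, and "two" supplies both the rounding term and
+         * the shift count. */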
+
+        /* Split the pixel vectors into shorts. */
+        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
+        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
+        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
+        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
+
+        /* Do the averaging on them. */
+        t3 = vec_add(pix3hv, pix3ihv);
+        t4 = vec_add(pix3lv, pix3ilv);
+
+        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
+        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
+
+        /* Pack the shorts back into a result. */
+        avgv = vec_pack(avghv, avglv);
+
+        /* Calculate a sum of abs differences vector. */
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix3 += line_size;
+        /* Transfer the calculated values for pix3 into pix2. */
+        t1 = t3;
+        t2 = t4;
+    }
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         int line_size, int h)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned char perm = vec_lvsl(0, pix2);
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    for (i = 0; i < h; i++) {
+        /* Read potentially unaligned pixels into t1 and t2. */
+        vector unsigned char pix2l = vec_ld(0,  pix2);
+        vector unsigned char pix2r = vec_ld(15, pix2);
+        vector unsigned char t1 = vec_ld(0, pix1);
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+
+        /* Calculate a sum of abs differences vector. */
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        int line_size, int h)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    const vector unsigned char permclear =
+        (vector unsigned char)
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    for (i = 0; i < h; i++) {
+        /* Read potentially unaligned pixels into t1 and t2.
+         * Since we're reading 16 pixels, and actually only want 8,
+         * mask out the last 8 pixels. The 0s don't change the sum. */
+        vector unsigned char pix1l = vec_ld(0, pix1);
+        vector unsigned char pix1r = vec_ld(7, pix1);
+        vector unsigned char pix2l = vec_ld(0, pix2);
+        vector unsigned char pix2r = vec_ld(7, pix2);
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
+                                          permclear);
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
+                                          permclear);
+
+        /* Calculate a sum of abs differences vector. */
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
+ * It's the sad8_altivec code above with squaring added. */
+static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        int line_size, int h)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    const vector unsigned char permclear =
+        (vector unsigned char)
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumsqr;
+
+    for (i = 0; i < h; i++) {
+        /* Read potentially unaligned pixels into t1 and t2.
+         * Since we're reading 16 pixels, and actually only want 8,
+         * mask out the last 8 pixels. The 0s don't change the sum. */
+        vector unsigned char pix1l = vec_ld(0, pix1);
+        vector unsigned char pix1r = vec_ld(7, pix1);
+        vector unsigned char pix2l = vec_ld(0, pix2);
+        vector unsigned char pix2r = vec_ld(7, pix2);
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
+                                          permclear);
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
+                                          permclear);
+
+        /* Since we want to use unsigned chars, we can take advantage
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
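+        /* The squaring is done with vec_msum(t5, t5, sum) below: each byte
+         * difference is multiplied by itself and the products are
+         * accumulated, four at a time, into the 32-bit lanes of sum. */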
+
+        /* Calculate abs differences vector. */
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
+
+        /* Square the values and add them to our sum. */
+        sum = vec_msum(t5, t5, sum);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
+    sumsqr = vec_splat(sumsqr, 3);
+    vec_ste(sumsqr, 0, &s);
+
+    return s;
+}
+
+/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
+ * It's the sad16_altivec code above with squaring added. */
+static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         int line_size, int h)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned char perm = vec_lvsl(0, pix2);
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumsqr;
+
+    for (i = 0; i < h; i++) {
+        /* Read potentially unaligned pixels into t1 and t2. */
+        vector unsigned char pix2l = vec_ld(0,  pix2);
+        vector unsigned char pix2r = vec_ld(15, pix2);
+        vector unsigned char t1 = vec_ld(0, pix1);
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+
+        /* Since we want to use unsigned chars, we can take advantage
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
+
+        /* Calculate abs differences vector. */
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
+
+        /* Square the values and add them to our sum. */
+        sum = vec_msum(t5, t5, sum);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
+    sumsqr = vec_splat(sumsqr, 3);
+    vec_ste(sumsqr, 0, &s);
+
+    return s;
+}
+
+static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
+                                     uint8_t *src, int stride, int h)
+{
+    int sum;
+    register const vector unsigned char vzero =
+        (const vector unsigned char) vec_splat_u8(0);
+    register vector signed short temp0, temp1, temp2, temp3, temp4,
+                                 temp5, temp6, temp7;
+    {
+        register const vector signed short vprod1 =
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
+        register const vector signed short vprod2 =
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
+        register const vector signed short vprod3 =
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
+        register const vector unsigned char perm1 =
+            (const vector unsigned char)
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
+        register const vector unsigned char perm2 =
+            (const vector unsigned char)
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
+        register const vector unsigned char perm3 =
+            (const vector unsigned char)
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+
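+        /* Each ONEITERBUTTERFLY pass pairs a vec_perm that swaps elements
+         * at distance 1, 2 or 4 with a vec_mladd by a +/-1 sign vector, so
+         * one pass yields the sums and differences of one butterfly stage
+         * of the 8-point Hadamard transform. */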
+#define ONEITERBUTTERFLY(i, res)                                          \
+    {                                                                     \
+        register vector unsigned char src1 = vec_ld(stride * i, src);     \
+        register vector unsigned char src2 = vec_ld(stride * i + 15, src); \
+        register vector unsigned char srcO =                              \
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));              \
+        register vector unsigned char dst1 = vec_ld(stride * i, dst);     \
+        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \
+        register vector unsigned char dstO =                              \
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));              \
+                                                                          \
+        /* Promote the unsigned chars to signed shorts. */                \
+        /* We're in the 8x8 function, we only care for the first 8. */    \
+        register vector signed short srcV =                               \
+            (vector signed short) vec_mergeh((vector signed char) vzero,  \
+                                             (vector signed char) srcO);  \
+        register vector signed short dstV =                               \
+            (vector signed short) vec_mergeh((vector signed char) vzero,  \
+                                             (vector signed char) dstO);  \
+                                                                          \
+        /* subtractions inside the first butterfly */                     \
+        register vector signed short but0 = vec_sub(srcV, dstV);          \
+        register vector signed short op1  = vec_perm(but0, but0, perm1);  \
+        register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
+        register vector signed short op2  = vec_perm(but1, but1, perm2);  \
+        register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
+        register vector signed short op3  = vec_perm(but2, but2, perm3);  \
+        res = vec_mladd(but2, vprod3, op3);                               \
+    }
+        ONEITERBUTTERFLY(0, temp0);
+        ONEITERBUTTERFLY(1, temp1);
+        ONEITERBUTTERFLY(2, temp2);
+        ONEITERBUTTERFLY(3, temp3);
+        ONEITERBUTTERFLY(4, temp4);
+        ONEITERBUTTERFLY(5, temp5);
+        ONEITERBUTTERFLY(6, temp6);
+        ONEITERBUTTERFLY(7, temp7);
+    }
+#undef ONEITERBUTTERFLY
+    {
+        register vector signed int vsum;
+        register vector signed short line0 = vec_add(temp0, temp1);
+        register vector signed short line1 = vec_sub(temp0, temp1);
+        register vector signed short line2 = vec_add(temp2, temp3);
+        register vector signed short line3 = vec_sub(temp2, temp3);
+        register vector signed short line4 = vec_add(temp4, temp5);
+        register vector signed short line5 = vec_sub(temp4, temp5);
+        register vector signed short line6 = vec_add(temp6, temp7);
+        register vector signed short line7 = vec_sub(temp6, temp7);
+
+        register vector signed short line0B = vec_add(line0, line2);
+        register vector signed short line2B = vec_sub(line0, line2);
+        register vector signed short line1B = vec_add(line1, line3);
+        register vector signed short line3B = vec_sub(line1, line3);
+        register vector signed short line4B = vec_add(line4, line6);
+        register vector signed short line6B = vec_sub(line4, line6);
+        register vector signed short line5B = vec_add(line5, line7);
+        register vector signed short line7B = vec_sub(line5, line7);
+
+        register vector signed short line0C = vec_add(line0B, line4B);
+        register vector signed short line4C = vec_sub(line0B, line4B);
+        register vector signed short line1C = vec_add(line1B, line5B);
+        register vector signed short line5C = vec_sub(line1B, line5B);
+        register vector signed short line2C = vec_add(line2B, line6B);
+        register vector signed short line6C = vec_sub(line2B, line6B);
+        register vector signed short line3C = vec_add(line3B, line7B);
+        register vector signed short line7C = vec_sub(line3B, line7B);
+
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
+        vsum = vec_sums(vsum, (vector signed int) vzero);
+        vsum = vec_splat(vsum, 3);
+        vec_ste(vsum, 0, &sum);
+    }
+    return sum;
+}
+
+/*
+ * 16x8 works with 16 elements; it avoids replicating loads and gives the
+ * compiler more room for scheduling. It is only used from inside
+ * hadamard8_diff16_altivec.
+ *
+ * Unfortunately, gcc-3.3 generates a LOT of spill code; unlike xlc, it
+ * seemingly cannot keep everything in registers by itself. The following
+ * code includes hand-made register allocation. It is not clean, but on a
+ * 7450 the resulting code is much faster (the best case falls from 700+
+ * cycles to 550).
+ *
+ * xlc doesn't add spill code, but it doesn't know how to schedule for the
+ * 7450, and its code isn't much faster than gcc-3.3's on the 7450 (though
+ * it uses 25% fewer instructions...)
+ *
+ * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
+ * but xlc gets to around 660 on the regular C code...
+ */
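+
+/* Register layout of the hand-made allocation below: temp0-temp7 live in
+ * v0-v7, temp0S-temp7S in v8-v15, the constants sit in v16-v21, the macro
+ * temporaries cycle through v22-v30, and vzero is pinned to v31. */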
+static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
+                                      uint8_t *src, int stride, int h)
+{
+    int sum;
+    register vector signed short
+        temp0 __asm__ ("v0"),
+        temp1 __asm__ ("v1"),
+        temp2 __asm__ ("v2"),
+        temp3 __asm__ ("v3"),
+        temp4 __asm__ ("v4"),
+        temp5 __asm__ ("v5"),
+        temp6 __asm__ ("v6"),
+        temp7 __asm__ ("v7");
+    register vector signed short
+        temp0S __asm__ ("v8"),
+        temp1S __asm__ ("v9"),
+        temp2S __asm__ ("v10"),
+        temp3S __asm__ ("v11"),
+        temp4S __asm__ ("v12"),
+        temp5S __asm__ ("v13"),
+        temp6S __asm__ ("v14"),
+        temp7S __asm__ ("v15");
+    register const vector unsigned char vzero __asm__ ("v31") =
+        (const vector unsigned char) vec_splat_u8(0);
+    {
+        register const vector signed short vprod1 __asm__ ("v16") =
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
+
+        register const vector signed short vprod2 __asm__ ("v17") =
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
+
+        register const vector signed short vprod3 __asm__ ("v18") =
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
+
+        register const vector unsigned char perm1 __asm__ ("v19") =
+            (const vector unsigned char)
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
+
+        register const vector unsigned char perm2 __asm__ ("v20") =
+            (const vector unsigned char)
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
+
+        register const vector unsigned char perm3 __asm__ ("v21") =
+            (const vector unsigned char)
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+
+#define ONEITERBUTTERFLY(i, res1, res2)                                  \
+    {                                                                    \
+        register vector unsigned char src1 __asm__ ("v22") =             \
+            vec_ld(stride * i, src);                                     \
+        register vector unsigned char src2 __asm__ ("v23") =             \
+            vec_ld(stride * i + 16, src);                                \
+        register vector unsigned char srcO __asm__ ("v22") =             \
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));             \
+        register vector unsigned char dst1 __asm__ ("v24") =             \
+            vec_ld(stride * i, dst);                                     \
+        register vector unsigned char dst2 __asm__ ("v25") =             \
+            vec_ld(stride * i + 16, dst);                                \
+        register vector unsigned char dstO __asm__ ("v23") =             \
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
+                                                                         \
+        /* Promote the unsigned chars to signed shorts. */               \
+        register vector signed short srcV __asm__ ("v24") =              \
+            (vector signed short) vec_mergeh((vector signed char) vzero, \
+                                             (vector signed char) srcO); \
+        register vector signed short dstV __asm__ ("v25") =              \
+            (vector signed short) vec_mergeh((vector signed char) vzero, \
+                                             (vector signed char) dstO); \
+        register vector signed short srcW __asm__ ("v26") =              \
+            (vector signed short) vec_mergel((vector signed char) vzero, \
+                                             (vector signed char) srcO); \
+        register vector signed short dstW __asm__ ("v27") =              \
+            (vector signed short) vec_mergel((vector signed char) vzero, \
+                                             (vector signed char) dstO); \
+                                                                         \
+        /* subtractions inside the first butterfly */                    \
+        register vector signed short but0 __asm__ ("v28") =              \
+            vec_sub(srcV, dstV);                                         \
+        register vector signed short but0S __asm__ ("v29") =             \
+            vec_sub(srcW, dstW);                                         \
+        register vector signed short op1 __asm__ ("v30") =               \
+            vec_perm(but0, but0, perm1);                                 \
+        register vector signed short but1 __asm__ ("v22") =              \
+            vec_mladd(but0, vprod1, op1);                                \
+        register vector signed short op1S __asm__ ("v23") =              \
+            vec_perm(but0S, but0S, perm1);                               \
+        register vector signed short but1S __asm__ ("v24") =             \
+            vec_mladd(but0S, vprod1, op1S);                              \
+        register vector signed short op2 __asm__ ("v25") =               \
+            vec_perm(but1, but1, perm2);                                 \
+        register vector signed short but2 __asm__ ("v26") =              \
+            vec_mladd(but1, vprod2, op2);                                \
+        register vector signed short op2S __asm__ ("v27") =              \
+            vec_perm(but1S, but1S, perm2);                               \
+        register vector signed short but2S __asm__ ("v28") =             \
+            vec_mladd(but1S, vprod2, op2S);                              \
+        register vector signed short op3 __asm__ ("v29") =               \
+            vec_perm(but2, but2, perm3);                                 \
+        register vector signed short op3S __asm__ ("v30") =              \
+            vec_perm(but2S, but2S, perm3);                               \
+        res1 = vec_mladd(but2, vprod3, op3);                             \
+        res2 = vec_mladd(but2S, vprod3, op3S);                           \
+    }
+        ONEITERBUTTERFLY(0, temp0, temp0S);
+        ONEITERBUTTERFLY(1, temp1, temp1S);
+        ONEITERBUTTERFLY(2, temp2, temp2S);
+        ONEITERBUTTERFLY(3, temp3, temp3S);
+        ONEITERBUTTERFLY(4, temp4, temp4S);
+        ONEITERBUTTERFLY(5, temp5, temp5S);
+        ONEITERBUTTERFLY(6, temp6, temp6S);
+        ONEITERBUTTERFLY(7, temp7, temp7S);
+    }
+#undef ONEITERBUTTERFLY
+    {
+        register vector signed int vsum;
+
+        register vector signed short line0 = vec_add(temp0, temp1);
+        register vector signed short line1 = vec_sub(temp0, temp1);
+        register vector signed short line2 = vec_add(temp2, temp3);
+        register vector signed short line3 = vec_sub(temp2, temp3);
+        register vector signed short line4 = vec_add(temp4, temp5);
+        register vector signed short line5 = vec_sub(temp4, temp5);
+        register vector signed short line6 = vec_add(temp6, temp7);
+        register vector signed short line7 = vec_sub(temp6, temp7);
+
+        register vector signed short line0B = vec_add(line0, line2);
+        register vector signed short line2B = vec_sub(line0, line2);
+        register vector signed short line1B = vec_add(line1, line3);
+        register vector signed short line3B = vec_sub(line1, line3);
+        register vector signed short line4B = vec_add(line4, line6);
+        register vector signed short line6B = vec_sub(line4, line6);
+        register vector signed short line5B = vec_add(line5, line7);
+        register vector signed short line7B = vec_sub(line5, line7);
+
+        register vector signed short line0C = vec_add(line0B, line4B);
+        register vector signed short line4C = vec_sub(line0B, line4B);
+        register vector signed short line1C = vec_add(line1B, line5B);
+        register vector signed short line5C = vec_sub(line1B, line5B);
+        register vector signed short line2C = vec_add(line2B, line6B);
+        register vector signed short line6C = vec_sub(line2B, line6B);
+        register vector signed short line3C = vec_add(line3B, line7B);
+        register vector signed short line7C = vec_sub(line3B, line7B);
+
+        register vector signed short line0S = vec_add(temp0S, temp1S);
+        register vector signed short line1S = vec_sub(temp0S, temp1S);
+        register vector signed short line2S = vec_add(temp2S, temp3S);
+        register vector signed short line3S = vec_sub(temp2S, temp3S);
+        register vector signed short line4S = vec_add(temp4S, temp5S);
+        register vector signed short line5S = vec_sub(temp4S, temp5S);
+        register vector signed short line6S = vec_add(temp6S, temp7S);
+        register vector signed short line7S = vec_sub(temp6S, temp7S);
+
+        register vector signed short line0BS = vec_add(line0S, line2S);
+        register vector signed short line2BS = vec_sub(line0S, line2S);
+        register vector signed short line1BS = vec_add(line1S, line3S);
+        register vector signed short line3BS = vec_sub(line1S, line3S);
+        register vector signed short line4BS = vec_add(line4S, line6S);
+        register vector signed short line6BS = vec_sub(line4S, line6S);
+        register vector signed short line5BS = vec_add(line5S, line7S);
+        register vector signed short line7BS = vec_sub(line5S, line7S);
+
+        register vector signed short line0CS = vec_add(line0BS, line4BS);
+        register vector signed short line4CS = vec_sub(line0BS, line4BS);
+        register vector signed short line1CS = vec_add(line1BS, line5BS);
+        register vector signed short line5CS = vec_sub(line1BS, line5BS);
+        register vector signed short line2CS = vec_add(line2BS, line6BS);
+        register vector signed short line6CS = vec_sub(line2BS, line6BS);
+        register vector signed short line3CS = vec_add(line3BS, line7BS);
+        register vector signed short line7CS = vec_sub(line3BS, line7BS);
+
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
+
+        vsum = vec_sum4s(vec_abs(line0CS), vsum);
+        vsum = vec_sum4s(vec_abs(line1CS), vsum);
+        vsum = vec_sum4s(vec_abs(line2CS), vsum);
+        vsum = vec_sum4s(vec_abs(line3CS), vsum);
+        vsum = vec_sum4s(vec_abs(line4CS), vsum);
+        vsum = vec_sum4s(vec_abs(line5CS), vsum);
+        vsum = vec_sum4s(vec_abs(line6CS), vsum);
+        vsum = vec_sum4s(vec_abs(line7CS), vsum);
+        vsum = vec_sums(vsum, (vector signed int) vzero);
+        vsum = vec_splat(vsum, 3);
+        vec_ste(vsum, 0, &sum);
+    }
+    return sum;
+}
+
+static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
+                                    uint8_t *src, int stride, int h)
+{
+    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+
+    if (h == 16) {
+        dst += 8 * stride;
+        src += 8 * stride;
+        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+    }
+    return score;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
+{
+#if HAVE_ALTIVEC
+    if (!PPC_ALTIVEC(av_get_cpu_flags()))
+        return;
+
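+    /* pix_abs[0][] holds the 16-pixel-wide comparisons and pix_abs[1][]
+     * the 8-pixel-wide ones; the second index selects the full-pel,
+     * half-pel x, half-pel y and half-pel xy variants, respectively. */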
+    c->pix_abs[0][1] = sad16_x2_altivec;
+    c->pix_abs[0][2] = sad16_y2_altivec;
+    c->pix_abs[0][3] = sad16_xy2_altivec;
+    c->pix_abs[0][0] = sad16_altivec;
+    c->pix_abs[1][0] = sad8_altivec;
+
+    c->sad[0] = sad16_altivec;
+    c->sad[1] = sad8_altivec;
+    c->sse[0] = sse16_altivec;
+    c->sse[1] = sse8_altivec;
+
+    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
+    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
+#endif /* HAVE_ALTIVEC */
+}
@@ -27,8 +27,8 @@
  */
 
 #include "avcodec.h"
-#include "dsputil.h"
 #include "hpeldsp.h"
+#include "me_cmp.h"
 #include "mpegvideo.h"
 #include "h263.h"
 #include "internal.h"
@@ -306,7 +306,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
     s->m.current_picture.motion_val[0]   = s->motion_val8[plane] + 2;
     s->m.p_mv_table                      = s->motion_val16[plane] +
                                            s->m.mb_stride + 1;
-    s->m.dsp                             = s->dsp; // move
+    s->m.mecc                            = s->mecc; // move
     ff_init_me(&s->m);
 
     s->m.me.dia_size = s->avctx->dia_size;
@@ -431,8 +431,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
             best = score[1] <= score[0];
 
             vlc = ff_svq1_block_type_vlc[SVQ1_BLOCK_SKIP];
-            score[2] = s->dsp.sse[0](NULL, src + 16 * x, ref,
-                                     stride, 16);
+            score[2] = s->mecc.sse[0](NULL, src + 16 * x, ref,
+                                      stride, 16);
             score[2] += vlc[1] * lambda;
             if (score[2] < score[best] && mx == 0 && my == 0) {
                 best = 2;
@@ -509,8 +509,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
     SVQ1EncContext *const s = avctx->priv_data;
     int ret;
 
-    ff_dsputil_init(&s->dsp, avctx);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
+    ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
 
     avctx->coded_frame = av_frame_alloc();
@@ -25,9 +25,9 @@
 
 #include "libavutil/frame.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "get_bits.h"
 #include "hpeldsp.h"
+#include "me_cmp.h"
 #include "mpegvideo.h"
 #include "put_bits.h"
 
@@ -37,7 +37,7 @@ typedef struct SVQ1EncContext {
      * of MpegEncContext, so this will be removed then. */
     MpegEncContext m;
    AVCodecContext *avctx;
-    DSPContext dsp;
+    MECmpContext mecc;
    HpelDSPContext hdsp;
    AVFrame *current_picture;
    AVFrame *last_picture;
@@ -39,8 +39,8 @@
 #include "libavutil/samplefmt.h"
 #include "libavutil/dict.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "libavutil/opt.h"
+#include "me_cmp.h"
 #include "mpegvideo.h"
 #include "thread.h"
 #include "internal.h"
@@ -100,8 +100,8 @@ static av_cold void avcodec_init(void)
         return;
     initialized = 1;
 
-    if (CONFIG_DSPUTIL)
-        ff_dsputil_static_init();
+    if (CONFIG_ME_CMP)
+        ff_me_cmp_init_static();
 }
 
 int av_codec_is_encoder(const AVCodec *codec)
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
 OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
@@ -19,6 +18,7 @@ OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
+OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o \
                                           x86/mpegvideodsp.o
@@ -70,7 +70,6 @@ YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o \
@@ -90,6 +89,7 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
 YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
deleted file mode 100644 |
... | ... |
@@ -1,1321 +0,0 @@ |
1 |
-/* |
|
2 |
- * MMX optimized DSP utils |
|
3 |
- * Copyright (c) 2000, 2001 Fabrice Bellard |
|
4 |
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
- * |
|
6 |
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
7 |
- * |
|
8 |
- * This file is part of Libav. |
|
9 |
- * |
|
10 |
- * Libav is free software; you can redistribute it and/or |
|
11 |
- * modify it under the terms of the GNU Lesser General Public |
|
12 |
- * License as published by the Free Software Foundation; either |
|
13 |
- * version 2.1 of the License, or (at your option) any later version. |
|
14 |
- * |
|
15 |
- * Libav is distributed in the hope that it will be useful, |
|
16 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
17 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
18 |
- * Lesser General Public License for more details. |
|
19 |
- * |
|
20 |
- * You should have received a copy of the GNU Lesser General Public |
|
21 |
- * License along with Libav; if not, write to the Free Software |
|
22 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
23 |
- */ |
|
24 |
- |
|
25 |
-#include "libavutil/attributes.h" |
|
26 |
-#include "libavutil/cpu.h" |
|
27 |
-#include "libavutil/x86/asm.h" |
|
28 |
-#include "libavutil/x86/cpu.h" |
|
29 |
-#include "libavcodec/dsputil.h" |
|
30 |
-#include "libavcodec/mpegvideo.h" |
|
31 |
- |
|
32 |
-#if HAVE_INLINE_ASM |
|
33 |
- |
|
34 |
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
35 |
- int line_size, int h) |
|
36 |
-{ |
|
37 |
- int tmp; |
|
38 |
- |
|
39 |
- __asm__ volatile ( |
|
40 |
- "movl %4, %%ecx \n" |
|
41 |
- "shr $1, %%ecx \n" |
|
42 |
- "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ |
|
43 |
- "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ |
|
44 |
- "1: \n" |
|
45 |
- "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ |
|
46 |
- "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ |
|
47 |
- "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ |
|
48 |
- "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ |
|
49 |
- |
|
50 |
- /* todo: mm1-mm2, mm3-mm4 */ |
|
51 |
- /* algo: subtract mm1 from mm2 with saturation and vice versa */ |
|
52 |
- /* OR the results to get absolute difference */ |
|
53 |
- "movq %%mm1, %%mm5 \n" |
|
54 |
- "movq %%mm3, %%mm6 \n" |
|
55 |
- "psubusb %%mm2, %%mm1 \n" |
|
56 |
- "psubusb %%mm4, %%mm3 \n" |
|
57 |
- "psubusb %%mm5, %%mm2 \n" |
|
58 |
- "psubusb %%mm6, %%mm4 \n" |
|
59 |
- |
|
60 |
- "por %%mm1, %%mm2 \n" |
|
61 |
- "por %%mm3, %%mm4 \n" |
|
62 |
- |
|
63 |
- /* now convert to 16-bit vectors so we can square them */ |
|
64 |
- "movq %%mm2, %%mm1 \n" |
|
65 |
- "movq %%mm4, %%mm3 \n" |
|
66 |
- |
|
67 |
- "punpckhbw %%mm0, %%mm2 \n" |
|
68 |
- "punpckhbw %%mm0, %%mm4 \n" |
|
69 |
- "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ |
|
70 |
- "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ |
|
71 |
- |
|
72 |
- "pmaddwd %%mm2, %%mm2 \n" |
|
73 |
- "pmaddwd %%mm4, %%mm4 \n" |
|
74 |
- "pmaddwd %%mm1, %%mm1 \n" |
|
75 |
- "pmaddwd %%mm3, %%mm3 \n" |
|
76 |
- |
|
77 |
- "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */ |
|
78 |
- "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */ |
|
79 |
- |
|
80 |
- "paddd %%mm2, %%mm1 \n" |
|
81 |
- "paddd %%mm4, %%mm3 \n" |
|
82 |
- "paddd %%mm1, %%mm7 \n" |
|
83 |
- "paddd %%mm3, %%mm7 \n" |
|
84 |
- |
|
85 |
- "decl %%ecx \n" |
|
86 |
- "jnz 1b \n" |
|
87 |
- |
|
88 |
- "movq %%mm7, %%mm1 \n" |
|
89 |
- "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ |
|
90 |
- "paddd %%mm7, %%mm1 \n" |
|
91 |
- "movd %%mm1, %2 \n" |
|
92 |
- : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
93 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
94 |
- : "%ecx"); |
|
95 |
- |
|
96 |
- return tmp; |
|
97 |
-} |
|
98 |
- |
|
99 |
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
100 |
- int line_size, int h) |
|
101 |
-{ |
|
102 |
- int tmp; |
|
103 |
- |
|
104 |
- __asm__ volatile ( |
|
105 |
- "movl %4, %%ecx\n" |
|
106 |
- "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ |
|
107 |
- "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ |
|
108 |
- "1:\n" |
|
109 |
- "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ |
|
110 |
- "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ |
|
111 |
- "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ |
|
112 |
- "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ |
|
113 |
- |
|
114 |
- /* todo: mm1-mm2, mm3-mm4 */ |
|
115 |
- /* algo: subtract mm1 from mm2 with saturation and vice versa */ |
|
116 |
- /* OR the results to get absolute difference */ |
|
117 |
- "movq %%mm1, %%mm5\n" |
|
118 |
- "movq %%mm3, %%mm6\n" |
|
119 |
- "psubusb %%mm2, %%mm1\n" |
|
120 |
- "psubusb %%mm4, %%mm3\n" |
|
121 |
- "psubusb %%mm5, %%mm2\n" |
|
122 |
- "psubusb %%mm6, %%mm4\n" |
|
123 |
- |
|
124 |
- "por %%mm1, %%mm2\n" |
|
125 |
- "por %%mm3, %%mm4\n" |
|
126 |
- |
|
127 |
- /* now convert to 16-bit vectors so we can square them */ |
|
128 |
- "movq %%mm2, %%mm1\n" |
|
129 |
- "movq %%mm4, %%mm3\n" |
|
130 |
- |
|
131 |
- "punpckhbw %%mm0, %%mm2\n" |
|
132 |
- "punpckhbw %%mm0, %%mm4\n" |
|
133 |
- "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ |
|
134 |
- "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ |
|
135 |
- |
|
136 |
- "pmaddwd %%mm2, %%mm2\n" |
|
137 |
- "pmaddwd %%mm4, %%mm4\n" |
|
138 |
- "pmaddwd %%mm1, %%mm1\n" |
|
139 |
- "pmaddwd %%mm3, %%mm3\n" |
|
140 |
- |
|
141 |
- "add %3, %0\n" |
|
142 |
- "add %3, %1\n" |
|
143 |
- |
|
144 |
- "paddd %%mm2, %%mm1\n" |
|
145 |
- "paddd %%mm4, %%mm3\n" |
|
146 |
- "paddd %%mm1, %%mm7\n" |
|
147 |
- "paddd %%mm3, %%mm7\n" |
|
148 |
- |
|
149 |
- "decl %%ecx\n" |
|
150 |
- "jnz 1b\n" |
|
151 |
- |
|
152 |
- "movq %%mm7, %%mm1\n" |
|
153 |
- "psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
|
154 |
- "paddd %%mm7, %%mm1\n" |
|
155 |
- "movd %%mm1, %2\n" |
|
156 |
- : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
157 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
158 |
- : "%ecx"); |
|
159 |
- |
|
160 |
- return tmp; |
|
161 |
-} |
|
162 |
- |
|
163 |
-static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) |
|
164 |
-{ |
|
165 |
- int tmp; |
|
166 |
- |
|
167 |
- __asm__ volatile ( |
|
168 |
- "movl %3, %%ecx\n" |
|
169 |
- "pxor %%mm7, %%mm7\n" |
|
170 |
- "pxor %%mm6, %%mm6\n" |
|
171 |
- |
|
172 |
- "movq (%0), %%mm0\n" |
|
173 |
- "movq %%mm0, %%mm1\n" |
|
174 |
- "psllq $8, %%mm0\n" |
|
175 |
- "psrlq $8, %%mm1\n" |
|
176 |
- "psrlq $8, %%mm0\n" |
|
177 |
- "movq %%mm0, %%mm2\n" |
|
178 |
- "movq %%mm1, %%mm3\n" |
|
179 |
- "punpcklbw %%mm7, %%mm0\n" |
|
180 |
- "punpcklbw %%mm7, %%mm1\n" |
|
181 |
- "punpckhbw %%mm7, %%mm2\n" |
|
182 |
- "punpckhbw %%mm7, %%mm3\n" |
|
183 |
- "psubw %%mm1, %%mm0\n" |
|
184 |
- "psubw %%mm3, %%mm2\n" |
|
185 |
- |
|
186 |
- "add %2, %0\n" |
|
187 |
- |
|
188 |
- "movq (%0), %%mm4\n" |
|
189 |
- "movq %%mm4, %%mm1\n" |
|
190 |
- "psllq $8, %%mm4\n" |
|
191 |
- "psrlq $8, %%mm1\n" |
|
192 |
- "psrlq $8, %%mm4\n" |
|
193 |
- "movq %%mm4, %%mm5\n" |
|
194 |
- "movq %%mm1, %%mm3\n" |
|
195 |
- "punpcklbw %%mm7, %%mm4\n" |
|
196 |
- "punpcklbw %%mm7, %%mm1\n" |
|
197 |
- "punpckhbw %%mm7, %%mm5\n" |
|
198 |
- "punpckhbw %%mm7, %%mm3\n" |
|
199 |
- "psubw %%mm1, %%mm4\n" |
|
200 |
- "psubw %%mm3, %%mm5\n" |
|
201 |
- "psubw %%mm4, %%mm0\n" |
|
202 |
- "psubw %%mm5, %%mm2\n" |
|
203 |
- "pxor %%mm3, %%mm3\n" |
|
204 |
- "pxor %%mm1, %%mm1\n" |
|
205 |
- "pcmpgtw %%mm0, %%mm3\n\t" |
|
206 |
- "pcmpgtw %%mm2, %%mm1\n\t" |
|
207 |
- "pxor %%mm3, %%mm0\n" |
|
208 |
- "pxor %%mm1, %%mm2\n" |
|
209 |
- "psubw %%mm3, %%mm0\n" |
|
210 |
- "psubw %%mm1, %%mm2\n" |
|
211 |
- "paddw %%mm0, %%mm2\n" |
|
212 |
- "paddw %%mm2, %%mm6\n" |
|
213 |
- |
|
214 |
- "add %2, %0\n" |
|
215 |
- "1:\n" |
|
216 |
- |
|
217 |
- "movq (%0), %%mm0\n" |
|
218 |
- "movq %%mm0, %%mm1\n" |
|
219 |
- "psllq $8, %%mm0\n" |
|
220 |
- "psrlq $8, %%mm1\n" |
|
221 |
- "psrlq $8, %%mm0\n" |
|
222 |
- "movq %%mm0, %%mm2\n" |
|
223 |
- "movq %%mm1, %%mm3\n" |
|
224 |
- "punpcklbw %%mm7, %%mm0\n" |
|
225 |
- "punpcklbw %%mm7, %%mm1\n" |
|
226 |
- "punpckhbw %%mm7, %%mm2\n" |
|
227 |
- "punpckhbw %%mm7, %%mm3\n" |
|
228 |
- "psubw %%mm1, %%mm0\n" |
|
229 |
- "psubw %%mm3, %%mm2\n" |
|
230 |
- "psubw %%mm0, %%mm4\n" |
|
231 |
- "psubw %%mm2, %%mm5\n" |
|
232 |
- "pxor %%mm3, %%mm3\n" |
|
233 |
- "pxor %%mm1, %%mm1\n" |
|
234 |
- "pcmpgtw %%mm4, %%mm3\n\t" |
|
235 |
- "pcmpgtw %%mm5, %%mm1\n\t" |
|
236 |
- "pxor %%mm3, %%mm4\n" |
|
237 |
- "pxor %%mm1, %%mm5\n" |
|
238 |
- "psubw %%mm3, %%mm4\n" |
|
239 |
- "psubw %%mm1, %%mm5\n" |
|
240 |
- "paddw %%mm4, %%mm5\n" |
|
241 |
- "paddw %%mm5, %%mm6\n" |
|
242 |
- |
|
243 |
- "add %2, %0\n" |
|
244 |
- |
|
245 |
- "movq (%0), %%mm4\n" |
|
246 |
- "movq %%mm4, %%mm1\n" |
|
247 |
- "psllq $8, %%mm4\n" |
|
248 |
- "psrlq $8, %%mm1\n" |
|
249 |
- "psrlq $8, %%mm4\n" |
|
250 |
- "movq %%mm4, %%mm5\n" |
|
251 |
- "movq %%mm1, %%mm3\n" |
|
252 |
- "punpcklbw %%mm7, %%mm4\n" |
|
253 |
- "punpcklbw %%mm7, %%mm1\n" |
|
254 |
- "punpckhbw %%mm7, %%mm5\n" |
|
255 |
- "punpckhbw %%mm7, %%mm3\n" |
|
256 |
- "psubw %%mm1, %%mm4\n" |
|
257 |
- "psubw %%mm3, %%mm5\n" |
|
258 |
- "psubw %%mm4, %%mm0\n" |
|
259 |
- "psubw %%mm5, %%mm2\n" |
|
260 |
- "pxor %%mm3, %%mm3\n" |
|
261 |
- "pxor %%mm1, %%mm1\n" |
|
262 |
- "pcmpgtw %%mm0, %%mm3\n\t" |
|
263 |
- "pcmpgtw %%mm2, %%mm1\n\t" |
|
264 |
- "pxor %%mm3, %%mm0\n" |
|
265 |
- "pxor %%mm1, %%mm2\n" |
|
266 |
- "psubw %%mm3, %%mm0\n" |
|
267 |
- "psubw %%mm1, %%mm2\n" |
|
268 |
- "paddw %%mm0, %%mm2\n" |
|
269 |
- "paddw %%mm2, %%mm6\n" |
|
270 |
- |
|
271 |
- "add %2, %0\n" |
|
272 |
- "subl $2, %%ecx\n" |
|
273 |
- " jnz 1b\n" |
|
274 |
- |
|
275 |
- "movq %%mm6, %%mm0\n" |
|
276 |
- "punpcklwd %%mm7, %%mm0\n" |
|
277 |
- "punpckhwd %%mm7, %%mm6\n" |
|
278 |
- "paddd %%mm0, %%mm6\n" |
|
279 |
- |
|
280 |
- "movq %%mm6, %%mm0\n" |
|
281 |
- "psrlq $32, %%mm6\n" |
|
282 |
- "paddd %%mm6, %%mm0\n" |
|
283 |
- "movd %%mm0, %1\n" |
|
284 |
- : "+r" (pix1), "=r" (tmp) |
|
285 |
- : "r" ((x86_reg) line_size), "g" (h - 2) |
|
286 |
- : "%ecx"); |
|
287 |
- |
|
288 |
- return tmp; |
|
289 |
-} |
|
290 |
- |
|
291 |
-static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
-{
-    int tmp;
-    uint8_t *pix = pix1;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor %%mm3, %%mm4\n"
-        "pxor %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd %%mm0, %%mm6\n"
-
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "g" (h - 2)
-        : "%ecx");
-
-    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
-}
-
-static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
-                      int line_size, int h)
-{
-    int score1, score2;
-
-    if (c)
-        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
-    else
-        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
-    score2 = hf_noise16_mmx(pix1, line_size, h) -
-             hf_noise16_mmx(pix2, line_size, h);
-
-    if (c)
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
-    else
-        return score1 + FFABS(score2) * 8;
-}
-
-static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
-                     int line_size, int h)
-{
-    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
-    int score2 = hf_noise8_mmx(pix1, line_size, h) -
-                 hf_noise8_mmx(pix2, line_size, h);
-
-    if (c)
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
-    else
-        return score1 + FFABS(score2) * 8;
-}
-
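(Illustrative note, not part of the patch.) The two nsse functions above implement the noise-preserving SSE metric: plain SSE plus a weighted penalty for how much high-frequency detail differs between the two blocks, so the encoder avoids candidates that smooth detail away. A scalar sketch of the same computation, mirroring the generic C reference; the MMX hf_noise kernels approximate it slightly differently at block borders:

    #include <stdlib.h>

    /* sum of absolute 2x2 second differences ("high-frequency noise") */
    static int hf_noise(const uint8_t *pix, int w, int stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++, pix += stride)
            for (int x = 0; x < w - 1; x++)
                sum += abs(pix[x] - pix[x + 1] -
                           pix[x + stride] + pix[x + stride + 1]);
        return sum;
    }

    static int nsse(const uint8_t *p1, const uint8_t *p2,
                    int w, int stride, int h, int weight)
    {
        int sse = 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int d = p1[y * stride + x] - p2[y * stride + x];
                sse += d * d;
            }
        /* weight is avctx->nsse_weight; 8 when no context is available */
        return sse + abs(hf_noise(p1, w, stride, h) -
                         hf_noise(p2, w, stride, h)) * weight;
    }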
-static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                            int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), %%mm2\n"                        \
-    "movq 8(%0), %%mm3\n"                       \
-    "add %2,%0\n"                               \
-    "movq %%mm2, " #out0 "\n"                   \
-    "movq %%mm3, " #out1 "\n"                   \
-    "psubusb " #in0 ", %%mm2\n"                 \
-    "psubusb " #in1 ", %%mm3\n"                 \
-    "psubusb " #out0 ", " #in0 "\n"             \
-    "psubusb " #out1 ", " #in1 "\n"             \
-    "por %%mm2, " #in0 "\n"                     \
-    "por %%mm3, " #in1 "\n"                     \
-    "movq " #in0 ", %%mm2\n"                    \
-    "movq " #in1 ", %%mm3\n"                    \
-    "punpcklbw %%mm7, " #in0 "\n"               \
-    "punpcklbw %%mm7, " #in1 "\n"               \
-    "punpckhbw %%mm7, %%mm2\n"                  \
-    "punpckhbw %%mm7, %%mm3\n"                  \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw %%mm3, %%mm2\n"                      \
-    "paddw %%mm2, " #in0 "\n"                   \
-    "paddw " #in0 ", %%mm6\n"
-
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddw %%mm6, %%mm0\n"
-        "movq %%mm0, %%mm6\n"
-        "psrlq $16, %%mm0\n"
-        "paddw %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp & 0xFFFF;
-}
-#undef SUM
-
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                      int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), %%mm2\n"                        \
-    "movq (%1), " #out0 "\n"                    \
-    "movq 8(%0), %%mm3\n"                       \
-    "movq 8(%1), " #out1 "\n"                   \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb " #out0 ", %%mm2\n"                  \
-    "psubb " #out1 ", %%mm3\n"                  \
-    "pxor %%mm7, %%mm2\n"                       \
-    "pxor %%mm7, %%mm3\n"                       \
-    "movq %%mm2, " #out0 "\n"                   \
-    "movq %%mm3, " #out1 "\n"                   \
-    "psubusb " #in0 ", %%mm2\n"                 \
-    "psubusb " #in1 ", %%mm3\n"                 \
-    "psubusb " #out0 ", " #in0 "\n"             \
-    "psubusb " #out1 ", " #in1 "\n"             \
-    "por %%mm2, " #in0 "\n"                     \
-    "por %%mm3, " #in1 "\n"                     \
-    "movq " #in0 ", %%mm2\n"                    \
-    "movq " #in1 ", %%mm3\n"                    \
-    "punpcklbw %%mm7, " #in0 "\n"               \
-    "punpcklbw %%mm7, " #in1 "\n"               \
-    "punpckhbw %%mm7, %%mm2\n"                  \
-    "punpckhbw %%mm7, %%mm3\n"                  \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw %%mm3, %%mm2\n"                      \
-    "paddw %%mm2, " #in0 "\n"                   \
-    "paddw " #in0 ", %%mm6\n"
-
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddw %%mm6, %%mm0\n"
-        "movq %%mm0, %%mm6\n"
-        "psrlq $16, %%mm0\n"
-        "paddw %%mm6, %%mm0\n"
-        "movd %%mm0, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp & 0x7FFF;
-}
-#undef SUM
-
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
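(Illustrative note, not part of the patch.) The vsad16 kernels above compute a SAD of vertical gradients of the error signal; the pcmpeqw/psllw/packsswb preamble only builds 0x80 bytes that are XORed in so signed byte differences can be handled with unsigned saturating arithmetic. In scalar terms:

    #include <stdlib.h>
    #include <stdint.h>

    /* scalar sketch of vsad16: SAD of vertical gradients of s1 - s2;
     * vsad_intra is the same computation with s2 treated as all zero */
    static int vsad16(const uint8_t *s1, const uint8_t *s2, int stride, int h)
    {
        int score = 0;
        for (int y = 1; y < h; y++) {
            for (int x = 0; x < 16; x++)
                score += abs(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
            s1 += stride;
            s2 += stride;
        }
        return score;
    }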
-#define MMABS_MMX(a,z)                          \
-    "pxor " #z ", " #z "              \n\t"     \
-    "pcmpgtw " #a ", " #z "           \n\t"     \
-    "pxor " #z ", " #a "              \n\t"     \
-    "psubw " #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor " #z ", " #z "              \n\t"     \
-    "psubw " #a ", " #z "             \n\t"     \
-    "pmaxsw " #z ", " #a "            \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw " #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "         \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq " #a ", " #t "              \n\t"     \
-    "psrlq $32, " #a "                \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "movq " #a ", " #t "              \n\t"     \
-    "psrlq $16, " #a "                \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "movd " #a ", " #dst "            \n\t"     \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw $0x0E, " #a ", " #t "     \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "pshufw $0x01, " #a ", " #t "     \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "movd " #a ", " #dst "            \n\t"     \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "           \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "pshuflw $0x0E, " #a ", " #t "    \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "pshuflw $0x01, " #a ", " #t "    \n\t"     \
-    "paddusw " #t ", " #a "           \n\t"     \
-    "movd " #a ", " #dst "            \n\t"     \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" + 0(%1), " #mm "2   \n\t"     \
-    "mov"#m" "#o" + 16(%1), " #mm "3  \n\t"     \
-    "mov"#m" "#o" + 32(%1), " #mm "4  \n\t"     \
-    "mov"#m" "#o" + 48(%1), " #mm "5  \n\t"     \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor %%mm0, %%mm0                \n\t"     \
-    "pxor %%mm1, %%mm1                \n\t"     \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0             \n\t"     \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor %%xmm0, %%xmm0              \n\t"     \
-    "pxor %%xmm1, %%xmm1              \n\t"     \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0           \n\t"     \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                       \
-static int sum_abs_dctelem_ ## cpu(int16_t *block) \
-{                                               \
-    int sum;                                    \
-    __asm__ volatile (                          \
-        DCT_SAD                                 \
-        :"=r"(sum)                              \
-        :"r"(block));                           \
-    return sum & 0xFFFF;                        \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
-    0x0000000000000000ULL,
-    0x0001000100010001ULL,
-    0x0002000200020002ULL,
-};
-
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
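(Illustrative note, not part of the patch.) The DCT_SAD/HSUM machinery above generates the sum_abs_dctelem_* family, whose job is tiny when written out in scalar form:

    #include <stdlib.h>
    #include <stdint.h>

    /* scalar equivalent of the functions produced by DCT_SAD_FUNC:
     * sum of absolute values of one 8x8 block of transform coefficients */
    static int sum_abs_dctelem(const int16_t *block)
    {
        int sum = 0;
        for (int i = 0; i < 64; i++)
            sum += abs(block[i]);
        return sum & 0xFFFF;   /* the SIMD versions accumulate with 16-bit
                                * saturation, hence the truncating mask */
    }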
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
-    x86_reg len = -(stride * h);
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        "movq (%2, %%"REG_a"), %%mm2    \n\t"
-        "movq (%2, %%"REG_a"), %%mm4    \n\t"
-        "add %3, %%"REG_a"              \n\t"
-        "psubusb %%mm0, %%mm2           \n\t"
-        "psubusb %%mm4, %%mm0           \n\t"
-        "movq (%1, %%"REG_a"), %%mm1    \n\t"
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
-        "movq (%2, %%"REG_a"), %%mm5    \n\t"
-        "psubusb %%mm1, %%mm3           \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "por %%mm2, %%mm0               \n\t"
-        "por %%mm1, %%mm3               \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm3, %%mm2              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm2         \n\t"
-        "paddw %%mm1, %%mm0             \n\t"
-        "paddw %%mm3, %%mm2             \n\t"
-        "paddw %%mm2, %%mm0             \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "add %3, %%"REG_a"              \n\t"
-        " js 1b                         \n\t"
-        : "+a" (len)
-        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
-}
-
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 int stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
-                      int stride, int h)
-{
-    int ret;
-    __asm__ volatile (
-        "pxor %%xmm2, %%xmm2            \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movdqu (%1), %%xmm0            \n\t"
-        "movdqu (%1, %4), %%xmm1        \n\t"
-        "psadbw (%2), %%xmm0            \n\t"
-        "psadbw (%2, %4), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "paddw %%xmm1, %%xmm2           \n\t"
-        "lea (%1,%4,2), %1              \n\t"
-        "lea (%2,%4,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        "movhlps %%xmm2, %%xmm0         \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "movd %%xmm2, %3                \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
-        : "r" ((x86_reg) stride));
-    return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   int stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "pavgb 1(%1, %3), %%mm1         \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   int stride, int h)
-{
-    __asm__ volatile (
-        "movq (%1), %%mm0               \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 int stride, int h)
-{
-    __asm__ volatile (
-        "movq "MANGLE(bone)", %%mm5     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1,%3), %%mm2            \n\t"
-        "pavgb 1(%1), %%mm1             \n\t"
-        "pavgb 1(%1,%3), %%mm2          \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2,%3), %%mm1          \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
-                              int stride, int h)
-{
-    x86_reg len = -(stride * h);
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        "movq (%2, %%"REG_a"), %%mm1    \n\t"
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddw %%mm0, %%mm1             \n\t"
-        "paddw %%mm2, %%mm3             \n\t"
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
-        "movq (%3, %%"REG_a"), %%mm2    \n\t"
-        "paddw %%mm5, %%mm1             \n\t"
-        "paddw %%mm5, %%mm3             \n\t"
-        "psrlw $1, %%mm1                \n\t"
-        "psrlw $1, %%mm3                \n\t"
-        "packuswb %%mm3, %%mm1          \n\t"
-        "psubusb %%mm1, %%mm4           \n\t"
-        "psubusb %%mm2, %%mm1           \n\t"
-        "por %%mm4, %%mm1               \n\t"
-        "movq %%mm1, %%mm0              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "paddw %%mm1, %%mm0             \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "add %4, %%"REG_a"              \n\t"
-        " js 1b                         \n\t"
-        : "+a" (len)
-        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
-          "r" ((x86_reg) stride));
-}
-
-static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
-    x86_reg len = -(stride * h);
-    __asm__ volatile (
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddw %%mm2, %%mm0             \n\t"
-        "paddw %%mm3, %%mm1             \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%2, %%"REG_a"), %%mm2    \n\t"
-        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "movq %%mm4, %%mm5              \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddw %%mm4, %%mm2             \n\t"
-        "paddw %%mm5, %%mm3             \n\t"
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
-        "paddw %%mm2, %%mm0             \n\t"
-        "paddw %%mm3, %%mm1             \n\t"
-        "paddw %%mm5, %%mm0             \n\t"
-        "paddw %%mm5, %%mm1             \n\t"
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
-        "movq (%3, %%"REG_a"), %%mm5    \n\t"
-        "psrlw $2, %%mm0                \n\t"
-        "psrlw $2, %%mm1                \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "psubusb %%mm0, %%mm4           \n\t"
-        "psubusb %%mm5, %%mm0           \n\t"
-        "por %%mm4, %%mm0               \n\t"
-        "movq %%mm0, %%mm4              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm4         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm4, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "movq %%mm3, %%mm1              \n\t"
-        "add %4, %%"REG_a"              \n\t"
-        " js 1b                         \n\t"
-        : "+a" (len)
-        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
-          "r" ((x86_reg) stride));
-}
-
-static inline int sum_mmx(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movq %%mm6, %%mm0              \n\t"
-        "psrlq $32, %%mm6               \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "movq %%mm6, %%mm0              \n\t"
-        "psrlq $16, %%mm6               \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret & 0xFFFF;
-}
-
-static inline int sum_mmxext(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret;
-}
-
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
-    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
-}
-
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
-{
-    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
-}
-
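(Illustrative note, not part of the patch.) The half-pel SAD helpers above (sad8_x2a/y2a/4, wired up below as pix_abs[..][1..3]) score a block against a horizontally, vertically, or two-dimensionally averaged reference, using the rounding constants in round_tab. A scalar sketch of the xy2 case; the mmxext pavgb/bone variant is a faster approximation of the same rounding:

    #include <stdlib.h>
    #include <stdint.h>

    /* SAD against the four-tap half-pel average; +2 is round_tab[2] */
    static int sad8_xy2(const uint8_t *ref, const uint8_t *cur, int stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++, ref += stride, cur += stride)
            for (int x = 0; x < 8; x++) {
                int avg = (ref[x] + ref[x + 1] +
                           ref[x + stride] + ref[x + stride + 1] + 2) >> 2;
                sum += abs(cur[x] - avg);
            }
        return sum;
    }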
-#define PIX_SAD(suf)                                                   \
-static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
-                        uint8_t *blk1, int stride, int h)              \
-{                                                                      \
-    assert(h == 8);                                                    \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        :);                                                            \
-                                                                       \
-    sad8_1_ ## suf(blk1, blk2, stride, 8);                             \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
-                           uint8_t *blk1, int stride, int h)           \
-{                                                                      \
-    assert(h == 8);                                                    \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        "movq %0, %%mm5        \n\t"                                   \
-        :: "m" (round_tab[1]));                                        \
-                                                                       \
-    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                           \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
-                           uint8_t *blk1, int stride, int h)           \
-{                                                                      \
-    assert(h == 8);                                                    \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        "movq %0, %%mm5        \n\t"                                   \
-        :: "m" (round_tab[1]));                                        \
-                                                                       \
-    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                           \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
-                            uint8_t *blk1, int stride, int h)          \
-{                                                                      \
-    assert(h == 8);                                                    \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        ::);                                                           \
-                                                                       \
-    sad8_4_ ## suf(blk1, blk2, stride, 8);                             \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,             \
-                         uint8_t *blk1, int stride, int h)             \
-{                                                                      \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        :);                                                            \
-                                                                       \
-    sad8_1_ ## suf(blk1, blk2, stride, h);                             \
-    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
-                            uint8_t *blk1, int stride, int h)          \
-{                                                                      \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        "movq %0, %%mm5        \n\t"                                   \
-        :: "m" (round_tab[1]));                                        \
-                                                                       \
-    sad8_x2a_ ## suf(blk1, blk2, stride, h);                           \
-    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                   \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
-                            uint8_t *blk1, int stride, int h)          \
-{                                                                      \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        "movq %0, %%mm5        \n\t"                                   \
-        :: "m" (round_tab[1]));                                        \
-                                                                       \
-    sad8_y2a_ ## suf(blk1, blk2, stride, h);                           \
-    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                   \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-                                                                       \
-static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,         \
-                             uint8_t *blk1, int stride, int h)         \
-{                                                                      \
-    __asm__ volatile (                                                 \
-        "pxor %%mm7, %%mm7     \n\t"                                   \
-        "pxor %%mm6, %%mm6     \n\t"                                   \
-        ::);                                                           \
-                                                                       \
-    sad8_4_ ## suf(blk1, blk2, stride, h);                             \
-    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
-                                                                       \
-    return sum_ ## suf();                                              \
-}                                                                      \
-
-PIX_SAD(mmx)
-PIX_SAD(mmxext)
-
-#endif /* HAVE_INLINE_ASM */
-
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  int line_size, int h);
-
-#define hadamard_func(cpu)                                             \
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,    \
-                                  uint8_t *src2, int stride, int h);   \
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,  \
-                                    uint8_t *src2, int stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
-av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-#if HAVE_INLINE_ASM
-    if (INLINE_MMX(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
-        c->pix_abs[0][0] = sad16_mmx;
-        c->pix_abs[0][1] = sad16_x2_mmx;
-        c->pix_abs[0][2] = sad16_y2_mmx;
-        c->pix_abs[0][3] = sad16_xy2_mmx;
-        c->pix_abs[1][0] = sad8_mmx;
-        c->pix_abs[1][1] = sad8_x2_mmx;
-        c->pix_abs[1][2] = sad8_y2_mmx;
-        c->pix_abs[1][3] = sad8_xy2_mmx;
-
-        c->sad[0] = sad16_mmx;
-        c->sad[1] = sad8_mmx;
-
-        c->sse[0]  = sse16_mmx;
-        c->sse[1]  = sse8_mmx;
-        c->vsad[4] = vsad_intra16_mmx;
-
-        c->nsse[0] = nsse16_mmx;
-        c->nsse[1] = nsse8_mmx;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-            c->vsad[0] = vsad16_mmx;
-        }
-    }
-
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        c->pix_abs[0][0] = sad16_mmxext;
-        c->pix_abs[1][0] = sad8_mmxext;
-
-        c->sad[0] = sad16_mmxext;
-        c->sad[1] = sad8_mmxext;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-            c->pix_abs[0][1] = sad16_x2_mmxext;
-            c->pix_abs[0][2] = sad16_y2_mmxext;
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
-            c->pix_abs[1][1] = sad8_x2_mmxext;
-            c->pix_abs[1][2] = sad8_y2_mmxext;
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
-
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
-    if (INLINE_SSE2(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
-    }
-
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
-        c->sad[0] = sad16_sse2;
-    }
-
-#if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
-    }
-#endif
-#endif /* HAVE_INLINE_ASM */
-
-    if (EXTERNAL_MMX(cpu_flags)) {
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
-    }
-
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-    }
-
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        c->sse[0] = ff_sse16_sse2;
-
-#if HAVE_ALIGNED_STACK
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
-    }
-
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
-    }
-}
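(Illustrative note, not part of the patch.) One idiom recurs throughout the C code above and in the assembly files that follow: the absolute difference of unsigned bytes computed as two saturating subtractions ORed together (psubusb + por), exactly as the inline comments describe. A scalar sketch of why it works:

    /* For unsigned values, one of the two saturating subtractions is 0
     * and the other is the true difference, so their OR is |a - b|. */
    static inline unsigned abs_diff_u8(unsigned a, unsigned b)
    {
        unsigned d1 = a > b ? a - b : 0;   /* saturating a - b */
        unsigned d2 = b > a ? b - a : 0;   /* saturating b - a */
        return d1 | d2;                    /* exactly |a - b|  */
    }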
deleted file mode 100644
@@ -1,336 +0,0 @@
-;*****************************************************************************
-;* MMX optimized DSP utils
-;*****************************************************************************
-;* Copyright (c) 2000, 2001 Fabrice Bellard
-;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;*****************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION .text
-
-%macro DIFF_PIXELS_1 4
-    movh            %1, %3
-    movh            %2, %4
-    punpcklbw       %2, %1
-    punpcklbw       %1, %1
-    psubw           %1, %2
-%endmacro
-
-; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
-; %6=temporary storage location
-; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
-%macro DIFF_PIXELS_8 6
-    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
-    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
-    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
-    add             %1, %5
-    add             %2, %5
-    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
-    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
-    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
-    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
-%ifdef m8
-    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
-%else
-    mova          [%6], m0
-    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
-    mova            m0, [%6]
-%endif
-    sub             %1, %5
-    sub             %2, %5
-%endmacro
-
-%macro HADAMARD8 0
-    SUMSUB_BADC w, 0, 1, 2, 3
-    SUMSUB_BADC w, 4, 5, 6, 7
-    SUMSUB_BADC w, 0, 2, 1, 3
-    SUMSUB_BADC w, 4, 6, 5, 7
-    SUMSUB_BADC w, 0, 4, 1, 5
-    SUMSUB_BADC w, 2, 6, 3, 7
-%endmacro
-
-%macro ABS1_SUM 3
-    ABS1            %1, %2
-    paddusw         %3, %1
-%endmacro
-
-%macro ABS2_SUM 6
-    ABS2            %1, %2, %3, %4
-    paddusw         %5, %1
-    paddusw         %6, %2
-%endmacro
-
-%macro ABS_SUM_8x8_64 1
-    ABS2            m0, m1, m8, m9
-    ABS2_SUM        m2, m3, m8, m9, m0, m1
-    ABS2_SUM        m4, m5, m8, m9, m0, m1
-    ABS2_SUM        m6, m7, m8, m9, m0, m1
-    paddusw         m0, m1
-%endmacro
-
-%macro ABS_SUM_8x8_32 1
-    mova          [%1], m7
-    ABS1            m0, m7
-    ABS1            m1, m7
-    ABS1_SUM        m2, m7, m0
-    ABS1_SUM        m3, m7, m1
-    ABS1_SUM        m4, m7, m0
-    ABS1_SUM        m5, m7, m1
-    ABS1_SUM        m6, m7, m0
-    mova            m2, [%1]
-    ABS1_SUM        m2, m7, m1
-    paddusw         m0, m1
-%endmacro
-
-; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
-; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
-; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
-%macro HSUM 3
-%if cpuflag(sse2)
-    movhlps         %2, %1
-    paddusw         %1, %2
-    pshuflw         %2, %1, 0xE
-    paddusw         %1, %2
-    pshuflw         %2, %1, 0x1
-    paddusw         %1, %2
-    movd            %3, %1
-%elif cpuflag(mmxext)
-    pshufw          %2, %1, 0xE
-    paddusw         %1, %2
-    pshufw          %2, %1, 0x1
-    paddusw         %1, %2
-    movd            %3, %1
-%elif cpuflag(mmx)
-    mova            %2, %1
-    psrlq           %1, 32
-    paddusw         %1, %2
-    mova            %2, %1
-    psrlq           %1, 16
-    paddusw         %1, %2
-    movd            %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
-    mova [%1+mmsize*0], %2
-    mova [%1+mmsize*1], %3
-    mova [%1+mmsize*2], %4
-    mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
-    mova %2, [%1+mmsize*0]
-    mova %3, [%1+mmsize*1]
-    mova %4, [%1+mmsize*2]
-    mova %5, [%1+mmsize*3]
-%endmacro
-
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB rsp, pad
-%endif
-    call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
-    ADD rsp, pad
-%endif
-    RET
-
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB rsp, pad
-%endif
-
-    call hadamard8x8_diff %+ SUFFIX
-    mov r5d, eax
-
-    add r1, 8
-    add r2, 8
-    call hadamard8x8_diff %+ SUFFIX
-    add r5d, eax
-
-    cmp r4d, 16
-    jne .done
-
-    lea r1, [r1+r3*8-8]
-    lea r2, [r2+r3*8-8]
-    call hadamard8x8_diff %+ SUFFIX
-    add r5d, eax
-
-    add r1, 8
-    add r2, 8
-    call hadamard8x8_diff %+ SUFFIX
-    add r5d, eax
-
-.done:
-    mov eax, r5d
-%ifndef m8
-    ADD rsp, pad
-%endif
-    RET
-%endmacro
-
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
-hadamard8x8_diff %+ SUFFIX:
-    lea r0, [r3*3]
-    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
-    HADAMARD8
-%if ARCH_X86_64
-    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
-%else
-    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
-%endif
-    HADAMARD8
-    ABS_SUM_8x8 rsp+gprsize
-    HSUM m0, m1, eax
-    and eax, 0xFFFF
-    ret
-
-hadamard8_16_wrapper %1, 3
-%elif cpuflag(mmx)
-ALIGN 16
-; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-;                               uint8_t *src2, int stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
-; can simply call this 2x2x (and that's why we access rsp+gprsize
-; everywhere, which is rsp of calling func
-hadamard8x8_diff %+ SUFFIX:
-    lea r0, [r3*3]
-
-    ; first 4x8 pixels
-    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W 0, 1, 2, 3, 7
-    STORE4 rsp+gprsize, m0, m1, m2, m3
-    mova m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W 4, 5, 6, 7, 0
-    STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
-
-    ; second 4x8 pixels
-    DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W 0, 1, 2, 3, 7
-    STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
-    mova m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W 4, 5, 6, 7, 0
-
-    LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize+0x60
-    mova [rsp+gprsize+0x60], m0
-
-    LOAD4 rsp+gprsize     , m0, m1, m2, m3
-    LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize
-    paddusw m0, [rsp+gprsize+0x60]
-
-    HSUM m0, m1, eax
-    and rax, 0xFFFF
-    ret
-
-hadamard8_16_wrapper 0, 14
-%endif
-%endmacro
-
-INIT_MMX mmx
-HADAMARD8_DIFF
-
-INIT_MMX mmxext
-HADAMARD8_DIFF
-
-INIT_XMM sse2
-%if ARCH_X86_64
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
-%else
-%define ABS_SUM_8x8 ABS_SUM_8x8_32
-%endif
-HADAMARD8_DIFF 10
-
-INIT_XMM ssse3
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
-HADAMARD8_DIFF 9
-
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;                   int line_size, int h);
-cglobal sse16, 5, 5, 8
-    shr      r4d, 1
-    pxor      m0, m0         ; mm0 = 0
-    pxor      m7, m7         ; mm7 holds the sum
-
-.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
-
-    ; todo: mm1-mm2, mm3-mm4
-    ; algo: subtract mm1 from mm2 with saturation and vice versa
-    ;       OR the result to get the absolute difference
-    mova      m5, m1
-    mova      m6, m3
-    psubusb   m1, m2
-    psubusb   m3, m4
-    psubusb   m2, m5
-    psubusb   m4, m6
-
-    por       m2, m1
-    por       m4, m3
-
-    ; now convert to 16-bit vectors so we can square them
-    mova      m1, m2
-    mova      m3, m4
-
-    punpckhbw m2, m0
-    punpckhbw m4, m0
-    punpcklbw m1, m0         ; mm1 not spread over (mm1,mm2)
-    punpcklbw m3, m0         ; mm4 not spread over (mm3,mm4)
-
-    pmaddwd   m2, m2
-    pmaddwd   m4, m4
-    pmaddwd   m1, m1
-    pmaddwd   m3, m3
-
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
-
-    paddd     m1, m2
-    paddd     m3, m4
-    paddd     m7, m1
-    paddd     m7, m3
-
-    dec       r4
-    jnz .next2lines
-
-    mova      m1, m7
-    psrldq    m7, 8          ; shift hi qword to lo
-    paddd     m7, m1
-    mova      m1, m7
-    psrldq    m7, 4          ; shift hi dword to lo
-    paddd     m7, m1
-    movd     eax, m7         ; return value
-    RET
new file mode 100644
@@ -0,0 +1,336 @@
+;***************************************************************************** |
|
1 |
+;* SIMD-optimized motion compensation estimation |
|
2 |
+;***************************************************************************** |
|
3 |
+;* Copyright (c) 2000, 2001 Fabrice Bellard |
|
4 |
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
+;* |
|
6 |
+;* This file is part of Libav. |
|
7 |
+;* |
|
8 |
+;* Libav is free software; you can redistribute it and/or |
|
9 |
+;* modify it under the terms of the GNU Lesser General Public |
|
10 |
+;* License as published by the Free Software Foundation; either |
|
11 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
12 |
+;* |
|
13 |
+;* Libav is distributed in the hope that it will be useful, |
|
14 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
+;* Lesser General Public License for more details. |
|
17 |
+;* |
|
18 |
+;* You should have received a copy of the GNU Lesser General Public |
|
19 |
+;* License along with Libav; if not, write to the Free Software |
|
20 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
+;***************************************************************************** |
|
22 |
+ |
|
23 |
+%include "libavutil/x86/x86util.asm" |
|
24 |
+ |
|
25 |
+SECTION .text |
|
26 |
+ |
|
27 |
+%macro DIFF_PIXELS_1 4 |
|
28 |
+ movh %1, %3 |
|
29 |
+ movh %2, %4 |
|
30 |
+ punpcklbw %2, %1 |
|
31 |
+ punpcklbw %1, %1 |
|
32 |
+ psubw %1, %2 |
|
33 |
+%endmacro |
|
34 |
+ |
|
35 |
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 |
|
36 |
+; %6=temporary storage location |
|
37 |
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) |
|
38 |
+%macro DIFF_PIXELS_8 6 |
|
39 |
+ DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] |
|
40 |
+ DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
41 |
+ DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
42 |
+ add %1, %5 |
|
43 |
+ add %2, %5 |
|
44 |
+ DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] |
|
45 |
+ DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
46 |
+ DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
47 |
+ DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] |
|
48 |
+%ifdef m8 |
|
49 |
+ DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] |
|
50 |
+%else |
|
51 |
+ mova [%6], m0 |
|
52 |
+ DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] |
|
53 |
+ mova m0, [%6] |
|
54 |
+%endif |
|
55 |
+ sub %1, %5 |
|
56 |
+ sub %2, %5 |
|
57 |
+%endmacro |
|
58 |
+ |
|
59 |
+%macro HADAMARD8 0 |
|
60 |
+ SUMSUB_BADC w, 0, 1, 2, 3 |
|
61 |
+ SUMSUB_BADC w, 4, 5, 6, 7 |
|
62 |
+ SUMSUB_BADC w, 0, 2, 1, 3 |
|
63 |
+ SUMSUB_BADC w, 4, 6, 5, 7 |
|
64 |
+ SUMSUB_BADC w, 0, 4, 1, 5 |
|
65 |
+ SUMSUB_BADC w, 2, 6, 3, 7 |
|
66 |
+%endmacro |
|
67 |
+ |
|
68 |
+%macro ABS1_SUM 3 |
|
69 |
+ ABS1 %1, %2 |
|
70 |
+ paddusw %3, %1 |
|
71 |
+%endmacro |
|
72 |
+ |
|
73 |
+%macro ABS2_SUM 6 |
|
74 |
+ ABS2 %1, %2, %3, %4 |
|
75 |
+ paddusw %5, %1 |
|
76 |
+ paddusw %6, %2 |
|
77 |
+%endmacro |
|
78 |
+ |
|
79 |
+%macro ABS_SUM_8x8_64 1 |
|
80 |
+ ABS2 m0, m1, m8, m9 |
|
81 |
+ ABS2_SUM m2, m3, m8, m9, m0, m1 |
|
82 |
+ ABS2_SUM m4, m5, m8, m9, m0, m1 |
|
83 |
+ ABS2_SUM m6, m7, m8, m9, m0, m1 |
|
84 |
+ paddusw m0, m1 |
|
85 |
+%endmacro |
|
86 |
+ |
|
87 |
+%macro ABS_SUM_8x8_32 1 |
|
88 |
+ mova [%1], m7 |
|
89 |
+ ABS1 m0, m7 |
|
90 |
+ ABS1 m1, m7 |
|
91 |
+ ABS1_SUM m2, m7, m0 |
|
92 |
+ ABS1_SUM m3, m7, m1 |
|
93 |
+ ABS1_SUM m4, m7, m0 |
|
94 |
+ ABS1_SUM m5, m7, m1 |
|
95 |
+ ABS1_SUM m6, m7, m0 |
|
96 |
+ mova m2, [%1] |
|
97 |
+ ABS1_SUM m2, m7, m1 |
|
98 |
+ paddusw m0, m1 |
|
99 |
+%endmacro |
|
100 |
+ |
|
101 |
+; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to |
|
102 |
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
|
103 |
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost. |
|
104 |
+%macro HSUM 3 |
|
105 |
+%if cpuflag(sse2) |
|
106 |
+ movhlps %2, %1 |
|
107 |
+ paddusw %1, %2 |
|
108 |
+ pshuflw %2, %1, 0xE |
|
109 |
+ paddusw %1, %2 |
|
110 |
+ pshuflw %2, %1, 0x1 |
|
111 |
+ paddusw %1, %2 |
|
112 |
+ movd %3, %1 |
|
113 |
+%elif cpuflag(mmxext) |
|
114 |
+ pshufw %2, %1, 0xE |
|
115 |
+ paddusw %1, %2 |
|
116 |
+ pshufw %2, %1, 0x1 |
|
117 |
+ paddusw %1, %2 |
|
118 |
+ movd %3, %1 |
|
119 |
+%elif cpuflag(mmx) |
|
120 |
+ mova %2, %1 |
|
121 |
+ psrlq %1, 32 |
|
122 |
+ paddusw %1, %2 |
|
123 |
+ mova %2, %1 |
|
124 |
+ psrlq %1, 16 |
|
125 |
+ paddusw %1, %2 |
|
126 |
+ movd %3, %1 |
|
127 |
+%endif |
|
128 |
+%endmacro |
|
129 |
+ |
|
130 |
+%macro STORE4 5 |
|
131 |
+ mova [%1+mmsize*0], %2 |
|
132 |
+ mova [%1+mmsize*1], %3 |
|
133 |
+ mova [%1+mmsize*2], %4 |
|
134 |
+ mova [%1+mmsize*3], %5 |
|
135 |
+%endmacro |
|
136 |
+ |
|
137 |
+%macro LOAD4 5 |
|
138 |
+ mova %2, [%1+mmsize*0] |
|
139 |
+ mova %3, [%1+mmsize*1] |
|
140 |
+ mova %4, [%1+mmsize*2] |
|
141 |
+ mova %5, [%1+mmsize*3] |
|
142 |
+%endmacro |
|
143 |
+ |
|
144 |
+%macro hadamard8_16_wrapper 2 |
|
145 |
+cglobal hadamard8_diff, 4, 4, %1 |
|
146 |
+%ifndef m8 |
|
147 |
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
148 |
+ SUB rsp, pad |
|
149 |
+%endif |
|
150 |
+ call hadamard8x8_diff %+ SUFFIX |
|
151 |
+%ifndef m8 |
|
152 |
+ ADD rsp, pad |
|
153 |
+%endif |
|
154 |
+ RET |
|
155 |
+ |
|
156 |
+cglobal hadamard8_diff16, 5, 6, %1 |
|
157 |
+%ifndef m8 |
|
158 |
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
159 |
+ SUB rsp, pad |
|
160 |
+%endif |
|
161 |
+ |
|
162 |
+ call hadamard8x8_diff %+ SUFFIX |
|
163 |
+ mov r5d, eax |
|
164 |
+ |
|
165 |
+ add r1, 8 |
|
166 |
+ add r2, 8 |
|
167 |
+ call hadamard8x8_diff %+ SUFFIX |
|
168 |
+ add r5d, eax |
|
169 |
+ |
|
170 |
+ cmp r4d, 16 |
|
171 |
+ jne .done |
|
172 |
+ |
|
173 |
+ lea r1, [r1+r3*8-8] |
|
174 |
+ lea r2, [r2+r3*8-8] |
|
175 |
+ call hadamard8x8_diff %+ SUFFIX |
|
176 |
+ add r5d, eax |
|
177 |
+ |
|
178 |
+ add r1, 8 |
|
179 |
+ add r2, 8 |
|
180 |
+ call hadamard8x8_diff %+ SUFFIX |
|
181 |
+ add r5d, eax |
|
182 |
+ |
|
183 |
+.done: |
|
184 |
+ mov eax, r5d |
|
185 |
+%ifndef m8 |
|
186 |
+ ADD rsp, pad |
|
187 |
+%endif |
|
188 |
+ RET |
|
189 |
+%endmacro |
|
190 |
+ |
|
191 |
+%macro HADAMARD8_DIFF 0-1 |
|
192 |
+%if cpuflag(sse2) |
|
193 |
+hadamard8x8_diff %+ SUFFIX: |
|
194 |
+ lea r0, [r3*3] |
|
195 |
+ DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize |
|
196 |
+ HADAMARD8 |
|
197 |
+%if ARCH_X86_64 |
|
198 |
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
|
199 |
+%else |
|
200 |
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] |
|
201 |
+%endif |
|
202 |
+ HADAMARD8 |
|
203 |
+    ABS_SUM_8x8    rsp+gprsize
+    HSUM           m0, m1, eax
+    and            eax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper %1, 3
+%elif cpuflag(mmx)
+ALIGN 16
+; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
+;                               uint8_t *src2, int stride, int h)
+; r0 = void *s = unused, int h = unused (always 8)
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16
+; can simply call this 2x2 times (and that's why we access rsp+gprsize
+; everywhere, which is the rsp of the calling function)
+hadamard8x8_diff %+ SUFFIX:
+    lea            r0, [r3*3]
+
+    ; first 4x8 pixels
+    DIFF_PIXELS_8  r1, r2, 0, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova           [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W  0, 1, 2, 3, 7
+    STORE4         rsp+gprsize, m0, m1, m2, m3
+    mova           m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W  4, 5, 6, 7, 0
+    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
+
+    ; second 4x8 pixels
+    DIFF_PIXELS_8  r1, r2, 4, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova           [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W  0, 1, 2, 3, 7
+    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
+    mova           m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W  4, 5, 6, 7, 0
+
+    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize+0x60
+    mova           [rsp+gprsize+0x60], m0
+
+    LOAD4          rsp+gprsize,      m0, m1, m2, m3
+    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize
+    paddusw        m0, [rsp+gprsize+0x60]
+
+    HSUM           m0, m1, eax
+    and            rax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper 0, 14
+%endif
+%endmacro
+
+INIT_MMX mmx
+HADAMARD8_DIFF
+
+INIT_MMX mmxext
+HADAMARD8_DIFF
+
+INIT_XMM sse2
+%if ARCH_X86_64
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+%else
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
+%endif
+HADAMARD8_DIFF 10
+
+INIT_XMM ssse3
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+HADAMARD8_DIFF 9
+
+INIT_XMM sse2
+; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   int line_size, int h);
+cglobal sse16, 5, 5, 8
+    shr       r4d, 1
+    pxor       m0, m0          ; mm0 = 0
+    pxor       m7, m7          ; mm7 holds the sum
+
+.next2lines:                   ; FIXME why are these unaligned movs? pix1[] is aligned
+    movu       m1, [r1   ]    ; mm1 = pix1[0][0-15]
+    movu       m2, [r2   ]    ; mm2 = pix2[0][0-15]
+    movu       m3, [r1+r3]    ; mm3 = pix1[1][0-15]
+    movu       m4, [r2+r3]    ; mm4 = pix2[1][0-15]
+
+    ; todo: mm1-mm2, mm3-mm4
+    ; algo: subtract mm1 from mm2 with saturation and vice versa
+    ;       OR the result to get the absolute difference
+    mova       m5, m1
+    mova       m6, m3
+    psubusb    m1, m2
+    psubusb    m3, m4
+    psubusb    m2, m5
+    psubusb    m4, m6
+
+    por        m2, m1
+    por        m4, m3
+
+    ; now convert to 16-bit vectors so we can square them
+    mova       m1, m2
+    mova       m3, m4
+
+    punpckhbw  m2, m0
+    punpckhbw  m4, m0
+    punpcklbw  m1, m0         ; mm1 now spread over (mm1, mm2)
+    punpcklbw  m3, m0         ; mm4 now spread over (mm3, mm4)
+
+    pmaddwd    m2, m2
+    pmaddwd    m4, m4
+    pmaddwd    m1, m1
+    pmaddwd    m3, m3
+
+    lea        r1, [r1+r3*2]  ; pix1 += 2 * line_size
+    lea        r2, [r2+r3*2]  ; pix2 += 2 * line_size
+
+    paddd      m1, m2
+    paddd      m3, m4
+    paddd      m7, m1
+    paddd      m7, m3
+
+    dec        r4
+    jnz .next2lines
+
+    mova       m1, m7
+    psrldq     m7, 8          ; shift hi qword to lo
+    paddd      m7, m1
+    mova       m1, m7
+    psrldq     m7, 4          ; shift hi dword to lo
+    paddd      m7, m1
+    movd      eax, m7         ; return value
+    RET
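Both the SSE2 kernel above and the MMX C versions in the new init file below lean on the same trick for per-byte absolute differences: unsigned saturating subtraction in both directions, OR-ed together. A minimal scalar model of the idiom (illustrative only, not part of the patch):

    #include <stdint.h>

    /* |a - b| for unsigned bytes, the psubusb/por way: whichever
     * subtraction would go negative saturates to 0, so OR-ing the two
     * one-sided differences yields |a - b| with no sign handling. */
    static inline uint8_t absdiff_u8(uint8_t a, uint8_t b)
    {
        uint8_t d1 = a > b ? (uint8_t)(a - b) : 0; /* psubusb b, a */
        uint8_t d2 = b > a ? (uint8_t)(b - a) : 0; /* psubusb a, b */
        return d1 | d2;                            /* por */
    }

The pmaddwd step then squares the widened differences and adds them pairwise, folding each row of squared errors into a handful of dword accumulators.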
new file mode 100644
@@ -0,0 +1,1321 @@
+/*
+ * SIMD-optimized motion estimation
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/me_cmp.h"
+#include "libavcodec/mpegvideo.h"
+
+#if HAVE_INLINE_ASM
+
+static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    int line_size, int h)
+{
+    int tmp;
+
+    __asm__ volatile (
+        "movl %4, %%ecx \n"
+        "shr $1, %%ecx \n"
+        "pxor %%mm0, %%mm0 \n"      /* mm0 = 0 */
+        "pxor %%mm7, %%mm7 \n"      /* mm7 holds the sum */
+        "1: \n"
+        "movq (%0), %%mm1 \n"       /* mm1 = pix1[0][0 - 7] */
+        "movq (%1), %%mm2 \n"       /* mm2 = pix2[0][0 - 7] */
+        "movq (%0, %3), %%mm3 \n"   /* mm3 = pix1[1][0 - 7] */
+        "movq (%1, %3), %%mm4 \n"   /* mm4 = pix2[1][0 - 7] */
+
+        /* todo: mm1-mm2, mm3-mm4 */
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
+        /*       OR the results to get absolute difference */
+        "movq %%mm1, %%mm5 \n"
+        "movq %%mm3, %%mm6 \n"
+        "psubusb %%mm2, %%mm1 \n"
+        "psubusb %%mm4, %%mm3 \n"
+        "psubusb %%mm5, %%mm2 \n"
+        "psubusb %%mm6, %%mm4 \n"
+
+        "por %%mm1, %%mm2 \n"
+        "por %%mm3, %%mm4 \n"
+
+        /* now convert to 16-bit vectors so we can square them */
+        "movq %%mm2, %%mm1 \n"
+        "movq %%mm4, %%mm3 \n"
+
+        "punpckhbw %%mm0, %%mm2 \n"
+        "punpckhbw %%mm0, %%mm4 \n"
+        "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
+        "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
+
+        "pmaddwd %%mm2, %%mm2 \n"
+        "pmaddwd %%mm4, %%mm4 \n"
+        "pmaddwd %%mm1, %%mm1 \n"
+        "pmaddwd %%mm3, %%mm3 \n"
+
+        "lea (%0, %3, 2), %0 \n"    /* pix1 += 2 * line_size */
+        "lea (%1, %3, 2), %1 \n"    /* pix2 += 2 * line_size */
+
+        "paddd %%mm2, %%mm1 \n"
+        "paddd %%mm4, %%mm3 \n"
+        "paddd %%mm1, %%mm7 \n"
+        "paddd %%mm3, %%mm7 \n"
+
+        "decl %%ecx \n"
+        "jnz 1b \n"
+
+        "movq %%mm7, %%mm1 \n"
+        "psrlq $32, %%mm7 \n"       /* shift hi dword to lo */
+        "paddd %%mm7, %%mm1 \n"
+        "movd %%mm1, %2 \n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+
+static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     int line_size, int h)
+{
+    int tmp;
+
+    __asm__ volatile (
+        "movl %4, %%ecx\n"
+        "pxor %%mm0, %%mm0\n"      /* mm0 = 0 */
+        "pxor %%mm7, %%mm7\n"      /* mm7 holds the sum */
+        "1:\n"
+        "movq (%0), %%mm1\n"       /* mm1 = pix1[0 - 7] */
+        "movq (%1), %%mm2\n"       /* mm2 = pix2[0 - 7] */
+        "movq 8(%0), %%mm3\n"      /* mm3 = pix1[8 - 15] */
+        "movq 8(%1), %%mm4\n"      /* mm4 = pix2[8 - 15] */
+
+        /* todo: mm1-mm2, mm3-mm4 */
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
+        /*       OR the results to get absolute difference */
+        "movq %%mm1, %%mm5\n"
+        "movq %%mm3, %%mm6\n"
+        "psubusb %%mm2, %%mm1\n"
+        "psubusb %%mm4, %%mm3\n"
+        "psubusb %%mm5, %%mm2\n"
+        "psubusb %%mm6, %%mm4\n"
+
+        "por %%mm1, %%mm2\n"
+        "por %%mm3, %%mm4\n"
+
+        /* now convert to 16-bit vectors so we can square them */
+        "movq %%mm2, %%mm1\n"
+        "movq %%mm4, %%mm3\n"
+
+        "punpckhbw %%mm0, %%mm2\n"
+        "punpckhbw %%mm0, %%mm4\n"
+        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
+        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
+
+        "pmaddwd %%mm2, %%mm2\n"
+        "pmaddwd %%mm4, %%mm4\n"
+        "pmaddwd %%mm1, %%mm1\n"
+        "pmaddwd %%mm3, %%mm3\n"
+
+        "add %3, %0\n"
+        "add %3, %1\n"
+
+        "paddd %%mm2, %%mm1\n"
+        "paddd %%mm4, %%mm3\n"
+        "paddd %%mm1, %%mm7\n"
+        "paddd %%mm3, %%mm7\n"
+
+        "decl %%ecx\n"
+        "jnz 1b\n"
+
+        "movq %%mm7, %%mm1\n"
+        "psrlq $32, %%mm7\n"       /* shift hi dword to lo */
+        "paddd %%mm7, %%mm1\n"
+        "movd %%mm1, %2\n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+
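For reference, here is a hedged scalar model of what sse8_mmx()/sse16_mmx() accumulate; the name sse_ref and the explicit width parameter are illustrative, not from the patch:

    #include <stdint.h>

    /* Sum of squared errors over a w x h block of 8-bit pixels,
     * matching the value the MMX loops build up in mm7. */
    static int sse_ref(const uint8_t *pix1, const uint8_t *pix2,
                       int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }

sse8_mmx() corresponds to w = 8 and consumes two rows per iteration (hence the "shr $1" on the row counter); sse16_mmx() covers w = 16 one row at a time.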
+static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
+{
+    int tmp;
+
+    __asm__ volatile (
+        "movl %3, %%ecx\n"
+        "pxor %%mm7, %%mm7\n"
+        "pxor %%mm6, %%mm6\n"
+
+        "movq (%0), %%mm0\n"
+        "movq %%mm0, %%mm1\n"
+        "psllq $8, %%mm0\n"
+        "psrlq $8, %%mm1\n"
+        "psrlq $8, %%mm0\n"
+        "movq %%mm0, %%mm2\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm0\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm2\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm0\n"
+        "psubw %%mm3, %%mm2\n"
+
+        "add %2, %0\n"
+
+        "movq (%0), %%mm4\n"
+        "movq %%mm4, %%mm1\n"
+        "psllq $8, %%mm4\n"
+        "psrlq $8, %%mm1\n"
+        "psrlq $8, %%mm4\n"
+        "movq %%mm4, %%mm5\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm4\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm5\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm4\n"
+        "psubw %%mm3, %%mm5\n"
+        "psubw %%mm4, %%mm0\n"
+        "psubw %%mm5, %%mm2\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm0, %%mm3\n\t"
+        "pcmpgtw %%mm2, %%mm1\n\t"
+        "pxor %%mm3, %%mm0\n"
+        "pxor %%mm1, %%mm2\n"
+        "psubw %%mm3, %%mm0\n"
+        "psubw %%mm1, %%mm2\n"
+        "paddw %%mm0, %%mm2\n"
+        "paddw %%mm2, %%mm6\n"
+
+        "add %2, %0\n"
+        "1:\n"
+
+        "movq (%0), %%mm0\n"
+        "movq %%mm0, %%mm1\n"
+        "psllq $8, %%mm0\n"
+        "psrlq $8, %%mm1\n"
+        "psrlq $8, %%mm0\n"
+        "movq %%mm0, %%mm2\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm0\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm2\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm0\n"
+        "psubw %%mm3, %%mm2\n"
+        "psubw %%mm0, %%mm4\n"
+        "psubw %%mm2, %%mm5\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm4, %%mm3\n\t"
+        "pcmpgtw %%mm5, %%mm1\n\t"
+        "pxor %%mm3, %%mm4\n"
+        "pxor %%mm1, %%mm5\n"
+        "psubw %%mm3, %%mm4\n"
+        "psubw %%mm1, %%mm5\n"
+        "paddw %%mm4, %%mm5\n"
+        "paddw %%mm5, %%mm6\n"
+
+        "add %2, %0\n"
+
+        "movq (%0), %%mm4\n"
+        "movq %%mm4, %%mm1\n"
+        "psllq $8, %%mm4\n"
+        "psrlq $8, %%mm1\n"
+        "psrlq $8, %%mm4\n"
+        "movq %%mm4, %%mm5\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm4\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm5\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm4\n"
+        "psubw %%mm3, %%mm5\n"
+        "psubw %%mm4, %%mm0\n"
+        "psubw %%mm5, %%mm2\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm0, %%mm3\n\t"
+        "pcmpgtw %%mm2, %%mm1\n\t"
+        "pxor %%mm3, %%mm0\n"
+        "pxor %%mm1, %%mm2\n"
+        "psubw %%mm3, %%mm0\n"
+        "psubw %%mm1, %%mm2\n"
+        "paddw %%mm0, %%mm2\n"
+        "paddw %%mm2, %%mm6\n"
+
+        "add %2, %0\n"
+        "subl $2, %%ecx\n"
+        " jnz 1b\n"
+
+        "movq %%mm6, %%mm0\n"
+        "punpcklwd %%mm7, %%mm0\n"
+        "punpckhwd %%mm7, %%mm6\n"
+        "paddd %%mm0, %%mm6\n"
+
+        "movq %%mm6, %%mm0\n"
+        "psrlq $32, %%mm6\n"
+        "paddd %%mm6, %%mm0\n"
+        "movd %%mm0, %1\n"
+        : "+r" (pix1), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "g" (h - 2)
+        : "%ecx");
+
+    return tmp;
+}
+
+static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
+{
+    int tmp;
+    uint8_t *pix = pix1;
+
+    __asm__ volatile (
+        "movl %3, %%ecx\n"
+        "pxor %%mm7, %%mm7\n"
+        "pxor %%mm6, %%mm6\n"
+
+        "movq (%0), %%mm0\n"
+        "movq 1(%0), %%mm1\n"
+        "movq %%mm0, %%mm2\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm0\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm2\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm0\n"
+        "psubw %%mm3, %%mm2\n"
+
+        "add %2, %0\n"
+
+        "movq (%0), %%mm4\n"
+        "movq 1(%0), %%mm1\n"
+        "movq %%mm4, %%mm5\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm4\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm5\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm4\n"
+        "psubw %%mm3, %%mm5\n"
+        "psubw %%mm4, %%mm0\n"
+        "psubw %%mm5, %%mm2\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm0, %%mm3\n\t"
+        "pcmpgtw %%mm2, %%mm1\n\t"
+        "pxor %%mm3, %%mm0\n"
+        "pxor %%mm1, %%mm2\n"
+        "psubw %%mm3, %%mm0\n"
+        "psubw %%mm1, %%mm2\n"
+        "paddw %%mm0, %%mm2\n"
+        "paddw %%mm2, %%mm6\n"
+
+        "add %2, %0\n"
+        "1:\n"
+
+        "movq (%0), %%mm0\n"
+        "movq 1(%0), %%mm1\n"
+        "movq %%mm0, %%mm2\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm0\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm2\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm0\n"
+        "psubw %%mm3, %%mm2\n"
+        "psubw %%mm0, %%mm4\n"
+        "psubw %%mm2, %%mm5\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm4, %%mm3\n\t"
+        "pcmpgtw %%mm5, %%mm1\n\t"
+        "pxor %%mm3, %%mm4\n"
+        "pxor %%mm1, %%mm5\n"
+        "psubw %%mm3, %%mm4\n"
+        "psubw %%mm1, %%mm5\n"
+        "paddw %%mm4, %%mm5\n"
+        "paddw %%mm5, %%mm6\n"
+
+        "add %2, %0\n"
+
+        "movq (%0), %%mm4\n"
+        "movq 1(%0), %%mm1\n"
+        "movq %%mm4, %%mm5\n"
+        "movq %%mm1, %%mm3\n"
+        "punpcklbw %%mm7, %%mm4\n"
+        "punpcklbw %%mm7, %%mm1\n"
+        "punpckhbw %%mm7, %%mm5\n"
+        "punpckhbw %%mm7, %%mm3\n"
+        "psubw %%mm1, %%mm4\n"
+        "psubw %%mm3, %%mm5\n"
+        "psubw %%mm4, %%mm0\n"
+        "psubw %%mm5, %%mm2\n"
+        "pxor %%mm3, %%mm3\n"
+        "pxor %%mm1, %%mm1\n"
+        "pcmpgtw %%mm0, %%mm3\n\t"
+        "pcmpgtw %%mm2, %%mm1\n\t"
+        "pxor %%mm3, %%mm0\n"
+        "pxor %%mm1, %%mm2\n"
+        "psubw %%mm3, %%mm0\n"
+        "psubw %%mm1, %%mm2\n"
+        "paddw %%mm0, %%mm2\n"
+        "paddw %%mm2, %%mm6\n"
+
+        "add %2, %0\n"
+        "subl $2, %%ecx\n"
+        " jnz 1b\n"
+
+        "movq %%mm6, %%mm0\n"
+        "punpcklwd %%mm7, %%mm0\n"
+        "punpckhwd %%mm7, %%mm6\n"
+        "paddd %%mm0, %%mm6\n"
+
+        "movq %%mm6, %%mm0\n"
+        "psrlq $32, %%mm6\n"
+        "paddd %%mm6, %%mm0\n"
+        "movd %%mm0, %1\n"
+        : "+r" (pix1), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "g" (h - 2)
+        : "%ecx");
+
+    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
+}
+
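What these two kernels measure is how much the horizontal gradient changes from one row to the next, i.e. high-frequency noise. A rough scalar model follows (the MMX code iterates h - 2 rows with alternating accumulators and slightly different edge handling, so treat this as a sketch of the metric, not a bit-exact reference):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute second-order differences: the horizontal gradient
     * of each row, then the vertical change of that gradient. */
    static int hf_noise_ref(const uint8_t *pix, int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < w - 1; x++) {
                int d0 = pix[x] - pix[x + 1];
                int d1 = pix[x + line_size] - pix[x + line_size + 1];
                sum += abs(d0 - d1);
            }
            pix += line_size;
        }
        return sum;
    }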
+static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
+                      int line_size, int h)
+{
+    int score1, score2;
+
+    if (c)
+        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
+    else
+        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
+    score2 = hf_noise16_mmx(pix1, line_size, h) -
+             hf_noise16_mmx(pix2, line_size, h);
+
+    if (c)
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
+    else
+        return score1 + FFABS(score2) * 8;
+}
+
+static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
+                     int line_size, int h)
+{
+    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
+    int score2 = hf_noise8_mmx(pix1, line_size, h) -
+                 hf_noise8_mmx(pix2, line_size, h);
+
+    if (c)
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
+    else
+        return score1 + FFABS(score2) * 8;
+}
+
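Combined, the noise-preserving comparison reduces to (schematically):

    nsse(pix1, pix2) = sse(pix1, pix2)
                     + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|

so a candidate block is penalized when it adds or removes high-frequency texture even if its plain SSE is low; the fallback weight of 8 is used when no codec context is available.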
+static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
+                            int line_size, int h)
+{
+    int tmp;
+
+    assert((((int) pix) & 7) == 0);
+    assert((line_size & 7) == 0);
+
+#define SUM(in0, in1, out0, out1) \
+    "movq (%0), %%mm2\n" \
+    "movq 8(%0), %%mm3\n" \
+    "add %2,%0\n" \
+    "movq %%mm2, " #out0 "\n" \
+    "movq %%mm3, " #out1 "\n" \
+    "psubusb " #in0 ", %%mm2\n" \
+    "psubusb " #in1 ", %%mm3\n" \
+    "psubusb " #out0 ", " #in0 "\n" \
+    "psubusb " #out1 ", " #in1 "\n" \
+    "por %%mm2, " #in0 "\n" \
+    "por %%mm3, " #in1 "\n" \
+    "movq " #in0 ", %%mm2\n" \
+    "movq " #in1 ", %%mm3\n" \
+    "punpcklbw %%mm7, " #in0 "\n" \
+    "punpcklbw %%mm7, " #in1 "\n" \
+    "punpckhbw %%mm7, %%mm2\n" \
+    "punpckhbw %%mm7, %%mm3\n" \
+    "paddw " #in1 ", " #in0 "\n" \
+    "paddw %%mm3, %%mm2\n" \
+    "paddw %%mm2, " #in0 "\n" \
+    "paddw " #in0 ", %%mm6\n"
+
+
+    __asm__ volatile (
+        "movl %3, %%ecx\n"
+        "pxor %%mm6, %%mm6\n"
+        "pxor %%mm7, %%mm7\n"
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "add %2, %0\n"
+        "jmp 2f\n"
+        "1:\n"
+
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+        "2:\n"
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+        "subl $2, %%ecx\n"
+        "jnz 1b\n"
+
+        "movq %%mm6, %%mm0\n"
+        "psrlq $32, %%mm6\n"
+        "paddw %%mm6, %%mm0\n"
+        "movq %%mm0, %%mm6\n"
+        "psrlq $16, %%mm0\n"
+        "paddw %%mm6, %%mm0\n"
+        "movd %%mm0, %1\n"
+        : "+r" (pix), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp & 0xFFFF;
+}
+#undef SUM
+
+static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
+                               int line_size, int h)
+{
+    int tmp;
+
+    assert((((int) pix) & 7) == 0);
+    assert((line_size & 7) == 0);
+
+#define SUM(in0, in1, out0, out1) \
+    "movq (%0), " #out0 "\n" \
+    "movq 8(%0), " #out1 "\n" \
+    "add %2, %0\n" \
+    "psadbw " #out0 ", " #in0 "\n" \
+    "psadbw " #out1 ", " #in1 "\n" \
+    "paddw " #in1 ", " #in0 "\n" \
+    "paddw " #in0 ", %%mm6\n"
+
+    __asm__ volatile (
+        "movl %3, %%ecx\n"
+        "pxor %%mm6, %%mm6\n"
+        "pxor %%mm7, %%mm7\n"
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "add %2, %0\n"
+        "jmp 2f\n"
+        "1:\n"
+
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+        "2:\n"
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+        "subl $2, %%ecx\n"
+        "jnz 1b\n"
+
+        "movd %%mm6, %1\n"
+        : "+r" (pix), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+#undef SUM
+
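The pair above shows the payoff of MMXEXT's psadbw, which produces the sum of absolute differences of eight byte pairs in a single instruction and collapses the unpack/accumulate dance of the plain MMX version. A hedged scalar model of the intra variant (illustrative name and bounds; the asm's loop counting differs slightly at the edges):

    #include <stdint.h>
    #include <stdlib.h>

    /* SAD between each row and the row below it -- a cheap measure of
     * vertical activity within a single block. */
    static int vsad_intra_ref(const uint8_t *pix, int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < w; x++)
                sum += abs(pix[x] - pix[x + line_size]);
            pix += line_size;
        }
        return sum;
    }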
+static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      int line_size, int h)
+{
+    int tmp;
+
+    assert((((int) pix1) & 7) == 0);
+    assert((((int) pix2) & 7) == 0);
+    assert((line_size & 7) == 0);
+
+#define SUM(in0, in1, out0, out1) \
+    "movq (%0), %%mm2\n" \
+    "movq (%1), " #out0 "\n" \
+    "movq 8(%0), %%mm3\n" \
+    "movq 8(%1), " #out1 "\n" \
+    "add %3, %0\n" \
+    "add %3, %1\n" \
+    "psubb " #out0 ", %%mm2\n" \
+    "psubb " #out1 ", %%mm3\n" \
+    "pxor %%mm7, %%mm2\n" \
+    "pxor %%mm7, %%mm3\n" \
+    "movq %%mm2, " #out0 "\n" \
+    "movq %%mm3, " #out1 "\n" \
+    "psubusb " #in0 ", %%mm2\n" \
+    "psubusb " #in1 ", %%mm3\n" \
+    "psubusb " #out0 ", " #in0 "\n" \
+    "psubusb " #out1 ", " #in1 "\n" \
+    "por %%mm2, " #in0 "\n" \
+    "por %%mm3, " #in1 "\n" \
+    "movq " #in0 ", %%mm2\n" \
+    "movq " #in1 ", %%mm3\n" \
+    "punpcklbw %%mm7, " #in0 "\n" \
+    "punpcklbw %%mm7, " #in1 "\n" \
+    "punpckhbw %%mm7, %%mm2\n" \
+    "punpckhbw %%mm7, %%mm3\n" \
+    "paddw " #in1 ", " #in0 "\n" \
+    "paddw %%mm3, %%mm2\n" \
+    "paddw %%mm2, " #in0 "\n" \
+    "paddw " #in0 ", %%mm6\n"
+
+
+    __asm__ volatile (
+        "movl %4, %%ecx\n"
+        "pxor %%mm6, %%mm6\n"
+        "pcmpeqw %%mm7, %%mm7\n"
+        "psllw $15, %%mm7\n"
+        "packsswb %%mm7, %%mm7\n"
+        "movq (%0), %%mm0\n"
+        "movq (%1), %%mm2\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 8(%1), %%mm3\n"
+        "add %3, %0\n"
+        "add %3, %1\n"
+        "psubb %%mm2, %%mm0\n"
+        "psubb %%mm3, %%mm1\n"
+        "pxor %%mm7, %%mm0\n"
+        "pxor %%mm7, %%mm1\n"
+        "jmp 2f\n"
+        "1:\n"
+
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+        "2:\n"
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+        "subl $2, %%ecx\n"
+        "jnz 1b\n"
+
+        "movq %%mm6, %%mm0\n"
+        "psrlq $32, %%mm6\n"
+        "paddw %%mm6, %%mm0\n"
+        "movq %%mm0, %%mm6\n"
+        "psrlq $16, %%mm0\n"
+        "paddw %%mm6, %%mm0\n"
+        "movd %%mm0, %2\n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp & 0x7FFF;
+}
+#undef SUM
+
+static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         int line_size, int h)
+{
+    int tmp;
+
+    assert((((int) pix1) & 7) == 0);
+    assert((((int) pix2) & 7) == 0);
+    assert((line_size & 7) == 0);
+
+#define SUM(in0, in1, out0, out1) \
+    "movq (%0), " #out0 "\n" \
+    "movq (%1), %%mm2\n" \
+    "movq 8(%0), " #out1 "\n" \
+    "movq 8(%1), %%mm3\n" \
+    "add %3, %0\n" \
+    "add %3, %1\n" \
+    "psubb %%mm2, " #out0 "\n" \
+    "psubb %%mm3, " #out1 "\n" \
+    "pxor %%mm7, " #out0 "\n" \
+    "pxor %%mm7, " #out1 "\n" \
+    "psadbw " #out0 ", " #in0 "\n" \
+    "psadbw " #out1 ", " #in1 "\n" \
+    "paddw " #in1 ", " #in0 "\n" \
+    "paddw " #in0 ", %%mm6\n"
+
+    __asm__ volatile (
+        "movl %4, %%ecx\n"
+        "pxor %%mm6, %%mm6\n"
+        "pcmpeqw %%mm7, %%mm7\n"
+        "psllw $15, %%mm7\n"
+        "packsswb %%mm7, %%mm7\n"
+        "movq (%0), %%mm0\n"
+        "movq (%1), %%mm2\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 8(%1), %%mm3\n"
+        "add %3, %0\n"
+        "add %3, %1\n"
+        "psubb %%mm2, %%mm0\n"
+        "psubb %%mm3, %%mm1\n"
+        "pxor %%mm7, %%mm0\n"
+        "pxor %%mm7, %%mm1\n"
+        "jmp 2f\n"
+        "1:\n"
+
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+        "2:\n"
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+        "subl $2, %%ecx\n"
+        "jnz 1b\n"
+
+        "movd %%mm6, %2\n"
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
+        : "r" ((x86_reg) line_size), "m" (h)
+        : "%ecx");
+
+    return tmp;
+}
+#undef SUM
+
+#define MMABS_MMX(a,z) \
+    "pxor " #z ", " #z " \n\t" \
+    "pcmpgtw " #a ", " #z " \n\t" \
+    "pxor " #z ", " #a " \n\t" \
+    "psubw " #z ", " #a " \n\t"
+
+#define MMABS_MMXEXT(a, z) \
+    "pxor " #z ", " #z " \n\t" \
+    "psubw " #a ", " #z " \n\t" \
+    "pmaxsw " #z ", " #a " \n\t"
+
+#define MMABS_SSSE3(a,z) \
+    "pabsw " #a ", " #a " \n\t"
+
+#define MMABS_SUM(a,z, sum) \
+    MMABS(a,z) \
+    "paddusw " #a ", " #sum " \n\t"
+
+/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
+ * up to about 100k on extreme inputs. But that's very unlikely to occur in
+ * natural video, and it's even more unlikely to not have any alternative
+ * mvs/modes with lower cost. */
+#define HSUM_MMX(a, t, dst) \
+    "movq " #a ", " #t " \n\t" \
+    "psrlq $32, " #a " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "movq " #a ", " #t " \n\t" \
+    "psrlq $16, " #a " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "movd " #a ", " #dst " \n\t" \
+
+#define HSUM_MMXEXT(a, t, dst) \
+    "pshufw $0x0E, " #a ", " #t " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "pshufw $0x01, " #a ", " #t " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "movd " #a ", " #dst " \n\t" \
+
+#define HSUM_SSE2(a, t, dst) \
+    "movhlps " #a ", " #t " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "pshuflw $0x0E, " #a ", " #t " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "pshuflw $0x01, " #a ", " #t " \n\t" \
+    "paddusw " #t ", " #a " \n\t" \
+    "movd " #a ", " #dst " \n\t" \
+
+#define DCT_SAD4(m, mm, o) \
+    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
+    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
+    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
+    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
+    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
+    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
+    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
+    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
+
+#define DCT_SAD_MMX \
+    "pxor %%mm0, %%mm0 \n\t" \
+    "pxor %%mm1, %%mm1 \n\t" \
+    DCT_SAD4(q, %%mm, 0) \
+    DCT_SAD4(q, %%mm, 8) \
+    DCT_SAD4(q, %%mm, 64) \
+    DCT_SAD4(q, %%mm, 72) \
+    "paddusw %%mm1, %%mm0 \n\t" \
+    HSUM(%%mm0, %%mm1, %0)
+
+#define DCT_SAD_SSE2 \
+    "pxor %%xmm0, %%xmm0 \n\t" \
+    "pxor %%xmm1, %%xmm1 \n\t" \
+    DCT_SAD4(dqa, %%xmm, 0) \
+    DCT_SAD4(dqa, %%xmm, 64) \
+    "paddusw %%xmm1, %%xmm0 \n\t" \
+    HSUM(%%xmm0, %%xmm1, %0)
+
+#define DCT_SAD_FUNC(cpu) \
+static int sum_abs_dctelem_ ## cpu(int16_t *block) \
+{ \
+    int sum; \
+    __asm__ volatile ( \
+        DCT_SAD \
+        :"=r"(sum) \
+        :"r"(block)); \
+    return sum & 0xFFFF; \
+}
+
+#define DCT_SAD        DCT_SAD_MMX
+#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
+#define MMABS(a, z)     MMABS_MMX(a, z)
+DCT_SAD_FUNC(mmx)
+#undef MMABS
+#undef HSUM
+
+#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
+#define MMABS(a, z)     MMABS_MMXEXT(a, z)
+DCT_SAD_FUNC(mmxext)
+#undef HSUM
+#undef DCT_SAD
+
+#define DCT_SAD        DCT_SAD_SSE2
+#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
+DCT_SAD_FUNC(sse2)
+#undef MMABS
+
+#if HAVE_SSSE3_INLINE
+#define MMABS(a, z) MMABS_SSSE3(a, z)
+DCT_SAD_FUNC(ssse3)
+#undef MMABS
+#endif
+#undef HSUM
+#undef DCT_SAD
+
+
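The sum_abs_dctelem_* family reduces an 8x8 block of transform coefficients to a single activity number. A scalar model (note the asm accumulates with saturating paddusw, so pathological inputs can clip where this version would not, as the FIXME above concedes):

    #include <stdint.h>
    #include <stdlib.h>

    /* Total absolute magnitude of 64 DCT coefficients,
     * truncated to 16 bits like the asm return path. */
    static int sum_abs_dctelem_ref(const int16_t *block)
    {
        int sum = 0;
        for (int i = 0; i < 64; i++)
            sum += abs(block[i]);
        return sum & 0xFFFF;
    }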
+DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
+    0x0000000000000000ULL,
+    0x0001000100010001ULL,
+    0x0002000200020002ULL,
+};
+
+DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
+
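These constants drive the rounded sub-pel averaging in the half-pel SAD variants below; in scalar terms (illustrative helper names):

    /* round_tab[1] biases the 2-tap average:  (a + b + 1) >> 1
     * round_tab[2] biases the 4-tap average:  (a + b + c + d + 2) >> 2 */
    static inline int avg2(int a, int b)
    {
        return (a + b + 1) >> 1;
    }

    static inline int avg4(int a, int b, int c, int d)
    {
        return (a + b + c + d + 2) >> 2;
    }

bone (a qword of 0x01 bytes) is used by sad8_4_mmxext below to cancel the extra round-up that stacking two pavgb averages would otherwise introduce, so the cascaded result still matches avg4().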
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+    x86_reg len = -(stride * h);
+    __asm__ volatile (
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1, %%"REG_a"), %%mm0 \n\t"
+        "movq (%2, %%"REG_a"), %%mm2 \n\t"
+        "movq (%2, %%"REG_a"), %%mm4 \n\t"
+        "add %3, %%"REG_a" \n\t"
+        "psubusb %%mm0, %%mm2 \n\t"
+        "psubusb %%mm4, %%mm0 \n\t"
+        "movq (%1, %%"REG_a"), %%mm1 \n\t"
+        "movq (%2, %%"REG_a"), %%mm3 \n\t"
+        "movq (%2, %%"REG_a"), %%mm5 \n\t"
+        "psubusb %%mm1, %%mm3 \n\t"
+        "psubusb %%mm5, %%mm1 \n\t"
+        "por %%mm2, %%mm0 \n\t"
+        "por %%mm1, %%mm3 \n\t"
+        "movq %%mm0, %%mm1 \n\t"
+        "movq %%mm3, %%mm2 \n\t"
+        "punpcklbw %%mm7, %%mm0 \n\t"
+        "punpckhbw %%mm7, %%mm1 \n\t"
+        "punpcklbw %%mm7, %%mm3 \n\t"
+        "punpckhbw %%mm7, %%mm2 \n\t"
+        "paddw %%mm1, %%mm0 \n\t"
+        "paddw %%mm3, %%mm2 \n\t"
+        "paddw %%mm2, %%mm0 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "add %3, %%"REG_a" \n\t"
+        " js 1b \n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
+}
+
+static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
+                                 int stride, int h)
+{
+    __asm__ volatile (
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq (%1, %3), %%mm1 \n\t"
+        "psadbw (%2), %%mm0 \n\t"
+        "psadbw (%2, %3), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "paddw %%mm1, %%mm6 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "lea (%2,%3,2), %2 \n\t"
+        "sub $2, %0 \n\t"
+        " jg 1b \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
+        : "r" ((x86_reg) stride));
+}
+
+static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+                      int stride, int h)
+{
+    int ret;
+    __asm__ volatile (
+        "pxor %%xmm2, %%xmm2 \n\t"
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movdqu (%1), %%xmm0 \n\t"
+        "movdqu (%1, %4), %%xmm1 \n\t"
+        "psadbw (%2), %%xmm0 \n\t"
+        "psadbw (%2, %4), %%xmm1 \n\t"
+        "paddw %%xmm0, %%xmm2 \n\t"
+        "paddw %%xmm1, %%xmm2 \n\t"
+        "lea (%1,%4,2), %1 \n\t"
+        "lea (%2,%4,2), %2 \n\t"
+        "sub $2, %0 \n\t"
+        " jg 1b \n\t"
+        "movhlps %%xmm2, %%xmm0 \n\t"
+        "paddw %%xmm0, %%xmm2 \n\t"
+        "movd %%xmm2, %3 \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
+        : "r" ((x86_reg) stride));
+    return ret;
+}
+
+static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
+                                   int stride, int h)
+{
+    __asm__ volatile (
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq (%1, %3), %%mm1 \n\t"
+        "pavgb 1(%1), %%mm0 \n\t"
+        "pavgb 1(%1, %3), %%mm1 \n\t"
+        "psadbw (%2), %%mm0 \n\t"
+        "psadbw (%2, %3), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "paddw %%mm1, %%mm6 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "lea (%2,%3,2), %2 \n\t"
+        "sub $2, %0 \n\t"
+        " jg 1b \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
+        : "r" ((x86_reg) stride));
+}
+
+static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
+                                   int stride, int h)
+{
+    __asm__ volatile (
+        "movq (%1), %%mm0 \n\t"
+        "add %3, %1 \n\t"
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "movq (%1, %3), %%mm2 \n\t"
+        "pavgb %%mm1, %%mm0 \n\t"
+        "pavgb %%mm2, %%mm1 \n\t"
+        "psadbw (%2), %%mm0 \n\t"
+        "psadbw (%2, %3), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "paddw %%mm1, %%mm6 \n\t"
+        "movq %%mm2, %%mm0 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "lea (%2,%3,2), %2 \n\t"
+        "sub $2, %0 \n\t"
+        " jg 1b \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
+        : "r" ((x86_reg) stride));
+}
+
+static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
+                                 int stride, int h)
+{
+    __asm__ volatile (
+        "movq "MANGLE(bone)", %%mm5 \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "pavgb 1(%1), %%mm0 \n\t"
+        "add %3, %1 \n\t"
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "movq (%1,%3), %%mm2 \n\t"
+        "pavgb 1(%1), %%mm1 \n\t"
+        "pavgb 1(%1,%3), %%mm2 \n\t"
+        "psubusb %%mm5, %%mm1 \n\t"
+        "pavgb %%mm1, %%mm0 \n\t"
+        "pavgb %%mm2, %%mm1 \n\t"
+        "psadbw (%2), %%mm0 \n\t"
+        "psadbw (%2,%3), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "paddw %%mm1, %%mm6 \n\t"
+        "movq %%mm2, %%mm0 \n\t"
+        "lea (%1,%3,2), %1 \n\t"
+        "lea (%2,%3,2), %2 \n\t"
+        "sub $2, %0 \n\t"
+        " jg 1b \n\t"
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
+        : "r" ((x86_reg) stride));
+}
+
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
+                              int stride, int h)
+{
+    x86_reg len = -(stride * h);
+    __asm__ volatile (
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%1, %%"REG_a"), %%mm0 \n\t"
+        "movq (%2, %%"REG_a"), %%mm1 \n\t"
+        "movq (%1, %%"REG_a"), %%mm2 \n\t"
+        "movq (%2, %%"REG_a"), %%mm3 \n\t"
+        "punpcklbw %%mm7, %%mm0 \n\t"
+        "punpcklbw %%mm7, %%mm1 \n\t"
+        "punpckhbw %%mm7, %%mm2 \n\t"
+        "punpckhbw %%mm7, %%mm3 \n\t"
+        "paddw %%mm0, %%mm1 \n\t"
+        "paddw %%mm2, %%mm3 \n\t"
+        "movq (%3, %%"REG_a"), %%mm4 \n\t"
+        "movq (%3, %%"REG_a"), %%mm2 \n\t"
+        "paddw %%mm5, %%mm1 \n\t"
+        "paddw %%mm5, %%mm3 \n\t"
+        "psrlw $1, %%mm1 \n\t"
+        "psrlw $1, %%mm3 \n\t"
+        "packuswb %%mm3, %%mm1 \n\t"
+        "psubusb %%mm1, %%mm4 \n\t"
+        "psubusb %%mm2, %%mm1 \n\t"
+        "por %%mm4, %%mm1 \n\t"
+        "movq %%mm1, %%mm0 \n\t"
+        "punpcklbw %%mm7, %%mm0 \n\t"
+        "punpckhbw %%mm7, %%mm1 \n\t"
+        "paddw %%mm1, %%mm0 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "add %4, %%"REG_a" \n\t"
+        " js 1b \n\t"
+        : "+a" (len)
+        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
+          "r" ((x86_reg) stride));
+}
+
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+    x86_reg len = -(stride * h);
+    __asm__ volatile (
+        "movq (%1, %%"REG_a"), %%mm0 \n\t"
+        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+        "movq %%mm0, %%mm1 \n\t"
+        "movq %%mm2, %%mm3 \n\t"
+        "punpcklbw %%mm7, %%mm0 \n\t"
+        "punpckhbw %%mm7, %%mm1 \n\t"
+        "punpcklbw %%mm7, %%mm2 \n\t"
+        "punpckhbw %%mm7, %%mm3 \n\t"
+        "paddw %%mm2, %%mm0 \n\t"
+        "paddw %%mm3, %%mm1 \n\t"
+        ".p2align 4 \n\t"
+        "1: \n\t"
+        "movq (%2, %%"REG_a"), %%mm2 \n\t"
+        "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
+        "movq %%mm2, %%mm3 \n\t"
+        "movq %%mm4, %%mm5 \n\t"
+        "punpcklbw %%mm7, %%mm2 \n\t"
+        "punpckhbw %%mm7, %%mm3 \n\t"
+        "punpcklbw %%mm7, %%mm4 \n\t"
+        "punpckhbw %%mm7, %%mm5 \n\t"
+        "paddw %%mm4, %%mm2 \n\t"
+        "paddw %%mm5, %%mm3 \n\t"
+        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+        "paddw %%mm2, %%mm0 \n\t"
+        "paddw %%mm3, %%mm1 \n\t"
+        "paddw %%mm5, %%mm0 \n\t"
+        "paddw %%mm5, %%mm1 \n\t"
+        "movq (%3, %%"REG_a"), %%mm4 \n\t"
+        "movq (%3, %%"REG_a"), %%mm5 \n\t"
+        "psrlw $2, %%mm0 \n\t"
+        "psrlw $2, %%mm1 \n\t"
+        "packuswb %%mm1, %%mm0 \n\t"
+        "psubusb %%mm0, %%mm4 \n\t"
+        "psubusb %%mm5, %%mm0 \n\t"
+        "por %%mm4, %%mm0 \n\t"
+        "movq %%mm0, %%mm4 \n\t"
+        "punpcklbw %%mm7, %%mm0 \n\t"
+        "punpckhbw %%mm7, %%mm4 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "paddw %%mm4, %%mm6 \n\t"
+        "movq %%mm2, %%mm0 \n\t"
+        "movq %%mm3, %%mm1 \n\t"
+        "add %4, %%"REG_a" \n\t"
+        " js 1b \n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
+          "r" ((x86_reg) stride));
+}
+
+static inline int sum_mmx(void)
+{
+    int ret;
+    __asm__ volatile (
+        "movq %%mm6, %%mm0 \n\t"
+        "psrlq $32, %%mm6 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "movq %%mm6, %%mm0 \n\t"
+        "psrlq $16, %%mm6 \n\t"
+        "paddw %%mm0, %%mm6 \n\t"
+        "movd %%mm6, %0 \n\t"
+        : "=r" (ret));
+    return ret & 0xFFFF;
+}
+
+static inline int sum_mmxext(void)
+{
+    int ret;
+    __asm__ volatile (
+        "movd %%mm6, %0 \n\t"
+        : "=r" (ret));
+    return ret;
+}
+
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
+}
+
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
+}
+
+#define PIX_SAD(suf) \
+static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                        uint8_t *blk1, int stride, int h) \
+{ \
+    assert(h == 8); \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        :); \
+ \
+    sad8_1_ ## suf(blk1, blk2, stride, 8); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                           uint8_t *blk1, int stride, int h) \
+{ \
+    assert(h == 8); \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        "movq %0, %%mm5 \n\t" \
+        :: "m" (round_tab[1])); \
+ \
+    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                           uint8_t *blk1, int stride, int h) \
+{ \
+    assert(h == 8); \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        "movq %0, %%mm5 \n\t" \
+        :: "m" (round_tab[1])); \
+ \
+    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                            uint8_t *blk1, int stride, int h) \
+{ \
+    assert(h == 8); \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        ::); \
+ \
+    sad8_4_ ## suf(blk1, blk2, stride, 8); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                         uint8_t *blk1, int stride, int h) \
+{ \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        :); \
+ \
+    sad8_1_ ## suf(blk1, blk2, stride, h); \
+    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                            uint8_t *blk1, int stride, int h) \
+{ \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        "movq %0, %%mm5 \n\t" \
+        :: "m" (round_tab[1])); \
+ \
+    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
+    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                            uint8_t *blk1, int stride, int h) \
+{ \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        "movq %0, %%mm5 \n\t" \
+        :: "m" (round_tab[1])); \
+ \
+    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
+    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
+ \
+    return sum_ ## suf(); \
+} \
+ \
+static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
+                             uint8_t *blk1, int stride, int h) \
+{ \
+    __asm__ volatile ( \
+        "pxor %%mm7, %%mm7 \n\t" \
+        "pxor %%mm6, %%mm6 \n\t" \
+        ::); \
+ \
+    sad8_4_ ## suf(blk1, blk2, stride, h); \
+    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
+ \
+    return sum_ ## suf(); \
+} \
+
+PIX_SAD(mmx)
+PIX_SAD(mmxext)
+
+#endif /* HAVE_INLINE_ASM */
+
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  int line_size, int h);
+
+#define hadamard_func(cpu) \
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+                                  uint8_t *src2, int stride, int h); \
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
+                                    uint8_t *src2, int stride, int h);
+
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
+
+av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM
+    if (INLINE_MMX(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
+
+        c->pix_abs[0][0] = sad16_mmx;
+        c->pix_abs[0][1] = sad16_x2_mmx;
+        c->pix_abs[0][2] = sad16_y2_mmx;
+        c->pix_abs[0][3] = sad16_xy2_mmx;
+        c->pix_abs[1][0] = sad8_mmx;
+        c->pix_abs[1][1] = sad8_x2_mmx;
+        c->pix_abs[1][2] = sad8_y2_mmx;
+        c->pix_abs[1][3] = sad8_xy2_mmx;
+
+        c->sad[0] = sad16_mmx;
+        c->sad[1] = sad8_mmx;
+
+        c->sse[0]  = sse16_mmx;
+        c->sse[1]  = sse8_mmx;
+        c->vsad[4] = vsad_intra16_mmx;
+
+        c->nsse[0] = nsse16_mmx;
+        c->nsse[1] = nsse8_mmx;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->vsad[0] = vsad16_mmx;
+        }
+    }
+
+    if (INLINE_MMXEXT(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
+
+        c->vsad[4] = vsad_intra16_mmxext;
+
+        c->pix_abs[0][0] = sad16_mmxext;
+        c->pix_abs[1][0] = sad8_mmxext;
+
+        c->sad[0] = sad16_mmxext;
+        c->sad[1] = sad8_mmxext;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][1] = sad16_x2_mmxext;
+            c->pix_abs[0][2] = sad16_y2_mmxext;
+            c->pix_abs[0][3] = sad16_xy2_mmxext;
+            c->pix_abs[1][1] = sad8_x2_mmxext;
+            c->pix_abs[1][2] = sad8_y2_mmxext;
+            c->pix_abs[1][3] = sad8_xy2_mmxext;
+
+            c->vsad[0] = vsad16_mmxext;
+        }
+    }
+
+    if (INLINE_SSE2(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
+    }
+
+    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
+        c->sad[0] = sad16_sse2;
+    }
+
+#if HAVE_SSSE3_INLINE
+    if (INLINE_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
+    }
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->sse[0] = ff_sse16_sse2;
+
+#if HAVE_ALIGNED_STACK
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
+#endif
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+    }
+}
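A minimal sketch of how the table built here ends up being used; it assumes the generic ff_me_cmp_init() from this patch's me_cmp.c, which installs the C implementations and then lets this function override them per CPU:

    MECmpContext mecc;
    ff_me_cmp_init(&mecc, avctx);                     /* C defaults, then x86 overrides */
    int cost = mecc.sad[0](s, ref, cur, stride, 16);  /* 16x16 SAD, best available flavor */

Because every comparator shares one signature, the motion estimator only ever holds function pointers and never needs to know which instruction set it ended up with.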