* commit '2d60444331fca1910510038dd3817bea885c2367':
dsputil: Split motion estimation compare bits off into their own context
Conflicts:
configure
libavcodec/Makefile
libavcodec/arm/Makefile
libavcodec/dvenc.c
libavcodec/error_resilience.c
libavcodec/h264.h
libavcodec/h264_slice.c
libavcodec/me_cmp.c
libavcodec/me_cmp.h
libavcodec/motion_est.c
libavcodec/motion_est_template.c
libavcodec/mpeg4videoenc.c
libavcodec/mpegvideo.c
libavcodec/mpegvideo_enc.c
libavcodec/x86/Makefile
libavcodec/x86/me_cmp_init.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -1804,7 +1804,6 @@ CONFIG_EXTRA=" |
1804 | 1804 |
blockdsp |
1805 | 1805 |
bswapdsp |
1806 | 1806 |
cabac |
1807 |
- dsputil |
|
1808 | 1807 |
dvprofile |
1809 | 1808 |
exif |
1810 | 1809 |
fdctdsp |
... | ... |
@@ -1827,6 +1826,7 @@ CONFIG_EXTRA=" |
1827 | 1827 |
llauddsp |
1828 | 1828 |
llviddsp |
1829 | 1829 |
lpc |
1830 |
+ me_cmp |
|
1830 | 1831 |
mpeg_er |
1831 | 1832 |
mpegaudio |
1832 | 1833 |
mpegaudiodsp |
... | ... |
@@ -2002,17 +2002,17 @@ threads_if_any="$THREADS_LIST" |
2002 | 2002 |
|
2003 | 2003 |
# subsystems |
2004 | 2004 |
dct_select="rdft" |
2005 |
-dsputil_select="fdctdsp idctdsp pixblockdsp" |
|
2006 |
-error_resilience_select="dsputil" |
|
2005 |
+error_resilience_select="me_cmp" |
|
2007 | 2006 |
frame_thread_encoder_deps="encoders threads" |
2008 | 2007 |
intrax8_select="error_resilience" |
2009 | 2008 |
mdct_select="fft" |
2010 | 2009 |
rdft_select="fft" |
2010 |
+me_cmp_select="fdctdsp idctdsp pixblockdsp" |
|
2011 | 2011 |
mpeg_er_select="error_resilience" |
2012 | 2012 |
mpegaudio_select="mpegaudiodsp" |
2013 | 2013 |
mpegaudiodsp_select="dct" |
2014 |
-mpegvideo_select="blockdsp dsputil h264chroma hpeldsp idctdsp videodsp" |
|
2015 |
-mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp" |
|
2014 |
+mpegvideo_select="blockdsp h264chroma hpeldsp idctdsp me_cmp videodsp" |
|
2015 |
+mpegvideoenc_select="me_cmp mpegvideo pixblockdsp qpeldsp" |
|
2016 | 2016 |
|
2017 | 2017 |
# decoders / encoders |
2018 | 2018 |
aac_decoder_select="mdct sinewin" |
... | ... |
@@ -2020,8 +2020,8 @@ aac_encoder_select="audio_frame_queue mdct sinewin" |
2020 | 2020 |
aac_latm_decoder_select="aac_decoder aac_latm_parser" |
2021 | 2021 |
ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct" |
2022 | 2022 |
ac3_fixed_decoder_select="ac3_parser ac3dsp bswapdsp mdct" |
2023 |
-ac3_encoder_select="ac3dsp audiodsp dsputil mdct" |
|
2024 |
-ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct" |
|
2023 |
+ac3_encoder_select="ac3dsp audiodsp mdct me_cmp" |
|
2024 |
+ac3_fixed_encoder_select="ac3dsp audiodsp mdct me_cmp" |
|
2025 | 2025 |
aic_decoder_select="golomb idctdsp" |
2026 | 2026 |
alac_encoder_select="lpc" |
2027 | 2027 |
als_decoder_select="bswapdsp" |
... | ... |
@@ -2048,11 +2048,11 @@ cook_decoder_select="audiodsp mdct sinewin" |
2048 | 2048 |
cscd_decoder_select="lzo" |
2049 | 2049 |
cscd_decoder_suggest="zlib" |
2050 | 2050 |
dca_decoder_select="mdct" |
2051 |
-dirac_decoder_select="dsputil dwt golomb videodsp mpegvideoenc" |
|
2051 |
+dirac_decoder_select="dwt golomb videodsp mpegvideoenc" |
|
2052 | 2052 |
dnxhd_decoder_select="blockdsp idctdsp" |
2053 | 2053 |
dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp" |
2054 | 2054 |
dvvideo_decoder_select="dvprofile idctdsp" |
2055 |
-dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp" |
|
2055 |
+dvvideo_encoder_select="dvprofile fdctdsp me_cmp pixblockdsp" |
|
2056 | 2056 |
dxa_decoder_select="zlib" |
2057 | 2057 |
eac3_decoder_select="ac3_decoder" |
2058 | 2058 |
eac3_encoder_select="ac3_encoder" |
... | ... |
@@ -2147,8 +2147,8 @@ qdm2_decoder_select="mdct rdft mpegaudiodsp" |
2147 | 2147 |
ra_144_encoder_select="audio_frame_queue lpc audiodsp" |
2148 | 2148 |
ra_144_decoder_select="audiodsp" |
2149 | 2149 |
ralf_decoder_select="golomb" |
2150 |
-rawvideo_decoder_select="dsputil bswapdsp" |
|
2151 |
-rtjpeg_decoder_select="dsputil" |
|
2150 |
+rawvideo_decoder_select="bswapdsp" |
|
2151 |
+rtjpeg_decoder_select="me_cmp" |
|
2152 | 2152 |
rv10_decoder_select="error_resilience h263_decoder h263dsp mpeg_er" |
2153 | 2153 |
rv10_encoder_select="h263_encoder" |
2154 | 2154 |
rv20_decoder_select="error_resilience h263_decoder h263dsp mpeg_er" |
... | ... |
@@ -2157,14 +2157,14 @@ rv30_decoder_select="error_resilience golomb h264chroma h264pred h264qpel mpeg_e |
2157 | 2157 |
rv40_decoder_select="error_resilience golomb h264chroma h264pred h264qpel mpeg_er mpegvideo videodsp" |
2158 | 2158 |
shorten_decoder_select="golomb" |
2159 | 2159 |
sipr_decoder_select="lsp" |
2160 |
-snow_decoder_select="dsputil dwt h264qpel hpeldsp rangecoder" |
|
2161 |
-snow_encoder_select="aandcttables dsputil dwt h264qpel hpeldsp mpegvideoenc rangecoder" |
|
2160 |
+snow_decoder_select="dwt h264qpel hpeldsp rangecoder" |
|
2161 |
+snow_encoder_select="aandcttables dwt h264qpel hpeldsp me_cmp mpegvideoenc rangecoder" |
|
2162 | 2162 |
sonic_decoder_select="golomb rangecoder" |
2163 | 2163 |
sonic_encoder_select="golomb rangecoder" |
2164 | 2164 |
sonic_ls_encoder_select="golomb rangecoder" |
2165 | 2165 |
sp5x_decoder_select="mjpeg_decoder" |
2166 | 2166 |
svq1_decoder_select="hpeldsp" |
2167 |
-svq1_encoder_select="aandcttables dsputil hpeldsp mpegvideoenc" |
|
2167 |
+svq1_encoder_select="aandcttables hpeldsp me_cmp mpegvideoenc" |
|
2168 | 2168 |
svq3_decoder_select="h264_decoder hpeldsp tpeldsp" |
2169 | 2169 |
svq3_decoder_suggest="zlib" |
2170 | 2170 |
tak_decoder_select="audiodsp" |
... | ... |
@@ -2517,7 +2517,7 @@ dctdnoiz_filter_deps="avcodec" |
2517 | 2517 |
dctdnoiz_filter_select="dct" |
2518 | 2518 |
delogo_filter_deps="gpl" |
2519 | 2519 |
deshake_filter_deps="avcodec" |
2520 |
-deshake_filter_select="dsputil" |
|
2520 |
+deshake_filter_select="me_cmp" |
|
2521 | 2521 |
drawtext_filter_deps="libfreetype" |
2522 | 2522 |
ebur128_filter_deps="gpl" |
2523 | 2523 |
flite_filter_deps="libflite" |
... | ... |
@@ -2536,7 +2536,7 @@ mcdeint_filter_deps="avcodec gpl" |
2536 | 2536 |
movie_filter_deps="avcodec avformat" |
2537 | 2537 |
mp_filter_deps="gpl avcodec swscale inline_asm" |
2538 | 2538 |
mpdecimate_filter_deps="gpl avcodec" |
2539 |
-mpdecimate_filter_select="dsputil pixblockdsp" |
|
2539 |
+mpdecimate_filter_select="me_cmp pixblockdsp" |
|
2540 | 2540 |
mptestsrc_filter_deps="gpl" |
2541 | 2541 |
negate_filter_deps="lut_filter" |
2542 | 2542 |
perspective_filter_deps="gpl" |
... | ... |
@@ -2554,7 +2554,7 @@ smartblur_filter_deps="gpl swscale" |
2554 | 2554 |
showspectrum_filter_deps="avcodec" |
2555 | 2555 |
showspectrum_filter_select="rdft" |
2556 | 2556 |
spp_filter_deps="gpl avcodec" |
2557 |
-spp_filter_select="dsputil fft idctdsp fdctdsp pixblockdsp" |
|
2557 |
+spp_filter_select="fft idctdsp fdctdsp me_cmp pixblockdsp" |
|
2558 | 2558 |
stereo3d_filter_deps="gpl" |
2559 | 2559 |
subtitles_filter_deps="avformat avcodec libass" |
2560 | 2560 |
super2xsai_filter_deps="gpl" |
... | ... |
@@ -41,7 +41,6 @@ OBJS-$(CONFIG_BSWAPDSP) += bswapdsp.o |
41 | 41 |
OBJS-$(CONFIG_CABAC) += cabac.o |
42 | 42 |
OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o |
43 | 43 |
OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o |
44 |
-OBJS-$(CONFIG_DSPUTIL) += dsputil.o |
|
45 | 44 |
OBJS-$(CONFIG_DXVA2) += dxva2.o |
46 | 45 |
OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o |
47 | 46 |
OBJS-$(CONFIG_EXIF) += exif.o tiff_common.o |
... | ... |
@@ -70,6 +69,7 @@ OBJS-$(CONFIG_LLVIDDSP) += lossless_videodsp.o |
70 | 70 |
OBJS-$(CONFIG_LPC) += lpc.o |
71 | 71 |
OBJS-$(CONFIG_LSP) += lsp.o |
72 | 72 |
OBJS-$(CONFIG_MDCT) += mdct_fixed.o mdct_float.o mdct_fixed_32.o |
73 |
+OBJS-$(CONFIG_ME_CMP) += me_cmp.o |
|
73 | 74 |
OBJS-$(CONFIG_MPEG_ER) += mpeg_er.o |
74 | 75 |
OBJS-$(CONFIG_MPEGAUDIO) += mpegaudio.o mpegaudiodata.o \ |
75 | 76 |
mpegaudiodecheader.o |
... | ... |
@@ -36,6 +36,7 @@ |
36 | 36 |
#include "libavutil/internal.h" |
37 | 37 |
#include "libavutil/opt.h" |
38 | 38 |
#include "avcodec.h" |
39 |
+#include "me_cmp.h" |
|
39 | 40 |
#include "put_bits.h" |
40 | 41 |
#include "audiodsp.h" |
41 | 42 |
#include "ac3dsp.h" |
... | ... |
@@ -379,7 +380,7 @@ static void compute_exp_strategy(AC3EncodeContext *s) |
379 | 379 |
exp_strategy[blk] = EXP_NEW; |
380 | 380 |
continue; |
381 | 381 |
} |
382 |
- exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16); |
|
382 |
+ exp_diff = s->mecc.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16); |
|
383 | 383 |
exp_strategy[blk] = EXP_REUSE; |
384 | 384 |
if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS)) |
385 | 385 |
exp_strategy[blk] = EXP_NEW; |
... | ... |
@@ -2480,7 +2481,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx) |
2480 | 2480 |
goto init_fail; |
2481 | 2481 |
|
2482 | 2482 |
ff_audiodsp_init(&s->adsp); |
2483 |
- ff_dsputil_init(&s->dsp, avctx); |
|
2483 |
+ ff_me_cmp_init(&s->mecc, avctx); |
|
2484 | 2484 |
ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT); |
2485 | 2485 |
|
2486 | 2486 |
dprint_options(s); |
... | ... |
@@ -35,9 +35,9 @@ |
35 | 35 |
#include "ac3.h" |
36 | 36 |
#include "ac3dsp.h" |
37 | 37 |
#include "avcodec.h" |
38 |
-#include "dsputil.h" |
|
39 | 38 |
#include "fft.h" |
40 | 39 |
#include "mathops.h" |
40 |
+#include "me_cmp.h" |
|
41 | 41 |
#include "put_bits.h" |
42 | 42 |
#include "audiodsp.h" |
43 | 43 |
|
... | ... |
@@ -162,9 +162,9 @@ typedef struct AC3EncodeContext { |
162 | 162 |
AC3EncOptions options; ///< encoding options |
163 | 163 |
AVCodecContext *avctx; ///< parent AVCodecContext |
164 | 164 |
PutBitContext pb; ///< bitstream writer context |
165 |
- DSPContext dsp; |
|
166 | 165 |
AudioDSPContext adsp; |
167 | 166 |
AVFloatDSPContext fdsp; |
167 |
+ MECmpContext mecc; |
|
168 | 168 |
AC3DSPContext ac3dsp; ///< AC-3 optimized functions |
169 | 169 |
FFTContext mdct; ///< FFT context for MDCT calculation |
170 | 170 |
const SampleType *mdct_window; ///< MDCT window function array |
... | ... |
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ |
6 | 6 |
arm/ac3dsp_arm.o |
7 | 7 |
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o |
8 | 8 |
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o |
9 |
-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o |
|
10 | 9 |
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \ |
11 | 10 |
arm/fft_fixed_init_arm.o |
12 | 11 |
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o |
... | ... |
@@ -20,6 +19,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ |
20 | 20 |
arm/jrevdct_arm.o \ |
21 | 21 |
arm/simple_idct_arm.o |
22 | 22 |
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o |
23 |
+OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o |
|
23 | 24 |
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o |
24 | 25 |
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o |
25 | 26 |
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o |
... | ... |
@@ -54,13 +54,13 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \ |
54 | 54 |
ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o |
55 | 55 |
|
56 | 56 |
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o |
57 |
-ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_armv6.o |
|
58 | 57 |
ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o |
59 | 58 |
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ |
60 | 59 |
arm/hpeldsp_armv6.o |
61 | 60 |
ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ |
62 | 61 |
arm/idctdsp_armv6.o \ |
63 | 62 |
arm/simple_idct_armv6.o |
63 |
+ARMV6-OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_armv6.o |
|
64 | 64 |
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o |
65 | 65 |
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o |
66 | 66 |
ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o |
67 | 67 |
deleted file mode 100644 |
... | ... |
@@ -1,244 +0,0 @@ |
1 |
-/* |
|
2 |
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> |
|
3 |
- * |
|
4 |
- * This file is part of FFmpeg. |
|
5 |
- * |
|
6 |
- * FFmpeg is free software; you can redistribute it and/or |
|
7 |
- * modify it under the terms of the GNU Lesser General Public |
|
8 |
- * License as published by the Free Software Foundation; either |
|
9 |
- * version 2.1 of the License, or (at your option) any later version. |
|
10 |
- * |
|
11 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
12 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
- * Lesser General Public License for more details. |
|
15 |
- * |
|
16 |
- * You should have received a copy of the GNU Lesser General Public |
|
17 |
- * License along with FFmpeg; if not, write to the Free Software |
|
18 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
- */ |
|
20 |
- |
|
21 |
-#include "libavutil/arm/asm.S" |
|
22 |
- |
|
23 |
-function ff_pix_abs16_armv6, export=1 |
|
24 |
- ldr r0, [sp] |
|
25 |
- push {r4-r9, lr} |
|
26 |
- mov r12, #0 |
|
27 |
- mov lr, #0 |
|
28 |
- ldm r1, {r4-r7} |
|
29 |
- ldr r8, [r2] |
|
30 |
-1: |
|
31 |
- ldr r9, [r2, #4] |
|
32 |
- pld [r1, r3] |
|
33 |
- usada8 r12, r4, r8, r12 |
|
34 |
- ldr r8, [r2, #8] |
|
35 |
- pld [r2, r3] |
|
36 |
- usada8 lr, r5, r9, lr |
|
37 |
- ldr r9, [r2, #12] |
|
38 |
- usada8 r12, r6, r8, r12 |
|
39 |
- subs r0, r0, #1 |
|
40 |
- usada8 lr, r7, r9, lr |
|
41 |
- beq 2f |
|
42 |
- add r1, r1, r3 |
|
43 |
- ldm r1, {r4-r7} |
|
44 |
- add r2, r2, r3 |
|
45 |
- ldr r8, [r2] |
|
46 |
- b 1b |
|
47 |
-2: |
|
48 |
- add r0, r12, lr |
|
49 |
- pop {r4-r9, pc} |
|
50 |
-endfunc |
|
51 |
- |
|
52 |
-function ff_pix_abs16_x2_armv6, export=1 |
|
53 |
- ldr r12, [sp] |
|
54 |
- push {r4-r11, lr} |
|
55 |
- mov r0, #0 |
|
56 |
- mov lr, #1 |
|
57 |
- orr lr, lr, lr, lsl #8 |
|
58 |
- orr lr, lr, lr, lsl #16 |
|
59 |
-1: |
|
60 |
- ldr r8, [r2] |
|
61 |
- ldr r9, [r2, #4] |
|
62 |
- lsr r10, r8, #8 |
|
63 |
- ldr r4, [r1] |
|
64 |
- lsr r6, r9, #8 |
|
65 |
- orr r10, r10, r9, lsl #24 |
|
66 |
- ldr r5, [r2, #8] |
|
67 |
- eor r11, r8, r10 |
|
68 |
- uhadd8 r7, r8, r10 |
|
69 |
- orr r6, r6, r5, lsl #24 |
|
70 |
- and r11, r11, lr |
|
71 |
- uadd8 r7, r7, r11 |
|
72 |
- ldr r8, [r1, #4] |
|
73 |
- usada8 r0, r4, r7, r0 |
|
74 |
- eor r7, r9, r6 |
|
75 |
- lsr r10, r5, #8 |
|
76 |
- and r7, r7, lr |
|
77 |
- uhadd8 r4, r9, r6 |
|
78 |
- ldr r6, [r2, #12] |
|
79 |
- uadd8 r4, r4, r7 |
|
80 |
- pld [r1, r3] |
|
81 |
- orr r10, r10, r6, lsl #24 |
|
82 |
- usada8 r0, r8, r4, r0 |
|
83 |
- ldr r4, [r1, #8] |
|
84 |
- eor r11, r5, r10 |
|
85 |
- ldrb r7, [r2, #16] |
|
86 |
- and r11, r11, lr |
|
87 |
- uhadd8 r8, r5, r10 |
|
88 |
- ldr r5, [r1, #12] |
|
89 |
- uadd8 r8, r8, r11 |
|
90 |
- pld [r2, r3] |
|
91 |
- lsr r10, r6, #8 |
|
92 |
- usada8 r0, r4, r8, r0 |
|
93 |
- orr r10, r10, r7, lsl #24 |
|
94 |
- subs r12, r12, #1 |
|
95 |
- eor r11, r6, r10 |
|
96 |
- add r1, r1, r3 |
|
97 |
- uhadd8 r9, r6, r10 |
|
98 |
- and r11, r11, lr |
|
99 |
- uadd8 r9, r9, r11 |
|
100 |
- add r2, r2, r3 |
|
101 |
- usada8 r0, r5, r9, r0 |
|
102 |
- bgt 1b |
|
103 |
- |
|
104 |
- pop {r4-r11, pc} |
|
105 |
-endfunc |
|
106 |
- |
|
107 |
-.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 |
|
108 |
- ldr \n0, [r2] |
|
109 |
- eor \n1, \p0, \n0 |
|
110 |
- uhadd8 \p0, \p0, \n0 |
|
111 |
- and \n1, \n1, lr |
|
112 |
- ldr \n2, [r1] |
|
113 |
- uadd8 \p0, \p0, \n1 |
|
114 |
- ldr \n1, [r2, #4] |
|
115 |
- usada8 r0, \p0, \n2, r0 |
|
116 |
- pld [r1, r3] |
|
117 |
- eor \n3, \p1, \n1 |
|
118 |
- uhadd8 \p1, \p1, \n1 |
|
119 |
- and \n3, \n3, lr |
|
120 |
- ldr \p0, [r1, #4] |
|
121 |
- uadd8 \p1, \p1, \n3 |
|
122 |
- ldr \n2, [r2, #8] |
|
123 |
- usada8 r0, \p1, \p0, r0 |
|
124 |
- pld [r2, r3] |
|
125 |
- eor \p0, \p2, \n2 |
|
126 |
- uhadd8 \p2, \p2, \n2 |
|
127 |
- and \p0, \p0, lr |
|
128 |
- ldr \p1, [r1, #8] |
|
129 |
- uadd8 \p2, \p2, \p0 |
|
130 |
- ldr \n3, [r2, #12] |
|
131 |
- usada8 r0, \p2, \p1, r0 |
|
132 |
- eor \p1, \p3, \n3 |
|
133 |
- uhadd8 \p3, \p3, \n3 |
|
134 |
- and \p1, \p1, lr |
|
135 |
- ldr \p0, [r1, #12] |
|
136 |
- uadd8 \p3, \p3, \p1 |
|
137 |
- add r1, r1, r3 |
|
138 |
- usada8 r0, \p3, \p0, r0 |
|
139 |
- add r2, r2, r3 |
|
140 |
-.endm |
|
141 |
- |
|
142 |
-function ff_pix_abs16_y2_armv6, export=1 |
|
143 |
- pld [r1] |
|
144 |
- pld [r2] |
|
145 |
- ldr r12, [sp] |
|
146 |
- push {r4-r11, lr} |
|
147 |
- mov r0, #0 |
|
148 |
- mov lr, #1 |
|
149 |
- orr lr, lr, lr, lsl #8 |
|
150 |
- orr lr, lr, lr, lsl #16 |
|
151 |
- ldr r4, [r2] |
|
152 |
- ldr r5, [r2, #4] |
|
153 |
- ldr r6, [r2, #8] |
|
154 |
- ldr r7, [r2, #12] |
|
155 |
- add r2, r2, r3 |
|
156 |
-1: |
|
157 |
- usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 |
|
158 |
- subs r12, r12, #2 |
|
159 |
- usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 |
|
160 |
- bgt 1b |
|
161 |
- |
|
162 |
- pop {r4-r11, pc} |
|
163 |
-endfunc |
|
164 |
- |
|
165 |
-function ff_pix_abs8_armv6, export=1 |
|
166 |
- pld [r2, r3] |
|
167 |
- ldr r12, [sp] |
|
168 |
- push {r4-r9, lr} |
|
169 |
- mov r0, #0 |
|
170 |
- mov lr, #0 |
|
171 |
- ldrd_post r4, r5, r1, r3 |
|
172 |
-1: |
|
173 |
- subs r12, r12, #2 |
|
174 |
- ldr r7, [r2, #4] |
|
175 |
- ldr_post r6, r2, r3 |
|
176 |
- ldrd_post r8, r9, r1, r3 |
|
177 |
- usada8 r0, r4, r6, r0 |
|
178 |
- pld [r2, r3] |
|
179 |
- usada8 lr, r5, r7, lr |
|
180 |
- ldr r7, [r2, #4] |
|
181 |
- ldr_post r6, r2, r3 |
|
182 |
- beq 2f |
|
183 |
- ldrd_post r4, r5, r1, r3 |
|
184 |
- usada8 r0, r8, r6, r0 |
|
185 |
- pld [r2, r3] |
|
186 |
- usada8 lr, r9, r7, lr |
|
187 |
- b 1b |
|
188 |
-2: |
|
189 |
- usada8 r0, r8, r6, r0 |
|
190 |
- usada8 lr, r9, r7, lr |
|
191 |
- add r0, r0, lr |
|
192 |
- pop {r4-r9, pc} |
|
193 |
-endfunc |
|
194 |
- |
|
195 |
-function ff_sse16_armv6, export=1 |
|
196 |
- ldr r12, [sp] |
|
197 |
- push {r4-r9, lr} |
|
198 |
- mov r0, #0 |
|
199 |
-1: |
|
200 |
- ldrd r4, r5, [r1] |
|
201 |
- ldr r8, [r2] |
|
202 |
- uxtb16 lr, r4 |
|
203 |
- uxtb16 r4, r4, ror #8 |
|
204 |
- uxtb16 r9, r8 |
|
205 |
- uxtb16 r8, r8, ror #8 |
|
206 |
- ldr r7, [r2, #4] |
|
207 |
- usub16 lr, lr, r9 |
|
208 |
- usub16 r4, r4, r8 |
|
209 |
- smlad r0, lr, lr, r0 |
|
210 |
- uxtb16 r6, r5 |
|
211 |
- uxtb16 lr, r5, ror #8 |
|
212 |
- uxtb16 r8, r7 |
|
213 |
- uxtb16 r9, r7, ror #8 |
|
214 |
- smlad r0, r4, r4, r0 |
|
215 |
- ldrd r4, r5, [r1, #8] |
|
216 |
- usub16 r6, r6, r8 |
|
217 |
- usub16 r8, lr, r9 |
|
218 |
- ldr r7, [r2, #8] |
|
219 |
- smlad r0, r6, r6, r0 |
|
220 |
- uxtb16 lr, r4 |
|
221 |
- uxtb16 r4, r4, ror #8 |
|
222 |
- uxtb16 r9, r7 |
|
223 |
- uxtb16 r7, r7, ror #8 |
|
224 |
- smlad r0, r8, r8, r0 |
|
225 |
- ldr r8, [r2, #12] |
|
226 |
- usub16 lr, lr, r9 |
|
227 |
- usub16 r4, r4, r7 |
|
228 |
- smlad r0, lr, lr, r0 |
|
229 |
- uxtb16 r6, r5 |
|
230 |
- uxtb16 r5, r5, ror #8 |
|
231 |
- uxtb16 r9, r8 |
|
232 |
- uxtb16 r8, r8, ror #8 |
|
233 |
- smlad r0, r4, r4, r0 |
|
234 |
- usub16 r6, r6, r9 |
|
235 |
- usub16 r5, r5, r8 |
|
236 |
- smlad r0, r6, r6, r0 |
|
237 |
- add r1, r1, r3 |
|
238 |
- add r2, r2, r3 |
|
239 |
- subs r12, r12, #1 |
|
240 |
- smlad r0, r5, r5, r0 |
|
241 |
- bgt 1b |
|
242 |
- |
|
243 |
- pop {r4-r9, pc} |
|
244 |
-endfunc |
245 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,58 +0,0 @@ |
1 |
-/* |
|
2 |
- * This file is part of FFmpeg. |
|
3 |
- * |
|
4 |
- * FFmpeg is free software; you can redistribute it and/or |
|
5 |
- * modify it under the terms of the GNU Lesser General Public |
|
6 |
- * License as published by the Free Software Foundation; either |
|
7 |
- * version 2.1 of the License, or (at your option) any later version. |
|
8 |
- * |
|
9 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
10 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
12 |
- * Lesser General Public License for more details. |
|
13 |
- * |
|
14 |
- * You should have received a copy of the GNU Lesser General Public |
|
15 |
- * License along with FFmpeg; if not, write to the Free Software |
|
16 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
17 |
- */ |
|
18 |
- |
|
19 |
-#include <stdint.h> |
|
20 |
- |
|
21 |
-#include "libavutil/attributes.h" |
|
22 |
-#include "libavutil/cpu.h" |
|
23 |
-#include "libavutil/arm/cpu.h" |
|
24 |
-#include "libavcodec/avcodec.h" |
|
25 |
-#include "libavcodec/dsputil.h" |
|
26 |
-#include "libavcodec/mpegvideo.h" |
|
27 |
- |
|
28 |
-int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
29 |
- int line_size, int h); |
|
30 |
-int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
31 |
- int line_size, int h); |
|
32 |
-int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
33 |
- int line_size, int h); |
|
34 |
- |
|
35 |
-int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
36 |
- int line_size, int h); |
|
37 |
- |
|
38 |
-int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
39 |
- int line_size, int h); |
|
40 |
- |
|
41 |
- |
|
42 |
-av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx) |
|
43 |
-{ |
|
44 |
- int cpu_flags = av_get_cpu_flags(); |
|
45 |
- |
|
46 |
- if (have_armv6(cpu_flags)) { |
|
47 |
- c->pix_abs[0][0] = ff_pix_abs16_armv6; |
|
48 |
- c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; |
|
49 |
- c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; |
|
50 |
- |
|
51 |
- c->pix_abs[1][0] = ff_pix_abs8_armv6; |
|
52 |
- |
|
53 |
- c->sad[0] = ff_pix_abs16_armv6; |
|
54 |
- c->sad[1] = ff_pix_abs8_armv6; |
|
55 |
- |
|
56 |
- c->sse[0] = ff_sse16_armv6; |
|
57 |
- } |
|
58 |
-} |
59 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,244 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "libavutil/arm/asm.S" |
|
21 |
+ |
|
22 |
+function ff_pix_abs16_armv6, export=1 |
|
23 |
+ ldr r0, [sp] |
|
24 |
+ push {r4-r9, lr} |
|
25 |
+ mov r12, #0 |
|
26 |
+ mov lr, #0 |
|
27 |
+ ldm r1, {r4-r7} |
|
28 |
+ ldr r8, [r2] |
|
29 |
+1: |
|
30 |
+ ldr r9, [r2, #4] |
|
31 |
+ pld [r1, r3] |
|
32 |
+ usada8 r12, r4, r8, r12 |
|
33 |
+ ldr r8, [r2, #8] |
|
34 |
+ pld [r2, r3] |
|
35 |
+ usada8 lr, r5, r9, lr |
|
36 |
+ ldr r9, [r2, #12] |
|
37 |
+ usada8 r12, r6, r8, r12 |
|
38 |
+ subs r0, r0, #1 |
|
39 |
+ usada8 lr, r7, r9, lr |
|
40 |
+ beq 2f |
|
41 |
+ add r1, r1, r3 |
|
42 |
+ ldm r1, {r4-r7} |
|
43 |
+ add r2, r2, r3 |
|
44 |
+ ldr r8, [r2] |
|
45 |
+ b 1b |
|
46 |
+2: |
|
47 |
+ add r0, r12, lr |
|
48 |
+ pop {r4-r9, pc} |
|
49 |
+endfunc |
|
50 |
+ |
|
51 |
+function ff_pix_abs16_x2_armv6, export=1 |
|
52 |
+ ldr r12, [sp] |
|
53 |
+ push {r4-r11, lr} |
|
54 |
+ mov r0, #0 |
|
55 |
+ mov lr, #1 |
|
56 |
+ orr lr, lr, lr, lsl #8 |
|
57 |
+ orr lr, lr, lr, lsl #16 |
|
58 |
+1: |
|
59 |
+ ldr r8, [r2] |
|
60 |
+ ldr r9, [r2, #4] |
|
61 |
+ lsr r10, r8, #8 |
|
62 |
+ ldr r4, [r1] |
|
63 |
+ lsr r6, r9, #8 |
|
64 |
+ orr r10, r10, r9, lsl #24 |
|
65 |
+ ldr r5, [r2, #8] |
|
66 |
+ eor r11, r8, r10 |
|
67 |
+ uhadd8 r7, r8, r10 |
|
68 |
+ orr r6, r6, r5, lsl #24 |
|
69 |
+ and r11, r11, lr |
|
70 |
+ uadd8 r7, r7, r11 |
|
71 |
+ ldr r8, [r1, #4] |
|
72 |
+ usada8 r0, r4, r7, r0 |
|
73 |
+ eor r7, r9, r6 |
|
74 |
+ lsr r10, r5, #8 |
|
75 |
+ and r7, r7, lr |
|
76 |
+ uhadd8 r4, r9, r6 |
|
77 |
+ ldr r6, [r2, #12] |
|
78 |
+ uadd8 r4, r4, r7 |
|
79 |
+ pld [r1, r3] |
|
80 |
+ orr r10, r10, r6, lsl #24 |
|
81 |
+ usada8 r0, r8, r4, r0 |
|
82 |
+ ldr r4, [r1, #8] |
|
83 |
+ eor r11, r5, r10 |
|
84 |
+ ldrb r7, [r2, #16] |
|
85 |
+ and r11, r11, lr |
|
86 |
+ uhadd8 r8, r5, r10 |
|
87 |
+ ldr r5, [r1, #12] |
|
88 |
+ uadd8 r8, r8, r11 |
|
89 |
+ pld [r2, r3] |
|
90 |
+ lsr r10, r6, #8 |
|
91 |
+ usada8 r0, r4, r8, r0 |
|
92 |
+ orr r10, r10, r7, lsl #24 |
|
93 |
+ subs r12, r12, #1 |
|
94 |
+ eor r11, r6, r10 |
|
95 |
+ add r1, r1, r3 |
|
96 |
+ uhadd8 r9, r6, r10 |
|
97 |
+ and r11, r11, lr |
|
98 |
+ uadd8 r9, r9, r11 |
|
99 |
+ add r2, r2, r3 |
|
100 |
+ usada8 r0, r5, r9, r0 |
|
101 |
+ bgt 1b |
|
102 |
+ |
|
103 |
+ pop {r4-r11, pc} |
|
104 |
+endfunc |
|
105 |
+ |
|
106 |
+.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 |
|
107 |
+ ldr \n0, [r2] |
|
108 |
+ eor \n1, \p0, \n0 |
|
109 |
+ uhadd8 \p0, \p0, \n0 |
|
110 |
+ and \n1, \n1, lr |
|
111 |
+ ldr \n2, [r1] |
|
112 |
+ uadd8 \p0, \p0, \n1 |
|
113 |
+ ldr \n1, [r2, #4] |
|
114 |
+ usada8 r0, \p0, \n2, r0 |
|
115 |
+ pld [r1, r3] |
|
116 |
+ eor \n3, \p1, \n1 |
|
117 |
+ uhadd8 \p1, \p1, \n1 |
|
118 |
+ and \n3, \n3, lr |
|
119 |
+ ldr \p0, [r1, #4] |
|
120 |
+ uadd8 \p1, \p1, \n3 |
|
121 |
+ ldr \n2, [r2, #8] |
|
122 |
+ usada8 r0, \p1, \p0, r0 |
|
123 |
+ pld [r2, r3] |
|
124 |
+ eor \p0, \p2, \n2 |
|
125 |
+ uhadd8 \p2, \p2, \n2 |
|
126 |
+ and \p0, \p0, lr |
|
127 |
+ ldr \p1, [r1, #8] |
|
128 |
+ uadd8 \p2, \p2, \p0 |
|
129 |
+ ldr \n3, [r2, #12] |
|
130 |
+ usada8 r0, \p2, \p1, r0 |
|
131 |
+ eor \p1, \p3, \n3 |
|
132 |
+ uhadd8 \p3, \p3, \n3 |
|
133 |
+ and \p1, \p1, lr |
|
134 |
+ ldr \p0, [r1, #12] |
|
135 |
+ uadd8 \p3, \p3, \p1 |
|
136 |
+ add r1, r1, r3 |
|
137 |
+ usada8 r0, \p3, \p0, r0 |
|
138 |
+ add r2, r2, r3 |
|
139 |
+.endm |
|
140 |
+ |
|
141 |
+function ff_pix_abs16_y2_armv6, export=1 |
|
142 |
+ pld [r1] |
|
143 |
+ pld [r2] |
|
144 |
+ ldr r12, [sp] |
|
145 |
+ push {r4-r11, lr} |
|
146 |
+ mov r0, #0 |
|
147 |
+ mov lr, #1 |
|
148 |
+ orr lr, lr, lr, lsl #8 |
|
149 |
+ orr lr, lr, lr, lsl #16 |
|
150 |
+ ldr r4, [r2] |
|
151 |
+ ldr r5, [r2, #4] |
|
152 |
+ ldr r6, [r2, #8] |
|
153 |
+ ldr r7, [r2, #12] |
|
154 |
+ add r2, r2, r3 |
|
155 |
+1: |
|
156 |
+ usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 |
|
157 |
+ subs r12, r12, #2 |
|
158 |
+ usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 |
|
159 |
+ bgt 1b |
|
160 |
+ |
|
161 |
+ pop {r4-r11, pc} |
|
162 |
+endfunc |
|
163 |
+ |
|
164 |
+function ff_pix_abs8_armv6, export=1 |
|
165 |
+ pld [r2, r3] |
|
166 |
+ ldr r12, [sp] |
|
167 |
+ push {r4-r9, lr} |
|
168 |
+ mov r0, #0 |
|
169 |
+ mov lr, #0 |
|
170 |
+ ldrd_post r4, r5, r1, r3 |
|
171 |
+1: |
|
172 |
+ subs r12, r12, #2 |
|
173 |
+ ldr r7, [r2, #4] |
|
174 |
+ ldr_post r6, r2, r3 |
|
175 |
+ ldrd_post r8, r9, r1, r3 |
|
176 |
+ usada8 r0, r4, r6, r0 |
|
177 |
+ pld [r2, r3] |
|
178 |
+ usada8 lr, r5, r7, lr |
|
179 |
+ ldr r7, [r2, #4] |
|
180 |
+ ldr_post r6, r2, r3 |
|
181 |
+ beq 2f |
|
182 |
+ ldrd_post r4, r5, r1, r3 |
|
183 |
+ usada8 r0, r8, r6, r0 |
|
184 |
+ pld [r2, r3] |
|
185 |
+ usada8 lr, r9, r7, lr |
|
186 |
+ b 1b |
|
187 |
+2: |
|
188 |
+ usada8 r0, r8, r6, r0 |
|
189 |
+ usada8 lr, r9, r7, lr |
|
190 |
+ add r0, r0, lr |
|
191 |
+ pop {r4-r9, pc} |
|
192 |
+endfunc |
|
193 |
+ |
|
194 |
+function ff_sse16_armv6, export=1 |
|
195 |
+ ldr r12, [sp] |
|
196 |
+ push {r4-r9, lr} |
|
197 |
+ mov r0, #0 |
|
198 |
+1: |
|
199 |
+ ldrd r4, r5, [r1] |
|
200 |
+ ldr r8, [r2] |
|
201 |
+ uxtb16 lr, r4 |
|
202 |
+ uxtb16 r4, r4, ror #8 |
|
203 |
+ uxtb16 r9, r8 |
|
204 |
+ uxtb16 r8, r8, ror #8 |
|
205 |
+ ldr r7, [r2, #4] |
|
206 |
+ usub16 lr, lr, r9 |
|
207 |
+ usub16 r4, r4, r8 |
|
208 |
+ smlad r0, lr, lr, r0 |
|
209 |
+ uxtb16 r6, r5 |
|
210 |
+ uxtb16 lr, r5, ror #8 |
|
211 |
+ uxtb16 r8, r7 |
|
212 |
+ uxtb16 r9, r7, ror #8 |
|
213 |
+ smlad r0, r4, r4, r0 |
|
214 |
+ ldrd r4, r5, [r1, #8] |
|
215 |
+ usub16 r6, r6, r8 |
|
216 |
+ usub16 r8, lr, r9 |
|
217 |
+ ldr r7, [r2, #8] |
|
218 |
+ smlad r0, r6, r6, r0 |
|
219 |
+ uxtb16 lr, r4 |
|
220 |
+ uxtb16 r4, r4, ror #8 |
|
221 |
+ uxtb16 r9, r7 |
|
222 |
+ uxtb16 r7, r7, ror #8 |
|
223 |
+ smlad r0, r8, r8, r0 |
|
224 |
+ ldr r8, [r2, #12] |
|
225 |
+ usub16 lr, lr, r9 |
|
226 |
+ usub16 r4, r4, r7 |
|
227 |
+ smlad r0, lr, lr, r0 |
|
228 |
+ uxtb16 r6, r5 |
|
229 |
+ uxtb16 r5, r5, ror #8 |
|
230 |
+ uxtb16 r9, r8 |
|
231 |
+ uxtb16 r8, r8, ror #8 |
|
232 |
+ smlad r0, r4, r4, r0 |
|
233 |
+ usub16 r6, r6, r9 |
|
234 |
+ usub16 r5, r5, r8 |
|
235 |
+ smlad r0, r6, r6, r0 |
|
236 |
+ add r1, r1, r3 |
|
237 |
+ add r2, r2, r3 |
|
238 |
+ subs r12, r12, #1 |
|
239 |
+ smlad r0, r5, r5, r0 |
|
240 |
+ bgt 1b |
|
241 |
+ |
|
242 |
+ pop {r4-r9, pc} |
|
243 |
+endfunc |
0 | 244 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,57 @@ |
0 |
+/* |
|
1 |
+ * This file is part of FFmpeg. |
|
2 |
+ * |
|
3 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#include <stdint.h> |
|
19 |
+ |
|
20 |
+#include "libavutil/attributes.h" |
|
21 |
+#include "libavutil/cpu.h" |
|
22 |
+#include "libavutil/arm/cpu.h" |
|
23 |
+#include "libavcodec/avcodec.h" |
|
24 |
+#include "libavcodec/me_cmp.h" |
|
25 |
+#include "libavcodec/mpegvideo.h" |
|
26 |
+ |
|
27 |
+int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
28 |
+ int line_size, int h); |
|
29 |
+int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
30 |
+ int line_size, int h); |
|
31 |
+int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
32 |
+ int line_size, int h); |
|
33 |
+ |
|
34 |
+int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
35 |
+ int line_size, int h); |
|
36 |
+ |
|
37 |
+int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
|
38 |
+ int line_size, int h); |
|
39 |
+ |
|
40 |
+av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx) |
|
41 |
+{ |
|
42 |
+ int cpu_flags = av_get_cpu_flags(); |
|
43 |
+ |
|
44 |
+ if (have_armv6(cpu_flags)) { |
|
45 |
+ c->pix_abs[0][0] = ff_pix_abs16_armv6; |
|
46 |
+ c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; |
|
47 |
+ c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; |
|
48 |
+ |
|
49 |
+ c->pix_abs[1][0] = ff_pix_abs8_armv6; |
|
50 |
+ |
|
51 |
+ c->sad[0] = ff_pix_abs16_armv6; |
|
52 |
+ c->sad[1] = ff_pix_abs8_armv6; |
|
53 |
+ |
|
54 |
+ c->sse[0] = ff_sse16_armv6; |
|
55 |
+ } |
|
56 |
+} |
25 | 25 |
deleted file mode 100644 |
... | ... |
@@ -1,1008 +0,0 @@ |
1 |
-/* |
|
2 |
- * DSP utils |
|
3 |
- * Copyright (c) 2000, 2001 Fabrice Bellard |
|
4 |
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
- * |
|
6 |
- * This file is part of FFmpeg. |
|
7 |
- * |
|
8 |
- * FFmpeg is free software; you can redistribute it and/or |
|
9 |
- * modify it under the terms of the GNU Lesser General Public |
|
10 |
- * License as published by the Free Software Foundation; either |
|
11 |
- * version 2.1 of the License, or (at your option) any later version. |
|
12 |
- * |
|
13 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
14 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
- * Lesser General Public License for more details. |
|
17 |
- * |
|
18 |
- * You should have received a copy of the GNU Lesser General Public |
|
19 |
- * License along with FFmpeg; if not, write to the Free Software |
|
20 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
- */ |
|
22 |
- |
|
23 |
-/** |
|
24 |
- * @file |
|
25 |
- * DSP utils |
|
26 |
- */ |
|
27 |
- |
|
28 |
-#include "libavutil/attributes.h" |
|
29 |
-#include "libavutil/internal.h" |
|
30 |
-#include "avcodec.h" |
|
31 |
-#include "copy_block.h" |
|
32 |
-#include "dsputil.h" |
|
33 |
-#include "simple_idct.h" |
|
34 |
-#include "mpegvideo.h" |
|
35 |
-#include "config.h" |
|
36 |
- |
|
37 |
-uint32_t ff_square_tab[512] = { 0, }; |
|
38 |
- |
|
39 |
-static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
40 |
- int line_size, int h) |
|
41 |
-{ |
|
42 |
- int s = 0, i; |
|
43 |
- uint32_t *sq = ff_square_tab + 256; |
|
44 |
- |
|
45 |
- for (i = 0; i < h; i++) { |
|
46 |
- s += sq[pix1[0] - pix2[0]]; |
|
47 |
- s += sq[pix1[1] - pix2[1]]; |
|
48 |
- s += sq[pix1[2] - pix2[2]]; |
|
49 |
- s += sq[pix1[3] - pix2[3]]; |
|
50 |
- pix1 += line_size; |
|
51 |
- pix2 += line_size; |
|
52 |
- } |
|
53 |
- return s; |
|
54 |
-} |
|
55 |
- |
|
56 |
-static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
57 |
- int line_size, int h) |
|
58 |
-{ |
|
59 |
- int s = 0, i; |
|
60 |
- uint32_t *sq = ff_square_tab + 256; |
|
61 |
- |
|
62 |
- for (i = 0; i < h; i++) { |
|
63 |
- s += sq[pix1[0] - pix2[0]]; |
|
64 |
- s += sq[pix1[1] - pix2[1]]; |
|
65 |
- s += sq[pix1[2] - pix2[2]]; |
|
66 |
- s += sq[pix1[3] - pix2[3]]; |
|
67 |
- s += sq[pix1[4] - pix2[4]]; |
|
68 |
- s += sq[pix1[5] - pix2[5]]; |
|
69 |
- s += sq[pix1[6] - pix2[6]]; |
|
70 |
- s += sq[pix1[7] - pix2[7]]; |
|
71 |
- pix1 += line_size; |
|
72 |
- pix2 += line_size; |
|
73 |
- } |
|
74 |
- return s; |
|
75 |
-} |
|
76 |
- |
|
77 |
-static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
78 |
- int line_size, int h) |
|
79 |
-{ |
|
80 |
- int s = 0, i; |
|
81 |
- uint32_t *sq = ff_square_tab + 256; |
|
82 |
- |
|
83 |
- for (i = 0; i < h; i++) { |
|
84 |
- s += sq[pix1[0] - pix2[0]]; |
|
85 |
- s += sq[pix1[1] - pix2[1]]; |
|
86 |
- s += sq[pix1[2] - pix2[2]]; |
|
87 |
- s += sq[pix1[3] - pix2[3]]; |
|
88 |
- s += sq[pix1[4] - pix2[4]]; |
|
89 |
- s += sq[pix1[5] - pix2[5]]; |
|
90 |
- s += sq[pix1[6] - pix2[6]]; |
|
91 |
- s += sq[pix1[7] - pix2[7]]; |
|
92 |
- s += sq[pix1[8] - pix2[8]]; |
|
93 |
- s += sq[pix1[9] - pix2[9]]; |
|
94 |
- s += sq[pix1[10] - pix2[10]]; |
|
95 |
- s += sq[pix1[11] - pix2[11]]; |
|
96 |
- s += sq[pix1[12] - pix2[12]]; |
|
97 |
- s += sq[pix1[13] - pix2[13]]; |
|
98 |
- s += sq[pix1[14] - pix2[14]]; |
|
99 |
- s += sq[pix1[15] - pix2[15]]; |
|
100 |
- |
|
101 |
- pix1 += line_size; |
|
102 |
- pix2 += line_size; |
|
103 |
- } |
|
104 |
- return s; |
|
105 |
-} |
|
106 |
- |
|
107 |
-static int sum_abs_dctelem_c(int16_t *block) |
|
108 |
-{ |
|
109 |
- int sum = 0, i; |
|
110 |
- |
|
111 |
- for (i = 0; i < 64; i++) |
|
112 |
- sum += FFABS(block[i]); |
|
113 |
- return sum; |
|
114 |
-} |
|
115 |
- |
|
116 |
-#define avg2(a, b) ((a + b + 1) >> 1) |
|
117 |
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) |
|
118 |
- |
|
119 |
-static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
120 |
- int line_size, int h) |
|
121 |
-{ |
|
122 |
- int s = 0, i; |
|
123 |
- |
|
124 |
- for (i = 0; i < h; i++) { |
|
125 |
- s += abs(pix1[0] - pix2[0]); |
|
126 |
- s += abs(pix1[1] - pix2[1]); |
|
127 |
- s += abs(pix1[2] - pix2[2]); |
|
128 |
- s += abs(pix1[3] - pix2[3]); |
|
129 |
- s += abs(pix1[4] - pix2[4]); |
|
130 |
- s += abs(pix1[5] - pix2[5]); |
|
131 |
- s += abs(pix1[6] - pix2[6]); |
|
132 |
- s += abs(pix1[7] - pix2[7]); |
|
133 |
- s += abs(pix1[8] - pix2[8]); |
|
134 |
- s += abs(pix1[9] - pix2[9]); |
|
135 |
- s += abs(pix1[10] - pix2[10]); |
|
136 |
- s += abs(pix1[11] - pix2[11]); |
|
137 |
- s += abs(pix1[12] - pix2[12]); |
|
138 |
- s += abs(pix1[13] - pix2[13]); |
|
139 |
- s += abs(pix1[14] - pix2[14]); |
|
140 |
- s += abs(pix1[15] - pix2[15]); |
|
141 |
- pix1 += line_size; |
|
142 |
- pix2 += line_size; |
|
143 |
- } |
|
144 |
- return s; |
|
145 |
-} |
|
146 |
- |
|
147 |
-static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
148 |
- int line_size, int h) |
|
149 |
-{ |
|
150 |
- int s = 0, i; |
|
151 |
- |
|
152 |
- for (i = 0; i < h; i++) { |
|
153 |
- s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
154 |
- s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
155 |
- s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
156 |
- s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
157 |
- s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
158 |
- s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
159 |
- s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
160 |
- s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
161 |
- s += abs(pix1[8] - avg2(pix2[8], pix2[9])); |
|
162 |
- s += abs(pix1[9] - avg2(pix2[9], pix2[10])); |
|
163 |
- s += abs(pix1[10] - avg2(pix2[10], pix2[11])); |
|
164 |
- s += abs(pix1[11] - avg2(pix2[11], pix2[12])); |
|
165 |
- s += abs(pix1[12] - avg2(pix2[12], pix2[13])); |
|
166 |
- s += abs(pix1[13] - avg2(pix2[13], pix2[14])); |
|
167 |
- s += abs(pix1[14] - avg2(pix2[14], pix2[15])); |
|
168 |
- s += abs(pix1[15] - avg2(pix2[15], pix2[16])); |
|
169 |
- pix1 += line_size; |
|
170 |
- pix2 += line_size; |
|
171 |
- } |
|
172 |
- return s; |
|
173 |
-} |
|
174 |
- |
|
175 |
-static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
176 |
- int line_size, int h) |
|
177 |
-{ |
|
178 |
- int s = 0, i; |
|
179 |
- uint8_t *pix3 = pix2 + line_size; |
|
180 |
- |
|
181 |
- for (i = 0; i < h; i++) { |
|
182 |
- s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
183 |
- s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
184 |
- s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
185 |
- s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
186 |
- s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
187 |
- s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
188 |
- s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
189 |
- s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
190 |
- s += abs(pix1[8] - avg2(pix2[8], pix3[8])); |
|
191 |
- s += abs(pix1[9] - avg2(pix2[9], pix3[9])); |
|
192 |
- s += abs(pix1[10] - avg2(pix2[10], pix3[10])); |
|
193 |
- s += abs(pix1[11] - avg2(pix2[11], pix3[11])); |
|
194 |
- s += abs(pix1[12] - avg2(pix2[12], pix3[12])); |
|
195 |
- s += abs(pix1[13] - avg2(pix2[13], pix3[13])); |
|
196 |
- s += abs(pix1[14] - avg2(pix2[14], pix3[14])); |
|
197 |
- s += abs(pix1[15] - avg2(pix2[15], pix3[15])); |
|
198 |
- pix1 += line_size; |
|
199 |
- pix2 += line_size; |
|
200 |
- pix3 += line_size; |
|
201 |
- } |
|
202 |
- return s; |
|
203 |
-} |
|
204 |
- |
|
205 |
-static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
206 |
- int line_size, int h) |
|
207 |
-{ |
|
208 |
- int s = 0, i; |
|
209 |
- uint8_t *pix3 = pix2 + line_size; |
|
210 |
- |
|
211 |
- for (i = 0; i < h; i++) { |
|
212 |
- s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
213 |
- s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
214 |
- s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
215 |
- s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
216 |
- s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
217 |
- s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
218 |
- s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
219 |
- s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
220 |
- s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); |
|
221 |
- s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); |
|
222 |
- s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); |
|
223 |
- s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); |
|
224 |
- s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); |
|
225 |
- s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); |
|
226 |
- s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); |
|
227 |
- s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); |
|
228 |
- pix1 += line_size; |
|
229 |
- pix2 += line_size; |
|
230 |
- pix3 += line_size; |
|
231 |
- } |
|
232 |
- return s; |
|
233 |
-} |
|
234 |
- |
|
235 |
-static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
236 |
- int line_size, int h) |
|
237 |
-{ |
|
238 |
- int s = 0, i; |
|
239 |
- |
|
240 |
- for (i = 0; i < h; i++) { |
|
241 |
- s += abs(pix1[0] - pix2[0]); |
|
242 |
- s += abs(pix1[1] - pix2[1]); |
|
243 |
- s += abs(pix1[2] - pix2[2]); |
|
244 |
- s += abs(pix1[3] - pix2[3]); |
|
245 |
- s += abs(pix1[4] - pix2[4]); |
|
246 |
- s += abs(pix1[5] - pix2[5]); |
|
247 |
- s += abs(pix1[6] - pix2[6]); |
|
248 |
- s += abs(pix1[7] - pix2[7]); |
|
249 |
- pix1 += line_size; |
|
250 |
- pix2 += line_size; |
|
251 |
- } |
|
252 |
- return s; |
|
253 |
-} |
|
254 |
- |
|
255 |
-static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
256 |
- int line_size, int h) |
|
257 |
-{ |
|
258 |
- int s = 0, i; |
|
259 |
- |
|
260 |
- for (i = 0; i < h; i++) { |
|
261 |
- s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
262 |
- s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
263 |
- s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
264 |
- s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
265 |
- s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
266 |
- s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
267 |
- s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
268 |
- s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
269 |
- pix1 += line_size; |
|
270 |
- pix2 += line_size; |
|
271 |
- } |
|
272 |
- return s; |
|
273 |
-} |
|
274 |
- |
|
275 |
-static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
276 |
- int line_size, int h) |
|
277 |
-{ |
|
278 |
- int s = 0, i; |
|
279 |
- uint8_t *pix3 = pix2 + line_size; |
|
280 |
- |
|
281 |
- for (i = 0; i < h; i++) { |
|
282 |
- s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
283 |
- s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
284 |
- s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
285 |
- s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
286 |
- s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
287 |
- s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
288 |
- s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
289 |
- s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
290 |
- pix1 += line_size; |
|
291 |
- pix2 += line_size; |
|
292 |
- pix3 += line_size; |
|
293 |
- } |
|
294 |
- return s; |
|
295 |
-} |
|
296 |
- |
|
297 |
-static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
298 |
- int line_size, int h) |
|
299 |
-{ |
|
300 |
- int s = 0, i; |
|
301 |
- uint8_t *pix3 = pix2 + line_size; |
|
302 |
- |
|
303 |
- for (i = 0; i < h; i++) { |
|
304 |
- s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
305 |
- s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
306 |
- s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
307 |
- s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
308 |
- s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
309 |
- s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
310 |
- s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
311 |
- s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
312 |
- pix1 += line_size; |
|
313 |
- pix2 += line_size; |
|
314 |
- pix3 += line_size; |
|
315 |
- } |
|
316 |
- return s; |
|
317 |
-} |
|
318 |
- |
|
319 |
-static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
320 |
-{ |
|
321 |
- int score1 = 0, score2 = 0, x, y; |
|
322 |
- |
|
323 |
- for (y = 0; y < h; y++) { |
|
324 |
- for (x = 0; x < 16; x++) |
|
325 |
- score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
326 |
- if (y + 1 < h) { |
|
327 |
- for (x = 0; x < 15; x++) |
|
328 |
- score2 += FFABS(s1[x] - s1[x + stride] - |
|
329 |
- s1[x + 1] + s1[x + stride + 1]) - |
|
330 |
- FFABS(s2[x] - s2[x + stride] - |
|
331 |
- s2[x + 1] + s2[x + stride + 1]); |
|
332 |
- } |
|
333 |
- s1 += stride; |
|
334 |
- s2 += stride; |
|
335 |
- } |
|
336 |
- |
|
337 |
- if (c) |
|
338 |
- return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
339 |
- else |
|
340 |
- return score1 + FFABS(score2) * 8; |
|
341 |
-} |
|
342 |
- |
|
343 |
-static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
344 |
-{ |
|
345 |
- int score1 = 0, score2 = 0, x, y; |
|
346 |
- |
|
347 |
- for (y = 0; y < h; y++) { |
|
348 |
- for (x = 0; x < 8; x++) |
|
349 |
- score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
350 |
- if (y + 1 < h) { |
|
351 |
- for (x = 0; x < 7; x++) |
|
352 |
- score2 += FFABS(s1[x] - s1[x + stride] - |
|
353 |
- s1[x + 1] + s1[x + stride + 1]) - |
|
354 |
- FFABS(s2[x] - s2[x + stride] - |
|
355 |
- s2[x + 1] + s2[x + stride + 1]); |
|
356 |
- } |
|
357 |
- s1 += stride; |
|
358 |
- s2 += stride; |
|
359 |
- } |
|
360 |
- |
|
361 |
- if (c) |
|
362 |
- return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
363 |
- else |
|
364 |
- return score1 + FFABS(score2) * 8; |
|
365 |
-} |
|
366 |
- |
|
367 |
-static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, |
|
368 |
- int stride, int h) |
|
369 |
-{ |
|
370 |
- return 0; |
|
371 |
-} |
|
372 |
- |
|
373 |
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type) |
|
374 |
-{ |
|
375 |
- int i; |
|
376 |
- |
|
377 |
- memset(cmp, 0, sizeof(void *) * 6); |
|
378 |
- |
|
379 |
- for (i = 0; i < 6; i++) { |
|
380 |
- switch (type & 0xFF) { |
|
381 |
- case FF_CMP_SAD: |
|
382 |
- cmp[i] = c->sad[i]; |
|
383 |
- break; |
|
384 |
- case FF_CMP_SATD: |
|
385 |
- cmp[i] = c->hadamard8_diff[i]; |
|
386 |
- break; |
|
387 |
- case FF_CMP_SSE: |
|
388 |
- cmp[i] = c->sse[i]; |
|
389 |
- break; |
|
390 |
- case FF_CMP_DCT: |
|
391 |
- cmp[i] = c->dct_sad[i]; |
|
392 |
- break; |
|
393 |
- case FF_CMP_DCT264: |
|
394 |
- cmp[i] = c->dct264_sad[i]; |
|
395 |
- break; |
|
396 |
- case FF_CMP_DCTMAX: |
|
397 |
- cmp[i] = c->dct_max[i]; |
|
398 |
- break; |
|
399 |
- case FF_CMP_PSNR: |
|
400 |
- cmp[i] = c->quant_psnr[i]; |
|
401 |
- break; |
|
402 |
- case FF_CMP_BIT: |
|
403 |
- cmp[i] = c->bit[i]; |
|
404 |
- break; |
|
405 |
- case FF_CMP_RD: |
|
406 |
- cmp[i] = c->rd[i]; |
|
407 |
- break; |
|
408 |
- case FF_CMP_VSAD: |
|
409 |
- cmp[i] = c->vsad[i]; |
|
410 |
- break; |
|
411 |
- case FF_CMP_VSSE: |
|
412 |
- cmp[i] = c->vsse[i]; |
|
413 |
- break; |
|
414 |
- case FF_CMP_ZERO: |
|
415 |
- cmp[i] = zero_cmp; |
|
416 |
- break; |
|
417 |
- case FF_CMP_NSSE: |
|
418 |
- cmp[i] = c->nsse[i]; |
|
419 |
- break; |
|
420 |
-#if CONFIG_DWT |
|
421 |
- case FF_CMP_W53: |
|
422 |
- cmp[i]= c->w53[i]; |
|
423 |
- break; |
|
424 |
- case FF_CMP_W97: |
|
425 |
- cmp[i]= c->w97[i]; |
|
426 |
- break; |
|
427 |
-#endif |
|
428 |
- default: |
|
429 |
- av_log(NULL, AV_LOG_ERROR, |
|
430 |
- "internal error in cmp function selection\n"); |
|
431 |
- } |
|
432 |
- } |
|
433 |
-} |
|
434 |
- |
|
435 |
-#define BUTTERFLY2(o1, o2, i1, i2) \ |
|
436 |
- o1 = (i1) + (i2); \ |
|
437 |
- o2 = (i1) - (i2); |
|
438 |
- |
|
439 |
-#define BUTTERFLY1(x, y) \ |
|
440 |
- { \ |
|
441 |
- int a, b; \ |
|
442 |
- a = x; \ |
|
443 |
- b = y; \ |
|
444 |
- x = a + b; \ |
|
445 |
- y = a - b; \ |
|
446 |
- } |
|
447 |
- |
|
448 |
-#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y))) |
|
449 |
- |
|
450 |
-static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, |
|
451 |
- uint8_t *src, int stride, int h) |
|
452 |
-{ |
|
453 |
- int i, temp[64], sum = 0; |
|
454 |
- |
|
455 |
- av_assert2(h == 8); |
|
456 |
- |
|
457 |
- for (i = 0; i < 8; i++) { |
|
458 |
- // FIXME: try pointer walks |
|
459 |
- BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
460 |
- src[stride * i + 0] - dst[stride * i + 0], |
|
461 |
- src[stride * i + 1] - dst[stride * i + 1]); |
|
462 |
- BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
463 |
- src[stride * i + 2] - dst[stride * i + 2], |
|
464 |
- src[stride * i + 3] - dst[stride * i + 3]); |
|
465 |
- BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
466 |
- src[stride * i + 4] - dst[stride * i + 4], |
|
467 |
- src[stride * i + 5] - dst[stride * i + 5]); |
|
468 |
- BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
469 |
- src[stride * i + 6] - dst[stride * i + 6], |
|
470 |
- src[stride * i + 7] - dst[stride * i + 7]); |
|
471 |
- |
|
472 |
- BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
473 |
- BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
474 |
- BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
475 |
- BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
476 |
- |
|
477 |
- BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
478 |
- BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
479 |
- BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
480 |
- BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
481 |
- } |
|
482 |
- |
|
483 |
- for (i = 0; i < 8; i++) { |
|
484 |
- BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
485 |
- BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
486 |
- BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
487 |
- BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
488 |
- |
|
489 |
- BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
490 |
- BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
491 |
- BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
492 |
- BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
493 |
- |
|
494 |
- sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + |
|
495 |
- BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + |
|
496 |
- BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + |
|
497 |
- BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
498 |
- } |
|
499 |
- return sum; |
|
500 |
-} |
|
501 |
- |
|
502 |
-static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, |
|
503 |
- uint8_t *dummy, int stride, int h) |
|
504 |
-{ |
|
505 |
- int i, temp[64], sum = 0; |
|
506 |
- |
|
507 |
- av_assert2(h == 8); |
|
508 |
- |
|
509 |
- for (i = 0; i < 8; i++) { |
|
510 |
- // FIXME: try pointer walks |
|
511 |
- BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
512 |
- src[stride * i + 0], src[stride * i + 1]); |
|
513 |
- BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
514 |
- src[stride * i + 2], src[stride * i + 3]); |
|
515 |
- BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
516 |
- src[stride * i + 4], src[stride * i + 5]); |
|
517 |
- BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
518 |
- src[stride * i + 6], src[stride * i + 7]); |
|
519 |
- |
|
520 |
- BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
521 |
- BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
522 |
- BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
523 |
- BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
524 |
- |
|
525 |
- BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
526 |
- BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
527 |
- BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
528 |
- BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
529 |
- } |
|
530 |
- |
|
531 |
- for (i = 0; i < 8; i++) { |
|
532 |
- BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
533 |
- BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
534 |
- BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
535 |
- BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
536 |
- |
|
537 |
- BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
538 |
- BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
539 |
- BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
540 |
- BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
541 |
- |
|
542 |
- sum += |
|
543 |
- BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) |
|
544 |
- + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) |
|
545 |
- + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) |
|
546 |
- + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
547 |
- } |
|
548 |
- |
|
549 |
- sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean |
|
550 |
- |
|
551 |
- return sum; |
|
552 |
-} |
|
553 |
- |
|
554 |
-static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, |
|
555 |
- uint8_t *src2, int stride, int h) |
|
556 |
-{ |
|
557 |
- LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
558 |
- |
|
559 |
- av_assert2(h == 8); |
|
560 |
- |
|
561 |
- s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
562 |
- s->fdsp.fdct(temp); |
|
563 |
- return s->dsp.sum_abs_dctelem(temp); |
|
564 |
-} |
|
565 |
- |
|
566 |
-#if CONFIG_GPL |
|
567 |
-#define DCT8_1D \ |
|
568 |
- { \ |
|
569 |
- const int s07 = SRC(0) + SRC(7); \ |
|
570 |
- const int s16 = SRC(1) + SRC(6); \ |
|
571 |
- const int s25 = SRC(2) + SRC(5); \ |
|
572 |
- const int s34 = SRC(3) + SRC(4); \ |
|
573 |
- const int a0 = s07 + s34; \ |
|
574 |
- const int a1 = s16 + s25; \ |
|
575 |
- const int a2 = s07 - s34; \ |
|
576 |
- const int a3 = s16 - s25; \ |
|
577 |
- const int d07 = SRC(0) - SRC(7); \ |
|
578 |
- const int d16 = SRC(1) - SRC(6); \ |
|
579 |
- const int d25 = SRC(2) - SRC(5); \ |
|
580 |
- const int d34 = SRC(3) - SRC(4); \ |
|
581 |
- const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \ |
|
582 |
- const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \ |
|
583 |
- const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \ |
|
584 |
- const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \ |
|
585 |
- DST(0, a0 + a1); \ |
|
586 |
- DST(1, a4 + (a7 >> 2)); \ |
|
587 |
- DST(2, a2 + (a3 >> 1)); \ |
|
588 |
- DST(3, a5 + (a6 >> 2)); \ |
|
589 |
- DST(4, a0 - a1); \ |
|
590 |
- DST(5, a6 - (a5 >> 2)); \ |
|
591 |
- DST(6, (a2 >> 1) - a3); \ |
|
592 |
- DST(7, (a4 >> 2) - a7); \ |
|
593 |
- } |
|
594 |
- |
|
595 |
-static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1, |
|
596 |
- uint8_t *src2, int stride, int h) |
|
597 |
-{ |
|
598 |
- int16_t dct[8][8]; |
|
599 |
- int i, sum = 0; |
|
600 |
- |
|
601 |
- s->pdsp.diff_pixels(dct[0], src1, src2, stride); |
|
602 |
- |
|
603 |
-#define SRC(x) dct[i][x] |
|
604 |
-#define DST(x, v) dct[i][x] = v |
|
605 |
- for (i = 0; i < 8; i++) |
|
606 |
- DCT8_1D |
|
607 |
-#undef SRC |
|
608 |
-#undef DST |
|
609 |
- |
|
610 |
-#define SRC(x) dct[x][i] |
|
611 |
-#define DST(x, v) sum += FFABS(v) |
|
612 |
- for (i = 0; i < 8; i++) |
|
613 |
- DCT8_1D |
|
614 |
-#undef SRC |
|
615 |
-#undef DST |
|
616 |
- return sum; |
|
617 |
-} |
|
618 |
-#endif |
|
619 |
- |
|
620 |
-static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, |
|
621 |
- uint8_t *src2, int stride, int h) |
|
622 |
-{ |
|
623 |
- LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
624 |
- int sum = 0, i; |
|
625 |
- |
|
626 |
- av_assert2(h == 8); |
|
627 |
- |
|
628 |
- s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
629 |
- s->fdsp.fdct(temp); |
|
630 |
- |
|
631 |
- for (i = 0; i < 64; i++) |
|
632 |
- sum = FFMAX(sum, FFABS(temp[i])); |
|
633 |
- |
|
634 |
- return sum; |
|
635 |
-} |
|
636 |
- |
|
637 |
-static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, |
|
638 |
- uint8_t *src2, int stride, int h) |
|
639 |
-{ |
|
640 |
- LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); |
|
641 |
- int16_t *const bak = temp + 64; |
|
642 |
- int sum = 0, i; |
|
643 |
- |
|
644 |
- av_assert2(h == 8); |
|
645 |
- s->mb_intra = 0; |
|
646 |
- |
|
647 |
- s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
648 |
- |
|
649 |
- memcpy(bak, temp, 64 * sizeof(int16_t)); |
|
650 |
- |
|
651 |
- s->block_last_index[0 /* FIXME */] = |
|
652 |
- s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
653 |
- s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
654 |
- ff_simple_idct_8(temp); // FIXME |
|
655 |
- |
|
656 |
- for (i = 0; i < 64; i++) |
|
657 |
- sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); |
|
658 |
- |
|
659 |
- return sum; |
|
660 |
-} |
|
661 |
- |
|
662 |
-static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
663 |
- int stride, int h) |
|
664 |
-{ |
|
665 |
- const uint8_t *scantable = s->intra_scantable.permutated; |
|
666 |
- LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
667 |
- LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); |
|
668 |
- LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); |
|
669 |
- int i, last, run, bits, level, distortion, start_i; |
|
670 |
- const int esc_length = s->ac_esc_length; |
|
671 |
- uint8_t *length, *last_length; |
|
672 |
- |
|
673 |
- av_assert2(h == 8); |
|
674 |
- |
|
675 |
- copy_block8(lsrc1, src1, 8, stride, 8); |
|
676 |
- copy_block8(lsrc2, src2, 8, stride, 8); |
|
677 |
- |
|
678 |
- s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); |
|
679 |
- |
|
680 |
- s->block_last_index[0 /* FIXME */] = |
|
681 |
- last = |
|
682 |
- s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
683 |
- |
|
684 |
- bits = 0; |
|
685 |
- |
|
686 |
- if (s->mb_intra) { |
|
687 |
- start_i = 1; |
|
688 |
- length = s->intra_ac_vlc_length; |
|
689 |
- last_length = s->intra_ac_vlc_last_length; |
|
690 |
- bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
691 |
- } else { |
|
692 |
- start_i = 0; |
|
693 |
- length = s->inter_ac_vlc_length; |
|
694 |
- last_length = s->inter_ac_vlc_last_length; |
|
695 |
- } |
|
696 |
- |
|
697 |
- if (last >= start_i) { |
|
698 |
- run = 0; |
|
699 |
- for (i = start_i; i < last; i++) { |
|
700 |
- int j = scantable[i]; |
|
701 |
- level = temp[j]; |
|
702 |
- |
|
703 |
- if (level) { |
|
704 |
- level += 64; |
|
705 |
- if ((level & (~127)) == 0) |
|
706 |
- bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
707 |
- else |
|
708 |
- bits += esc_length; |
|
709 |
- run = 0; |
|
710 |
- } else |
|
711 |
- run++; |
|
712 |
- } |
|
713 |
- i = scantable[last]; |
|
714 |
- |
|
715 |
- level = temp[i] + 64; |
|
716 |
- |
|
717 |
- av_assert2(level - 64); |
|
718 |
- |
|
719 |
- if ((level & (~127)) == 0) { |
|
720 |
- bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
721 |
- } else |
|
722 |
- bits += esc_length; |
|
723 |
- } |
|
724 |
- |
|
725 |
- if (last >= 0) { |
|
726 |
- if (s->mb_intra) |
|
727 |
- s->dct_unquantize_intra(s, temp, 0, s->qscale); |
|
728 |
- else |
|
729 |
- s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
730 |
- } |
|
731 |
- |
|
732 |
- s->idsp.idct_add(lsrc2, 8, temp); |
|
733 |
- |
|
734 |
- distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8); |
|
735 |
- |
|
736 |
- return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); |
|
737 |
-} |
|
738 |
- |
|
739 |
-static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
740 |
- int stride, int h) |
|
741 |
-{ |
|
742 |
- const uint8_t *scantable = s->intra_scantable.permutated; |
|
743 |
- LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
744 |
- int i, last, run, bits, level, start_i; |
|
745 |
- const int esc_length = s->ac_esc_length; |
|
746 |
- uint8_t *length, *last_length; |
|
747 |
- |
|
748 |
- av_assert2(h == 8); |
|
749 |
- |
|
750 |
- s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
751 |
- |
|
752 |
- s->block_last_index[0 /* FIXME */] = |
|
753 |
- last = |
|
754 |
- s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
755 |
- |
|
756 |
- bits = 0; |
|
757 |
- |
|
758 |
- if (s->mb_intra) { |
|
759 |
- start_i = 1; |
|
760 |
- length = s->intra_ac_vlc_length; |
|
761 |
- last_length = s->intra_ac_vlc_last_length; |
|
762 |
- bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
763 |
- } else { |
|
764 |
- start_i = 0; |
|
765 |
- length = s->inter_ac_vlc_length; |
|
766 |
- last_length = s->inter_ac_vlc_last_length; |
|
767 |
- } |
|
768 |
- |
|
769 |
- if (last >= start_i) { |
|
770 |
- run = 0; |
|
771 |
- for (i = start_i; i < last; i++) { |
|
772 |
- int j = scantable[i]; |
|
773 |
- level = temp[j]; |
|
774 |
- |
|
775 |
- if (level) { |
|
776 |
- level += 64; |
|
777 |
- if ((level & (~127)) == 0) |
|
778 |
- bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
779 |
- else |
|
780 |
- bits += esc_length; |
|
781 |
- run = 0; |
|
782 |
- } else |
|
783 |
- run++; |
|
784 |
- } |
|
785 |
- i = scantable[last]; |
|
786 |
- |
|
787 |
- level = temp[i] + 64; |
|
788 |
- |
|
789 |
- av_assert2(level - 64); |
|
790 |
- |
|
791 |
- if ((level & (~127)) == 0) |
|
792 |
- bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
793 |
- else |
|
794 |
- bits += esc_length; |
|
795 |
- } |
|
796 |
- |
|
797 |
- return bits; |
|
798 |
-} |
|
799 |
- |
|
800 |
-#define VSAD_INTRA(size) \ |
|
801 |
-static int vsad_intra ## size ## _c(MpegEncContext *c, \ |
|
802 |
- uint8_t *s, uint8_t *dummy, \ |
|
803 |
- int stride, int h) \ |
|
804 |
-{ \ |
|
805 |
- int score = 0, x, y; \ |
|
806 |
- \ |
|
807 |
- for (y = 1; y < h; y++) { \ |
|
808 |
- for (x = 0; x < size; x += 4) { \ |
|
809 |
- score += FFABS(s[x] - s[x + stride]) + \ |
|
810 |
- FFABS(s[x + 1] - s[x + stride + 1]) + \ |
|
811 |
- FFABS(s[x + 2] - s[x + 2 + stride]) + \ |
|
812 |
- FFABS(s[x + 3] - s[x + 3 + stride]); \ |
|
813 |
- } \ |
|
814 |
- s += stride; \ |
|
815 |
- } \ |
|
816 |
- \ |
|
817 |
- return score; \ |
|
818 |
-} |
|
819 |
-VSAD_INTRA(8) |
|
820 |
-VSAD_INTRA(16) |
|
821 |
- |
|
822 |
-#define VSAD(size) \ |
|
823 |
-static int vsad ## size ## _c(MpegEncContext *c, \ |
|
824 |
- uint8_t *s1, uint8_t *s2, \ |
|
825 |
- int stride, int h) \ |
|
826 |
-{ \ |
|
827 |
- int score = 0, x, y; \ |
|
828 |
- \ |
|
829 |
- for (y = 1; y < h; y++) { \ |
|
830 |
- for (x = 0; x < size; x++) \ |
|
831 |
- score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ |
|
832 |
- s1 += stride; \ |
|
833 |
- s2 += stride; \ |
|
834 |
- } \ |
|
835 |
- \ |
|
836 |
- return score; \ |
|
837 |
-} |
|
838 |
-VSAD(8) |
|
839 |
-VSAD(16) |
|
840 |
- |
|
841 |
-#define SQ(a) ((a) * (a)) |
|
842 |
-#define VSSE_INTRA(size) \ |
|
843 |
-static int vsse_intra ## size ## _c(MpegEncContext *c, \ |
|
844 |
- uint8_t *s, uint8_t *dummy, \ |
|
845 |
- int stride, int h) \ |
|
846 |
-{ \ |
|
847 |
- int score = 0, x, y; \ |
|
848 |
- \ |
|
849 |
- for (y = 1; y < h; y++) { \ |
|
850 |
- for (x = 0; x < size; x += 4) { \ |
|
851 |
- score += SQ(s[x] - s[x + stride]) + \ |
|
852 |
- SQ(s[x + 1] - s[x + stride + 1]) + \ |
|
853 |
- SQ(s[x + 2] - s[x + stride + 2]) + \ |
|
854 |
- SQ(s[x + 3] - s[x + stride + 3]); \ |
|
855 |
- } \ |
|
856 |
- s += stride; \ |
|
857 |
- } \ |
|
858 |
- \ |
|
859 |
- return score; \ |
|
860 |
-} |
|
861 |
-VSSE_INTRA(8) |
|
862 |
-VSSE_INTRA(16) |
|
863 |
- |
|
864 |
-#define VSSE(size) \ |
|
865 |
-static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, \ |
|
866 |
- int stride, int h) \ |
|
867 |
-{ \ |
|
868 |
- int score = 0, x, y; \ |
|
869 |
- \ |
|
870 |
- for (y = 1; y < h; y++) { \ |
|
871 |
- for (x = 0; x < size; x++) \ |
|
872 |
- score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ |
|
873 |
- s1 += stride; \ |
|
874 |
- s2 += stride; \ |
|
875 |
- } \ |
|
876 |
- \ |
|
877 |
- return score; \ |
|
878 |
-} |
|
879 |
-VSSE(8) |
|
880 |
-VSSE(16) |
|
881 |
- |
|
882 |
-#define WRAPPER8_16_SQ(name8, name16) \ |
|
883 |
-static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ |
|
884 |
- int stride, int h) \ |
|
885 |
-{ \ |
|
886 |
- int score = 0; \ |
|
887 |
- \ |
|
888 |
- score += name8(s, dst, src, stride, 8); \ |
|
889 |
- score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
890 |
- if (h == 16) { \ |
|
891 |
- dst += 8 * stride; \ |
|
892 |
- src += 8 * stride; \ |
|
893 |
- score += name8(s, dst, src, stride, 8); \ |
|
894 |
- score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
895 |
- } \ |
|
896 |
- return score; \ |
|
897 |
-} |
|
898 |
- |
|
899 |
-WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
|
900 |
-WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
|
901 |
-WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
|
902 |
-#if CONFIG_GPL |
|
903 |
-WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
|
904 |
-#endif |
|
905 |
-WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
|
906 |
-WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
|
907 |
-WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
|
908 |
-WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
|
909 |
- |
|
910 |
-/* init static data */ |
|
911 |
-av_cold void ff_dsputil_static_init(void) |
|
912 |
-{ |
|
913 |
- int i; |
|
914 |
- |
|
915 |
- for (i = 0; i < 512; i++) |
|
916 |
- ff_square_tab[i] = (i - 256) * (i - 256); |
|
917 |
-} |
|
918 |
- |
|
919 |
-int ff_check_alignment(void) |
|
920 |
-{ |
|
921 |
- static int did_fail = 0; |
|
922 |
- LOCAL_ALIGNED_16(int, aligned, [4]); |
|
923 |
- |
|
924 |
- if ((intptr_t)aligned & 15) { |
|
925 |
- if (!did_fail) { |
|
926 |
-#if HAVE_MMX || HAVE_ALTIVEC |
|
927 |
- av_log(NULL, AV_LOG_ERROR, |
|
928 |
- "Compiler did not align stack variables. Libavcodec has been miscompiled\n" |
|
929 |
- "and may be very slow or crash. This is not a bug in libavcodec,\n" |
|
930 |
- "but in the compiler. You may try recompiling using gcc >= 4.2.\n" |
|
931 |
- "Do not report crashes to FFmpeg developers.\n"); |
|
932 |
-#endif |
|
933 |
- did_fail=1; |
|
934 |
- } |
|
935 |
- return -1; |
|
936 |
- } |
|
937 |
- return 0; |
|
938 |
-} |
|
939 |
- |
|
940 |
-av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) |
|
941 |
-{ |
|
942 |
- ff_check_alignment(); |
|
943 |
- |
|
944 |
- c->sum_abs_dctelem = sum_abs_dctelem_c; |
|
945 |
- |
|
946 |
- /* TODO [0] 16 [1] 8 */ |
|
947 |
- c->pix_abs[0][0] = pix_abs16_c; |
|
948 |
- c->pix_abs[0][1] = pix_abs16_x2_c; |
|
949 |
- c->pix_abs[0][2] = pix_abs16_y2_c; |
|
950 |
- c->pix_abs[0][3] = pix_abs16_xy2_c; |
|
951 |
- c->pix_abs[1][0] = pix_abs8_c; |
|
952 |
- c->pix_abs[1][1] = pix_abs8_x2_c; |
|
953 |
- c->pix_abs[1][2] = pix_abs8_y2_c; |
|
954 |
- c->pix_abs[1][3] = pix_abs8_xy2_c; |
|
955 |
- |
|
956 |
-#define SET_CMP_FUNC(name) \ |
|
957 |
- c->name[0] = name ## 16_c; \ |
|
958 |
- c->name[1] = name ## 8x8_c; |
|
959 |
- |
|
960 |
- SET_CMP_FUNC(hadamard8_diff) |
|
961 |
- c->hadamard8_diff[4] = hadamard8_intra16_c; |
|
962 |
- c->hadamard8_diff[5] = hadamard8_intra8x8_c; |
|
963 |
- SET_CMP_FUNC(dct_sad) |
|
964 |
- SET_CMP_FUNC(dct_max) |
|
965 |
-#if CONFIG_GPL |
|
966 |
- SET_CMP_FUNC(dct264_sad) |
|
967 |
-#endif |
|
968 |
- c->sad[0] = pix_abs16_c; |
|
969 |
- c->sad[1] = pix_abs8_c; |
|
970 |
- c->sse[0] = sse16_c; |
|
971 |
- c->sse[1] = sse8_c; |
|
972 |
- c->sse[2] = sse4_c; |
|
973 |
- SET_CMP_FUNC(quant_psnr) |
|
974 |
- SET_CMP_FUNC(rd) |
|
975 |
- SET_CMP_FUNC(bit) |
|
976 |
- c->vsad[0] = vsad16_c; |
|
977 |
- c->vsad[1] = vsad8_c; |
|
978 |
- c->vsad[4] = vsad_intra16_c; |
|
979 |
- c->vsad[5] = vsad_intra8_c; |
|
980 |
- c->vsse[0] = vsse16_c; |
|
981 |
- c->vsse[1] = vsse8_c; |
|
982 |
- c->vsse[4] = vsse_intra16_c; |
|
983 |
- c->vsse[5] = vsse_intra8_c; |
|
984 |
- c->nsse[0] = nsse16_c; |
|
985 |
- c->nsse[1] = nsse8_c; |
|
986 |
-#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER |
|
987 |
- ff_dsputil_init_dwt(c); |
|
988 |
-#endif |
|
989 |
- |
|
990 |
- if (ARCH_ALPHA) |
|
991 |
- ff_dsputil_init_alpha(c, avctx); |
|
992 |
- if (ARCH_ARM) |
|
993 |
- ff_dsputil_init_arm(c, avctx); |
|
994 |
- if (ARCH_PPC) |
|
995 |
- ff_dsputil_init_ppc(c, avctx); |
|
996 |
- if (ARCH_X86) |
|
997 |
- ff_dsputil_init_x86(c, avctx); |
|
998 |
-} |
|
999 |
- |
|
1000 |
-av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
|
1001 |
-{ |
|
1002 |
- ff_dsputil_init(c, avctx); |
|
1003 |
-} |
|
1004 |
- |
|
1005 |
-av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx) |
|
1006 |
-{ |
|
1007 |
- ff_dsputil_init(c, avctx); |
|
1008 |
-} |
1009 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,108 +0,0 @@ |
1 |
-/* |
|
2 |
- * DSP utils |
|
3 |
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard |
|
4 |
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
- * |
|
6 |
- * This file is part of FFmpeg. |
|
7 |
- * |
|
8 |
- * FFmpeg is free software; you can redistribute it and/or |
|
9 |
- * modify it under the terms of the GNU Lesser General Public |
|
10 |
- * License as published by the Free Software Foundation; either |
|
11 |
- * version 2.1 of the License, or (at your option) any later version. |
|
12 |
- * |
|
13 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
14 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
- * Lesser General Public License for more details. |
|
17 |
- * |
|
18 |
- * You should have received a copy of the GNU Lesser General Public |
|
19 |
- * License along with FFmpeg; if not, write to the Free Software |
|
20 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
- */ |
|
22 |
- |
|
23 |
-/** |
|
24 |
- * @file |
|
25 |
- * DSP utils. |
|
26 |
- * Note, many functions in here may use MMX which trashes the FPU state, it is |
|
27 |
- * absolutely necessary to call emms_c() between DSP & float/double code. |
|
28 |
- */ |
|
29 |
- |
|
30 |
-#ifndef AVCODEC_DSPUTIL_H |
|
31 |
-#define AVCODEC_DSPUTIL_H |
|
32 |
- |
|
33 |
-#include "avcodec.h" |
|
34 |
- |
|
35 |
-extern uint32_t ff_square_tab[512]; |
|
36 |
- |
|
37 |
- |
|
38 |
-/* minimum alignment rules ;) |
|
39 |
- * If you notice errors in the align stuff, need more alignment for some ASM code |
|
40 |
- * for some CPU or need to use a function with less aligned data then send a mail |
|
41 |
- * to the ffmpeg-devel mailing list, ... |
|
42 |
- * |
|
43 |
- * !warning These alignments might not match reality, (missing attribute((align)) |
|
44 |
- * stuff somewhere possible). |
|
45 |
- * I (Michael) did not check them, these are just the alignments which I think |
|
46 |
- * could be reached easily ... |
|
47 |
- * |
|
48 |
- * !future video codecs might need functions with less strict alignment |
|
49 |
- */ |
|
50 |
- |
|
51 |
-struct MpegEncContext; |
|
52 |
-/* Motion estimation: |
|
53 |
- * h is limited to { width / 2, width, 2 * width }, |
|
54 |
- * but never larger than 16 and never smaller than 2. |
|
55 |
- * Although currently h < 4 is not used as functions with |
|
56 |
- * width < 8 are neither used nor implemented. */ |
|
57 |
-typedef int (*me_cmp_func)(struct MpegEncContext *c, |
|
58 |
- uint8_t *blk1 /* align width (8 or 16) */, |
|
59 |
- uint8_t *blk2 /* align 1 */, int line_size, int h); |
|
60 |
- |
|
61 |
-/** |
|
62 |
- * DSPContext. |
|
63 |
- */ |
|
64 |
-typedef struct DSPContext { |
|
65 |
- int (*sum_abs_dctelem)(int16_t *block /* align 16 */); |
|
66 |
- |
|
67 |
- me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ |
|
68 |
- me_cmp_func sse[6]; |
|
69 |
- me_cmp_func hadamard8_diff[6]; |
|
70 |
- me_cmp_func dct_sad[6]; |
|
71 |
- me_cmp_func quant_psnr[6]; |
|
72 |
- me_cmp_func bit[6]; |
|
73 |
- me_cmp_func rd[6]; |
|
74 |
- me_cmp_func vsad[6]; |
|
75 |
- me_cmp_func vsse[6]; |
|
76 |
- me_cmp_func nsse[6]; |
|
77 |
- me_cmp_func w53[6]; |
|
78 |
- me_cmp_func w97[6]; |
|
79 |
- me_cmp_func dct_max[6]; |
|
80 |
- me_cmp_func dct264_sad[6]; |
|
81 |
- |
|
82 |
- me_cmp_func me_pre_cmp[6]; |
|
83 |
- me_cmp_func me_cmp[6]; |
|
84 |
- me_cmp_func me_sub_cmp[6]; |
|
85 |
- me_cmp_func mb_cmp[6]; |
|
86 |
- me_cmp_func ildct_cmp[6]; // only width 16 used |
|
87 |
- me_cmp_func frame_skip_cmp[6]; // only width 8 used |
|
88 |
- |
|
89 |
- me_cmp_func pix_abs[2][4]; |
|
90 |
-} DSPContext; |
|
91 |
- |
|
92 |
-void ff_dsputil_static_init(void); |
|
93 |
-void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx); |
|
94 |
-void avpriv_dsputil_init(DSPContext* p, AVCodecContext *avctx); |
|
95 |
-attribute_deprecated void dsputil_init(DSPContext* c, AVCodecContext *avctx); |
|
96 |
- |
|
97 |
-int ff_check_alignment(void); |
|
98 |
- |
|
99 |
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type); |
|
100 |
- |
|
101 |
-void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx); |
|
102 |
-void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx); |
|
103 |
-void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx); |
|
104 |
-void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx); |
|
105 |
- |
|
106 |
-void ff_dsputil_init_dwt(DSPContext *c); |
|
107 |
- |
|
108 |
-#endif /* AVCODEC_DSPUTIL_H */ |
... | ... |
@@ -28,9 +28,9 @@ |
28 | 28 |
#include "libavutil/pixdesc.h" |
29 | 29 |
#include "config.h" |
30 | 30 |
#include "avcodec.h" |
31 |
-#include "dsputil.h" |
|
32 | 31 |
#include "fdctdsp.h" |
33 | 32 |
#include "internal.h" |
33 |
+#include "me_cmp.h" |
|
34 | 34 |
#include "pixblockdsp.h" |
35 | 35 |
#include "put_bits.h" |
36 | 36 |
#include "dv.h" |
... | ... |
@@ -40,8 +40,8 @@ |
40 | 40 |
static av_cold int dvvideo_encode_init(AVCodecContext *avctx) |
41 | 41 |
{ |
42 | 42 |
DVVideoContext *s = avctx->priv_data; |
43 |
- DSPContext dsp; |
|
44 | 43 |
FDCTDSPContext fdsp; |
44 |
+ MECmpContext mecc; |
|
45 | 45 |
PixblockDSPContext pdsp; |
46 | 46 |
int ret; |
47 | 47 |
|
... | ... |
@@ -69,14 +69,14 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx) |
69 | 69 |
|
70 | 70 |
dv_vlc_map_tableinit(); |
71 | 71 |
|
72 |
- memset(&dsp,0, sizeof(dsp)); |
|
73 |
- ff_dsputil_init(&dsp, avctx); |
|
72 |
+ memset(&mecc,0, sizeof(mecc)); |
|
74 | 73 |
ff_fdctdsp_init(&fdsp, avctx); |
74 |
+ ff_me_cmp_init(&mecc, avctx); |
|
75 | 75 |
ff_pixblockdsp_init(&pdsp, avctx); |
76 |
- ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp); |
|
76 |
+ ff_set_cmp(&mecc, mecc.ildct_cmp, avctx->ildct_cmp); |
|
77 | 77 |
|
78 | 78 |
s->get_pixels = pdsp.get_pixels; |
79 |
- s->ildct_cmp = dsp.ildct_cmp[5]; |
|
79 |
+ s->ildct_cmp = mecc.ildct_cmp[5]; |
|
80 | 80 |
|
81 | 81 |
s->fdct[0] = fdsp.fdct; |
82 | 82 |
s->fdct[1] = fdsp.fdct248; |
... | ... |
@@ -739,12 +739,12 @@ static int is_intra_more_likely(ERContext *s) |
739 | 739 |
} else { |
740 | 740 |
ff_thread_await_progress(s->last_pic.tf, mb_y, 0); |
741 | 741 |
} |
742 |
- is_intra_likely += s->dsp->sad[0](NULL, last_mb_ptr, mb_ptr, |
|
743 |
- linesize[0], 16); |
|
742 |
+ is_intra_likely += s->mecc->sad[0](NULL, last_mb_ptr, mb_ptr, |
|
743 |
+ linesize[0], 16); |
|
744 | 744 |
// FIXME need await_progress() here |
745 |
- is_intra_likely -= s->dsp->sad[0](NULL, last_mb_ptr, |
|
746 |
- last_mb_ptr + linesize[0] * 16, |
|
747 |
- linesize[0], 16); |
|
745 |
+ is_intra_likely -= s->mecc->sad[0](NULL, last_mb_ptr, |
|
746 |
+ last_mb_ptr + linesize[0] * 16, |
|
747 |
+ linesize[0], 16); |
|
748 | 748 |
} else { |
749 | 749 |
if (IS_INTRA(s->cur_pic.mb_type[mb_xy])) |
750 | 750 |
is_intra_likely++; |
... | ... |
@@ -23,7 +23,7 @@ |
23 | 23 |
#include <stdint.h> |
24 | 24 |
|
25 | 25 |
#include "avcodec.h" |
26 |
-#include "dsputil.h" |
|
26 |
+#include "me_cmp.h" |
|
27 | 27 |
#include "thread.h" |
28 | 28 |
|
29 | 29 |
///< current MB is the first after a resync marker |
... | ... |
@@ -52,7 +52,7 @@ typedef struct ERPicture { |
52 | 52 |
|
53 | 53 |
typedef struct ERContext { |
54 | 54 |
AVCodecContext *avctx; |
55 |
- DSPContext *dsp; |
|
55 |
+ MECmpContext *mecc; |
|
56 | 56 |
|
57 | 57 |
int *mb_index2xy; |
58 | 58 |
int mb_num; |
... | ... |
@@ -36,7 +36,6 @@ |
36 | 36 |
#include "internal.h" |
37 | 37 |
#include "cabac.h" |
38 | 38 |
#include "cabac_functions.h" |
39 |
-#include "dsputil.h" |
|
40 | 39 |
#include "error_resilience.h" |
41 | 40 |
#include "avcodec.h" |
42 | 41 |
#include "h264.h" |
... | ... |
@@ -45,6 +44,7 @@ |
45 | 45 |
#include "h264_mvpred.h" |
46 | 46 |
#include "golomb.h" |
47 | 47 |
#include "mathops.h" |
48 |
+#include "me_cmp.h" |
|
48 | 49 |
#include "mpegutils.h" |
49 | 50 |
#include "rectangle.h" |
50 | 51 |
#include "svq3.h" |
... | ... |
@@ -515,7 +515,7 @@ int ff_h264_context_init(H264Context *h) |
515 | 515 |
if (CONFIG_ERROR_RESILIENCE) { |
516 | 516 |
/* init ER */ |
517 | 517 |
er->avctx = h->avctx; |
518 |
- er->dsp = &h->dsp; |
|
518 |
+ er->mecc = &h->mecc; |
|
519 | 519 |
er->decode_mb = h264_er_decode_mb; |
520 | 520 |
er->opaque = h; |
521 | 521 |
er->quarter_sample = 1; |
... | ... |
@@ -653,7 +653,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx) |
653 | 653 |
|
654 | 654 |
/* needed so that IDCT permutation is known early */ |
655 | 655 |
if (CONFIG_ERROR_RESILIENCE) |
656 |
- ff_dsputil_init(&h->dsp, h->avctx); |
|
656 |
+ ff_me_cmp_init(&h->mecc, h->avctx); |
|
657 | 657 |
ff_videodsp_init(&h->vdsp, 8); |
658 | 658 |
|
659 | 659 |
memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t)); |
... | ... |
@@ -1266,7 +1266,7 @@ int ff_h264_set_parameter_from_sps(H264Context *h) |
1266 | 1266 |
h->sps.chroma_format_idc); |
1267 | 1267 |
|
1268 | 1268 |
if (CONFIG_ERROR_RESILIENCE) |
1269 |
- ff_dsputil_init(&h->dsp, h->avctx); |
|
1269 |
+ ff_me_cmp_init(&h->mecc, h->avctx); |
|
1270 | 1270 |
ff_videodsp_init(&h->vdsp, h->sps.bit_depth_luma); |
1271 | 1271 |
} else { |
1272 | 1272 |
av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n", |
... | ... |
@@ -30,13 +30,13 @@ |
30 | 30 |
|
31 | 31 |
#include "libavutil/intreadwrite.h" |
32 | 32 |
#include "cabac.h" |
33 |
-#include "dsputil.h" |
|
34 | 33 |
#include "error_resilience.h" |
35 | 34 |
#include "get_bits.h" |
36 | 35 |
#include "h264chroma.h" |
37 | 36 |
#include "h264dsp.h" |
38 | 37 |
#include "h264pred.h" |
39 | 38 |
#include "h264qpel.h" |
39 |
+#include "me_cmp.h" |
|
40 | 40 |
#include "mpegutils.h" |
41 | 41 |
#include "parser.h" |
42 | 42 |
#include "qpeldsp.h" |
... | ... |
@@ -338,13 +338,13 @@ typedef struct H264Picture { |
338 | 338 |
*/ |
339 | 339 |
typedef struct H264Context { |
340 | 340 |
AVCodecContext *avctx; |
341 |
+ MECmpContext mecc; |
|
341 | 342 |
VideoDSPContext vdsp; |
342 | 343 |
H264DSPContext h264dsp; |
343 | 344 |
H264ChromaContext h264chroma; |
344 | 345 |
H264QpelContext h264qpel; |
345 | 346 |
ParseContext parse_context; |
346 | 347 |
GetBitContext gb; |
347 |
- DSPContext dsp; |
|
348 | 348 |
ERContext er; |
349 | 349 |
|
350 | 350 |
H264Picture *DPB; |
... | ... |
@@ -31,7 +31,6 @@ |
31 | 31 |
#include "internal.h" |
32 | 32 |
#include "cabac.h" |
33 | 33 |
#include "cabac_functions.h" |
34 |
-#include "dsputil.h" |
|
35 | 34 |
#include "error_resilience.h" |
36 | 35 |
#include "avcodec.h" |
37 | 36 |
#include "h264.h" |
... | ... |
@@ -1203,7 +1202,7 @@ static int h264_slice_header_init(H264Context *h, int reinit) |
1203 | 1203 |
return AVERROR(ENOMEM); |
1204 | 1204 |
c->avctx = h->avctx; |
1205 | 1205 |
if (CONFIG_ERROR_RESILIENCE) { |
1206 |
- c->dsp = h->dsp; |
|
1206 |
+ c->mecc = h->mecc; |
|
1207 | 1207 |
} |
1208 | 1208 |
c->vdsp = h->vdsp; |
1209 | 1209 |
c->h264dsp = h->h264dsp; |
... | ... |
@@ -3,8 +3,6 @@ LIBAVCODEC_$MAJOR { |
3 | 3 |
#deprecated, remove after next bump |
4 | 4 |
audio_resample; |
5 | 5 |
audio_resample_close; |
6 |
- dsputil_init; |
|
7 |
- ff_dsputil_init; |
|
8 | 6 |
ff_find_pix_fmt; |
9 | 7 |
ff_framenum_to_drop_timecode; |
10 | 8 |
ff_framenum_to_smtpe_timecode; |
... | ... |
@@ -30,5 +28,6 @@ LIBAVCODEC_$MAJOR { |
30 | 30 |
ff_idctdsp_init; |
31 | 31 |
ff_fdctdsp_init; |
32 | 32 |
ff_pixblockdsp_init; |
33 |
+ ff_me_cmp_init; |
|
33 | 34 |
local: *; |
34 | 35 |
}; |
35 | 36 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,988 @@ |
0 |
+/* |
|
1 |
+ * This file is part of FFmpeg. |
|
2 |
+ * |
|
3 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#include "libavutil/attributes.h" |
|
19 |
+#include "libavutil/internal.h" |
|
20 |
+#include "avcodec.h" |
|
21 |
+#include "copy_block.h" |
|
22 |
+#include "simple_idct.h" |
|
23 |
+#include "me_cmp.h" |
|
24 |
+#include "mpegvideo.h" |
|
25 |
+#include "config.h" |
|
26 |
+ |
|
27 |
+uint32_t ff_square_tab[512] = { 0, }; |
|
28 |
+ |
|
29 |
+static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
30 |
+ int line_size, int h) |
|
31 |
+{ |
|
32 |
+ int s = 0, i; |
|
33 |
+ uint32_t *sq = ff_square_tab + 256; |
|
34 |
+ |
|
35 |
+ for (i = 0; i < h; i++) { |
|
36 |
+ s += sq[pix1[0] - pix2[0]]; |
|
37 |
+ s += sq[pix1[1] - pix2[1]]; |
|
38 |
+ s += sq[pix1[2] - pix2[2]]; |
|
39 |
+ s += sq[pix1[3] - pix2[3]]; |
|
40 |
+ pix1 += line_size; |
|
41 |
+ pix2 += line_size; |
|
42 |
+ } |
|
43 |
+ return s; |
|
44 |
+} |
|
45 |
+ |
|
46 |
+static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
47 |
+ int line_size, int h) |
|
48 |
+{ |
|
49 |
+ int s = 0, i; |
|
50 |
+ uint32_t *sq = ff_square_tab + 256; |
|
51 |
+ |
|
52 |
+ for (i = 0; i < h; i++) { |
|
53 |
+ s += sq[pix1[0] - pix2[0]]; |
|
54 |
+ s += sq[pix1[1] - pix2[1]]; |
|
55 |
+ s += sq[pix1[2] - pix2[2]]; |
|
56 |
+ s += sq[pix1[3] - pix2[3]]; |
|
57 |
+ s += sq[pix1[4] - pix2[4]]; |
|
58 |
+ s += sq[pix1[5] - pix2[5]]; |
|
59 |
+ s += sq[pix1[6] - pix2[6]]; |
|
60 |
+ s += sq[pix1[7] - pix2[7]]; |
|
61 |
+ pix1 += line_size; |
|
62 |
+ pix2 += line_size; |
|
63 |
+ } |
|
64 |
+ return s; |
|
65 |
+} |
|
66 |
+ |
|
67 |
+static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
68 |
+ int line_size, int h) |
|
69 |
+{ |
|
70 |
+ int s = 0, i; |
|
71 |
+ uint32_t *sq = ff_square_tab + 256; |
|
72 |
+ |
|
73 |
+ for (i = 0; i < h; i++) { |
|
74 |
+ s += sq[pix1[0] - pix2[0]]; |
|
75 |
+ s += sq[pix1[1] - pix2[1]]; |
|
76 |
+ s += sq[pix1[2] - pix2[2]]; |
|
77 |
+ s += sq[pix1[3] - pix2[3]]; |
|
78 |
+ s += sq[pix1[4] - pix2[4]]; |
|
79 |
+ s += sq[pix1[5] - pix2[5]]; |
|
80 |
+ s += sq[pix1[6] - pix2[6]]; |
|
81 |
+ s += sq[pix1[7] - pix2[7]]; |
|
82 |
+ s += sq[pix1[8] - pix2[8]]; |
|
83 |
+ s += sq[pix1[9] - pix2[9]]; |
|
84 |
+ s += sq[pix1[10] - pix2[10]]; |
|
85 |
+ s += sq[pix1[11] - pix2[11]]; |
|
86 |
+ s += sq[pix1[12] - pix2[12]]; |
|
87 |
+ s += sq[pix1[13] - pix2[13]]; |
|
88 |
+ s += sq[pix1[14] - pix2[14]]; |
|
89 |
+ s += sq[pix1[15] - pix2[15]]; |
|
90 |
+ |
|
91 |
+ pix1 += line_size; |
|
92 |
+ pix2 += line_size; |
|
93 |
+ } |
|
94 |
+ return s; |
|
95 |
+} |
|
96 |
+ |
|
97 |
+static int sum_abs_dctelem_c(int16_t *block) |
|
98 |
+{ |
|
99 |
+ int sum = 0, i; |
|
100 |
+ |
|
101 |
+ for (i = 0; i < 64; i++) |
|
102 |
+ sum += FFABS(block[i]); |
|
103 |
+ return sum; |
|
104 |
+} |
|
105 |
+ |
|
106 |
+#define avg2(a, b) ((a + b + 1) >> 1) |
|
107 |
+#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) |
|
108 |
+ |
|
109 |
+static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
110 |
+ int line_size, int h) |
|
111 |
+{ |
|
112 |
+ int s = 0, i; |
|
113 |
+ |
|
114 |
+ for (i = 0; i < h; i++) { |
|
115 |
+ s += abs(pix1[0] - pix2[0]); |
|
116 |
+ s += abs(pix1[1] - pix2[1]); |
|
117 |
+ s += abs(pix1[2] - pix2[2]); |
|
118 |
+ s += abs(pix1[3] - pix2[3]); |
|
119 |
+ s += abs(pix1[4] - pix2[4]); |
|
120 |
+ s += abs(pix1[5] - pix2[5]); |
|
121 |
+ s += abs(pix1[6] - pix2[6]); |
|
122 |
+ s += abs(pix1[7] - pix2[7]); |
|
123 |
+ s += abs(pix1[8] - pix2[8]); |
|
124 |
+ s += abs(pix1[9] - pix2[9]); |
|
125 |
+ s += abs(pix1[10] - pix2[10]); |
|
126 |
+ s += abs(pix1[11] - pix2[11]); |
|
127 |
+ s += abs(pix1[12] - pix2[12]); |
|
128 |
+ s += abs(pix1[13] - pix2[13]); |
|
129 |
+ s += abs(pix1[14] - pix2[14]); |
|
130 |
+ s += abs(pix1[15] - pix2[15]); |
|
131 |
+ pix1 += line_size; |
|
132 |
+ pix2 += line_size; |
|
133 |
+ } |
|
134 |
+ return s; |
|
135 |
+} |
|
136 |
+ |
|
137 |
+static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
138 |
+ int line_size, int h) |
|
139 |
+{ |
|
140 |
+ int s = 0, i; |
|
141 |
+ |
|
142 |
+ for (i = 0; i < h; i++) { |
|
143 |
+ s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
144 |
+ s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
145 |
+ s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
146 |
+ s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
147 |
+ s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
148 |
+ s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
149 |
+ s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
150 |
+ s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
151 |
+ s += abs(pix1[8] - avg2(pix2[8], pix2[9])); |
|
152 |
+ s += abs(pix1[9] - avg2(pix2[9], pix2[10])); |
|
153 |
+ s += abs(pix1[10] - avg2(pix2[10], pix2[11])); |
|
154 |
+ s += abs(pix1[11] - avg2(pix2[11], pix2[12])); |
|
155 |
+ s += abs(pix1[12] - avg2(pix2[12], pix2[13])); |
|
156 |
+ s += abs(pix1[13] - avg2(pix2[13], pix2[14])); |
|
157 |
+ s += abs(pix1[14] - avg2(pix2[14], pix2[15])); |
|
158 |
+ s += abs(pix1[15] - avg2(pix2[15], pix2[16])); |
|
159 |
+ pix1 += line_size; |
|
160 |
+ pix2 += line_size; |
|
161 |
+ } |
|
162 |
+ return s; |
|
163 |
+} |
|
164 |
+ |
|
165 |
+static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
166 |
+ int line_size, int h) |
|
167 |
+{ |
|
168 |
+ int s = 0, i; |
|
169 |
+ uint8_t *pix3 = pix2 + line_size; |
|
170 |
+ |
|
171 |
+ for (i = 0; i < h; i++) { |
|
172 |
+ s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
173 |
+ s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
174 |
+ s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
175 |
+ s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
176 |
+ s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
177 |
+ s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
178 |
+ s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
179 |
+ s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
180 |
+ s += abs(pix1[8] - avg2(pix2[8], pix3[8])); |
|
181 |
+ s += abs(pix1[9] - avg2(pix2[9], pix3[9])); |
|
182 |
+ s += abs(pix1[10] - avg2(pix2[10], pix3[10])); |
|
183 |
+ s += abs(pix1[11] - avg2(pix2[11], pix3[11])); |
|
184 |
+ s += abs(pix1[12] - avg2(pix2[12], pix3[12])); |
|
185 |
+ s += abs(pix1[13] - avg2(pix2[13], pix3[13])); |
|
186 |
+ s += abs(pix1[14] - avg2(pix2[14], pix3[14])); |
|
187 |
+ s += abs(pix1[15] - avg2(pix2[15], pix3[15])); |
|
188 |
+ pix1 += line_size; |
|
189 |
+ pix2 += line_size; |
|
190 |
+ pix3 += line_size; |
|
191 |
+ } |
|
192 |
+ return s; |
|
193 |
+} |
|
194 |
+ |
|
195 |
+static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
196 |
+ int line_size, int h) |
|
197 |
+{ |
|
198 |
+ int s = 0, i; |
|
199 |
+ uint8_t *pix3 = pix2 + line_size; |
|
200 |
+ |
|
201 |
+ for (i = 0; i < h; i++) { |
|
202 |
+ s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
203 |
+ s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
204 |
+ s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
205 |
+ s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
206 |
+ s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
207 |
+ s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
208 |
+ s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
209 |
+ s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
210 |
+ s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); |
|
211 |
+ s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); |
|
212 |
+ s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); |
|
213 |
+ s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); |
|
214 |
+ s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); |
|
215 |
+ s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); |
|
216 |
+ s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); |
|
217 |
+ s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); |
|
218 |
+ pix1 += line_size; |
|
219 |
+ pix2 += line_size; |
|
220 |
+ pix3 += line_size; |
|
221 |
+ } |
|
222 |
+ return s; |
|
223 |
+} |
|
224 |
+ |
|
225 |
+static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
226 |
+ int line_size, int h) |
|
227 |
+{ |
|
228 |
+ int s = 0, i; |
|
229 |
+ |
|
230 |
+ for (i = 0; i < h; i++) { |
|
231 |
+ s += abs(pix1[0] - pix2[0]); |
|
232 |
+ s += abs(pix1[1] - pix2[1]); |
|
233 |
+ s += abs(pix1[2] - pix2[2]); |
|
234 |
+ s += abs(pix1[3] - pix2[3]); |
|
235 |
+ s += abs(pix1[4] - pix2[4]); |
|
236 |
+ s += abs(pix1[5] - pix2[5]); |
|
237 |
+ s += abs(pix1[6] - pix2[6]); |
|
238 |
+ s += abs(pix1[7] - pix2[7]); |
|
239 |
+ pix1 += line_size; |
|
240 |
+ pix2 += line_size; |
|
241 |
+ } |
|
242 |
+ return s; |
|
243 |
+} |
|
244 |
+ |
|
245 |
+static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
246 |
+ int line_size, int h) |
|
247 |
+{ |
|
248 |
+ int s = 0, i; |
|
249 |
+ |
|
250 |
+ for (i = 0; i < h; i++) { |
|
251 |
+ s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
|
252 |
+ s += abs(pix1[1] - avg2(pix2[1], pix2[2])); |
|
253 |
+ s += abs(pix1[2] - avg2(pix2[2], pix2[3])); |
|
254 |
+ s += abs(pix1[3] - avg2(pix2[3], pix2[4])); |
|
255 |
+ s += abs(pix1[4] - avg2(pix2[4], pix2[5])); |
|
256 |
+ s += abs(pix1[5] - avg2(pix2[5], pix2[6])); |
|
257 |
+ s += abs(pix1[6] - avg2(pix2[6], pix2[7])); |
|
258 |
+ s += abs(pix1[7] - avg2(pix2[7], pix2[8])); |
|
259 |
+ pix1 += line_size; |
|
260 |
+ pix2 += line_size; |
|
261 |
+ } |
|
262 |
+ return s; |
|
263 |
+} |
|
264 |
+ |
|
265 |
+static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
266 |
+ int line_size, int h) |
|
267 |
+{ |
|
268 |
+ int s = 0, i; |
|
269 |
+ uint8_t *pix3 = pix2 + line_size; |
|
270 |
+ |
|
271 |
+ for (i = 0; i < h; i++) { |
|
272 |
+ s += abs(pix1[0] - avg2(pix2[0], pix3[0])); |
|
273 |
+ s += abs(pix1[1] - avg2(pix2[1], pix3[1])); |
|
274 |
+ s += abs(pix1[2] - avg2(pix2[2], pix3[2])); |
|
275 |
+ s += abs(pix1[3] - avg2(pix2[3], pix3[3])); |
|
276 |
+ s += abs(pix1[4] - avg2(pix2[4], pix3[4])); |
|
277 |
+ s += abs(pix1[5] - avg2(pix2[5], pix3[5])); |
|
278 |
+ s += abs(pix1[6] - avg2(pix2[6], pix3[6])); |
|
279 |
+ s += abs(pix1[7] - avg2(pix2[7], pix3[7])); |
|
280 |
+ pix1 += line_size; |
|
281 |
+ pix2 += line_size; |
|
282 |
+ pix3 += line_size; |
|
283 |
+ } |
|
284 |
+ return s; |
|
285 |
+} |
|
286 |
+ |
|
287 |
+static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
288 |
+ int line_size, int h) |
|
289 |
+{ |
|
290 |
+ int s = 0, i; |
|
291 |
+ uint8_t *pix3 = pix2 + line_size; |
|
292 |
+ |
|
293 |
+ for (i = 0; i < h; i++) { |
|
294 |
+ s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); |
|
295 |
+ s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); |
|
296 |
+ s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); |
|
297 |
+ s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); |
|
298 |
+ s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); |
|
299 |
+ s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); |
|
300 |
+ s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); |
|
301 |
+ s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); |
|
302 |
+ pix1 += line_size; |
|
303 |
+ pix2 += line_size; |
|
304 |
+ pix3 += line_size; |
|
305 |
+ } |
|
306 |
+ return s; |
|
307 |
+} |
|
308 |
+ |
|
309 |
+static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
310 |
+{ |
|
311 |
+ int score1 = 0, score2 = 0, x, y; |
|
312 |
+ |
|
313 |
+ for (y = 0; y < h; y++) { |
|
314 |
+ for (x = 0; x < 16; x++) |
|
315 |
+ score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
316 |
+ if (y + 1 < h) { |
|
317 |
+ for (x = 0; x < 15; x++) |
|
318 |
+ score2 += FFABS(s1[x] - s1[x + stride] - |
|
319 |
+ s1[x + 1] + s1[x + stride + 1]) - |
|
320 |
+ FFABS(s2[x] - s2[x + stride] - |
|
321 |
+ s2[x + 1] + s2[x + stride + 1]); |
|
322 |
+ } |
|
323 |
+ s1 += stride; |
|
324 |
+ s2 += stride; |
|
325 |
+ } |
|
326 |
+ |
|
327 |
+ if (c) |
|
328 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
329 |
+ else |
|
330 |
+ return score1 + FFABS(score2) * 8; |
|
331 |
+} |
|
332 |
+ |
|
333 |
+static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) |
|
334 |
+{ |
|
335 |
+ int score1 = 0, score2 = 0, x, y; |
|
336 |
+ |
|
337 |
+ for (y = 0; y < h; y++) { |
|
338 |
+ for (x = 0; x < 8; x++) |
|
339 |
+ score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); |
|
340 |
+ if (y + 1 < h) { |
|
341 |
+ for (x = 0; x < 7; x++) |
|
342 |
+ score2 += FFABS(s1[x] - s1[x + stride] - |
|
343 |
+ s1[x + 1] + s1[x + stride + 1]) - |
|
344 |
+ FFABS(s2[x] - s2[x + stride] - |
|
345 |
+ s2[x + 1] + s2[x + stride + 1]); |
|
346 |
+ } |
|
347 |
+ s1 += stride; |
|
348 |
+ s2 += stride; |
|
349 |
+ } |
|
350 |
+ |
|
351 |
+ if (c) |
|
352 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
353 |
+ else |
|
354 |
+ return score1 + FFABS(score2) * 8; |
|
355 |
+} |
|
356 |
+ |
|
357 |
+static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, |
|
358 |
+ int stride, int h) |
|
359 |
+{ |
|
360 |
+ return 0; |
|
361 |
+} |
|
362 |
+ |
|
363 |
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type) |
|
364 |
+{ |
|
365 |
+ int i; |
|
366 |
+ |
|
367 |
+ memset(cmp, 0, sizeof(void *) * 6); |
|
368 |
+ |
|
369 |
+ for (i = 0; i < 6; i++) { |
|
370 |
+ switch (type & 0xFF) { |
|
371 |
+ case FF_CMP_SAD: |
|
372 |
+ cmp[i] = c->sad[i]; |
|
373 |
+ break; |
|
374 |
+ case FF_CMP_SATD: |
|
375 |
+ cmp[i] = c->hadamard8_diff[i]; |
|
376 |
+ break; |
|
377 |
+ case FF_CMP_SSE: |
|
378 |
+ cmp[i] = c->sse[i]; |
|
379 |
+ break; |
|
380 |
+ case FF_CMP_DCT: |
|
381 |
+ cmp[i] = c->dct_sad[i]; |
|
382 |
+ break; |
|
383 |
+ case FF_CMP_DCT264: |
|
384 |
+ cmp[i] = c->dct264_sad[i]; |
|
385 |
+ break; |
|
386 |
+ case FF_CMP_DCTMAX: |
|
387 |
+ cmp[i] = c->dct_max[i]; |
|
388 |
+ break; |
|
389 |
+ case FF_CMP_PSNR: |
|
390 |
+ cmp[i] = c->quant_psnr[i]; |
|
391 |
+ break; |
|
392 |
+ case FF_CMP_BIT: |
|
393 |
+ cmp[i] = c->bit[i]; |
|
394 |
+ break; |
|
395 |
+ case FF_CMP_RD: |
|
396 |
+ cmp[i] = c->rd[i]; |
|
397 |
+ break; |
|
398 |
+ case FF_CMP_VSAD: |
|
399 |
+ cmp[i] = c->vsad[i]; |
|
400 |
+ break; |
|
401 |
+ case FF_CMP_VSSE: |
|
402 |
+ cmp[i] = c->vsse[i]; |
|
403 |
+ break; |
|
404 |
+ case FF_CMP_ZERO: |
|
405 |
+ cmp[i] = zero_cmp; |
|
406 |
+ break; |
|
407 |
+ case FF_CMP_NSSE: |
|
408 |
+ cmp[i] = c->nsse[i]; |
|
409 |
+ break; |
|
410 |
+#if CONFIG_DWT |
|
411 |
+ case FF_CMP_W53: |
|
412 |
+ cmp[i]= c->w53[i]; |
|
413 |
+ break; |
|
414 |
+ case FF_CMP_W97: |
|
415 |
+ cmp[i]= c->w97[i]; |
|
416 |
+ break; |
|
417 |
+#endif |
|
418 |
+ default: |
|
419 |
+ av_log(NULL, AV_LOG_ERROR, |
|
420 |
+ "internal error in cmp function selection\n"); |
|
421 |
+ } |
|
422 |
+ } |
|
423 |
+} |
|
424 |
+ |
|
425 |
/* o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do/while (0) so that the macro is a single statement and stays
 * correct under an unbraced if/for. Invoke it with a trailing semicolon. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * Both arguments must be int lvalues without side effects; they must not be
 * named a or b, which are the macro's own temporaries. */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int a = (x);                      \
        const int b = (y);                      \
        (x) = a + b;                            \
        (y) = a - b;                            \
    } while (0)

/* Final butterfly stage folded into the absolute sum: |x + y| + |x - y|. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
|
439 |
+ |
|
440 |
+static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, |
|
441 |
+ uint8_t *src, int stride, int h) |
|
442 |
+{ |
|
443 |
+ int i, temp[64], sum = 0; |
|
444 |
+ |
|
445 |
+ av_assert2(h == 8); |
|
446 |
+ |
|
447 |
+ for (i = 0; i < 8; i++) { |
|
448 |
+ // FIXME: try pointer walks |
|
449 |
+ BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
450 |
+ src[stride * i + 0] - dst[stride * i + 0], |
|
451 |
+ src[stride * i + 1] - dst[stride * i + 1]); |
|
452 |
+ BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
453 |
+ src[stride * i + 2] - dst[stride * i + 2], |
|
454 |
+ src[stride * i + 3] - dst[stride * i + 3]); |
|
455 |
+ BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
456 |
+ src[stride * i + 4] - dst[stride * i + 4], |
|
457 |
+ src[stride * i + 5] - dst[stride * i + 5]); |
|
458 |
+ BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
459 |
+ src[stride * i + 6] - dst[stride * i + 6], |
|
460 |
+ src[stride * i + 7] - dst[stride * i + 7]); |
|
461 |
+ |
|
462 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
463 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
464 |
+ BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
465 |
+ BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
466 |
+ |
|
467 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
468 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
469 |
+ BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
470 |
+ BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
471 |
+ } |
|
472 |
+ |
|
473 |
+ for (i = 0; i < 8; i++) { |
|
474 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
475 |
+ BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
476 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
477 |
+ BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
478 |
+ |
|
479 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
480 |
+ BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
481 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
482 |
+ BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
483 |
+ |
|
484 |
+ sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + |
|
485 |
+ BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + |
|
486 |
+ BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + |
|
487 |
+ BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
488 |
+ } |
|
489 |
+ return sum; |
|
490 |
+} |
|
491 |
+ |
|
492 |
+static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, |
|
493 |
+ uint8_t *dummy, int stride, int h) |
|
494 |
+{ |
|
495 |
+ int i, temp[64], sum = 0; |
|
496 |
+ |
|
497 |
+ av_assert2(h == 8); |
|
498 |
+ |
|
499 |
+ for (i = 0; i < 8; i++) { |
|
500 |
+ // FIXME: try pointer walks |
|
501 |
+ BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], |
|
502 |
+ src[stride * i + 0], src[stride * i + 1]); |
|
503 |
+ BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], |
|
504 |
+ src[stride * i + 2], src[stride * i + 3]); |
|
505 |
+ BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], |
|
506 |
+ src[stride * i + 4], src[stride * i + 5]); |
|
507 |
+ BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], |
|
508 |
+ src[stride * i + 6], src[stride * i + 7]); |
|
509 |
+ |
|
510 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); |
|
511 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); |
|
512 |
+ BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); |
|
513 |
+ BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); |
|
514 |
+ |
|
515 |
+ BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); |
|
516 |
+ BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); |
|
517 |
+ BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); |
|
518 |
+ BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); |
|
519 |
+ } |
|
520 |
+ |
|
521 |
+ for (i = 0; i < 8; i++) { |
|
522 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); |
|
523 |
+ BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); |
|
524 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); |
|
525 |
+ BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); |
|
526 |
+ |
|
527 |
+ BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); |
|
528 |
+ BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); |
|
529 |
+ BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); |
|
530 |
+ BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); |
|
531 |
+ |
|
532 |
+ sum += |
|
533 |
+ BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) |
|
534 |
+ + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) |
|
535 |
+ + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) |
|
536 |
+ + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); |
|
537 |
+ } |
|
538 |
+ |
|
539 |
+ sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean |
|
540 |
+ |
|
541 |
+ return sum; |
|
542 |
+} |
|
543 |
+ |
|
544 |
+static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, |
|
545 |
+ uint8_t *src2, int stride, int h) |
|
546 |
+{ |
|
547 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
548 |
+ |
|
549 |
+ av_assert2(h == 8); |
|
550 |
+ |
|
551 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
552 |
+ s->fdsp.fdct(temp); |
|
553 |
+ return s->mecc.sum_abs_dctelem(temp); |
|
554 |
+} |
|
555 |
+ |
|
556 |
#if CONFIG_GPL
/* One 8-point 1-D integer transform pass (presumably the H.264 8x8 transform,
 * going by the function name). The user supplies SRC(x) to read input x and
 * DST(x, v) to consume output x; the macro expands to a single braced block. */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Sum of absolute coefficients of the DCT8_1D-based 2-D transform of the
 * difference of two 8x8 blocks.
 *
 * @param s      encoder context providing diff_pixels
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance between consecutive rows of both blocks
 * @param h      block height; must be 8
 * @return sum of the absolute values of all 64 transform outputs
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    /* Every other user of diff_pixels() in this file hands it a 16-byte
     * aligned buffer; a plain local array gives no such guarantee, so use
     * the same aligned declaration here. dct is an int16_t (*)[8]. */
    LOCAL_ALIGNED_16(int16_t, dct, [8], [8]);
    int i, sum = 0;

    av_assert2(h == 8);

    s->pdsp.diff_pixels(dct[0], src1, src2, stride);

    /* Row pass: transform each row in place. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* Column pass: outputs are not stored, only their magnitudes summed. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
|
609 |
+ |
|
610 |
+static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, |
|
611 |
+ uint8_t *src2, int stride, int h) |
|
612 |
+{ |
|
613 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
614 |
+ int sum = 0, i; |
|
615 |
+ |
|
616 |
+ av_assert2(h == 8); |
|
617 |
+ |
|
618 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
619 |
+ s->fdsp.fdct(temp); |
|
620 |
+ |
|
621 |
+ for (i = 0; i < 64; i++) |
|
622 |
+ sum = FFMAX(sum, FFABS(temp[i])); |
|
623 |
+ |
|
624 |
+ return sum; |
|
625 |
+} |
|
626 |
+ |
|
627 |
+static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, |
|
628 |
+ uint8_t *src2, int stride, int h) |
|
629 |
+{ |
|
630 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); |
|
631 |
+ int16_t *const bak = temp + 64; |
|
632 |
+ int sum = 0, i; |
|
633 |
+ |
|
634 |
+ av_assert2(h == 8); |
|
635 |
+ s->mb_intra = 0; |
|
636 |
+ |
|
637 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
638 |
+ |
|
639 |
+ memcpy(bak, temp, 64 * sizeof(int16_t)); |
|
640 |
+ |
|
641 |
+ s->block_last_index[0 /* FIXME */] = |
|
642 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
643 |
+ s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
644 |
+ ff_simple_idct_8(temp); // FIXME |
|
645 |
+ |
|
646 |
+ for (i = 0; i < 64; i++) |
|
647 |
+ sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); |
|
648 |
+ |
|
649 |
+ return sum; |
|
650 |
+} |
|
651 |
+ |
|
652 |
+static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
653 |
+ int stride, int h) |
|
654 |
+{ |
|
655 |
+ const uint8_t *scantable = s->intra_scantable.permutated; |
|
656 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
657 |
+ LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); |
|
658 |
+ LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); |
|
659 |
+ int i, last, run, bits, level, distortion, start_i; |
|
660 |
+ const int esc_length = s->ac_esc_length; |
|
661 |
+ uint8_t *length, *last_length; |
|
662 |
+ |
|
663 |
+ av_assert2(h == 8); |
|
664 |
+ |
|
665 |
+ copy_block8(lsrc1, src1, 8, stride, 8); |
|
666 |
+ copy_block8(lsrc2, src2, 8, stride, 8); |
|
667 |
+ |
|
668 |
+ s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); |
|
669 |
+ |
|
670 |
+ s->block_last_index[0 /* FIXME */] = |
|
671 |
+ last = |
|
672 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
673 |
+ |
|
674 |
+ bits = 0; |
|
675 |
+ |
|
676 |
+ if (s->mb_intra) { |
|
677 |
+ start_i = 1; |
|
678 |
+ length = s->intra_ac_vlc_length; |
|
679 |
+ last_length = s->intra_ac_vlc_last_length; |
|
680 |
+ bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
681 |
+ } else { |
|
682 |
+ start_i = 0; |
|
683 |
+ length = s->inter_ac_vlc_length; |
|
684 |
+ last_length = s->inter_ac_vlc_last_length; |
|
685 |
+ } |
|
686 |
+ |
|
687 |
+ if (last >= start_i) { |
|
688 |
+ run = 0; |
|
689 |
+ for (i = start_i; i < last; i++) { |
|
690 |
+ int j = scantable[i]; |
|
691 |
+ level = temp[j]; |
|
692 |
+ |
|
693 |
+ if (level) { |
|
694 |
+ level += 64; |
|
695 |
+ if ((level & (~127)) == 0) |
|
696 |
+ bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
697 |
+ else |
|
698 |
+ bits += esc_length; |
|
699 |
+ run = 0; |
|
700 |
+ } else |
|
701 |
+ run++; |
|
702 |
+ } |
|
703 |
+ i = scantable[last]; |
|
704 |
+ |
|
705 |
+ level = temp[i] + 64; |
|
706 |
+ |
|
707 |
+ av_assert2(level - 64); |
|
708 |
+ |
|
709 |
+ if ((level & (~127)) == 0) { |
|
710 |
+ bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
711 |
+ } else |
|
712 |
+ bits += esc_length; |
|
713 |
+ } |
|
714 |
+ |
|
715 |
+ if (last >= 0) { |
|
716 |
+ if (s->mb_intra) |
|
717 |
+ s->dct_unquantize_intra(s, temp, 0, s->qscale); |
|
718 |
+ else |
|
719 |
+ s->dct_unquantize_inter(s, temp, 0, s->qscale); |
|
720 |
+ } |
|
721 |
+ |
|
722 |
+ s->idsp.idct_add(lsrc2, 8, temp); |
|
723 |
+ |
|
724 |
+ distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8); |
|
725 |
+ |
|
726 |
+ return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); |
|
727 |
+} |
|
728 |
+ |
|
729 |
+static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, |
|
730 |
+ int stride, int h) |
|
731 |
+{ |
|
732 |
+ const uint8_t *scantable = s->intra_scantable.permutated; |
|
733 |
+ LOCAL_ALIGNED_16(int16_t, temp, [64]); |
|
734 |
+ int i, last, run, bits, level, start_i; |
|
735 |
+ const int esc_length = s->ac_esc_length; |
|
736 |
+ uint8_t *length, *last_length; |
|
737 |
+ |
|
738 |
+ av_assert2(h == 8); |
|
739 |
+ |
|
740 |
+ s->pdsp.diff_pixels(temp, src1, src2, stride); |
|
741 |
+ |
|
742 |
+ s->block_last_index[0 /* FIXME */] = |
|
743 |
+ last = |
|
744 |
+ s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); |
|
745 |
+ |
|
746 |
+ bits = 0; |
|
747 |
+ |
|
748 |
+ if (s->mb_intra) { |
|
749 |
+ start_i = 1; |
|
750 |
+ length = s->intra_ac_vlc_length; |
|
751 |
+ last_length = s->intra_ac_vlc_last_length; |
|
752 |
+ bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma |
|
753 |
+ } else { |
|
754 |
+ start_i = 0; |
|
755 |
+ length = s->inter_ac_vlc_length; |
|
756 |
+ last_length = s->inter_ac_vlc_last_length; |
|
757 |
+ } |
|
758 |
+ |
|
759 |
+ if (last >= start_i) { |
|
760 |
+ run = 0; |
|
761 |
+ for (i = start_i; i < last; i++) { |
|
762 |
+ int j = scantable[i]; |
|
763 |
+ level = temp[j]; |
|
764 |
+ |
|
765 |
+ if (level) { |
|
766 |
+ level += 64; |
|
767 |
+ if ((level & (~127)) == 0) |
|
768 |
+ bits += length[UNI_AC_ENC_INDEX(run, level)]; |
|
769 |
+ else |
|
770 |
+ bits += esc_length; |
|
771 |
+ run = 0; |
|
772 |
+ } else |
|
773 |
+ run++; |
|
774 |
+ } |
|
775 |
+ i = scantable[last]; |
|
776 |
+ |
|
777 |
+ level = temp[i] + 64; |
|
778 |
+ |
|
779 |
+ av_assert2(level - 64); |
|
780 |
+ |
|
781 |
+ if ((level & (~127)) == 0) |
|
782 |
+ bits += last_length[UNI_AC_ENC_INDEX(run, level)]; |
|
783 |
+ else |
|
784 |
+ bits += esc_length; |
|
785 |
+ } |
|
786 |
+ |
|
787 |
+ return bits; |
|
788 |
+} |
|
789 |
+ |
|
790 |
+#define VSAD_INTRA(size) \ |
|
791 |
+static int vsad_intra ## size ## _c(MpegEncContext *c, \ |
|
792 |
+ uint8_t *s, uint8_t *dummy, \ |
|
793 |
+ int stride, int h) \ |
|
794 |
+{ \ |
|
795 |
+ int score = 0, x, y; \ |
|
796 |
+ \ |
|
797 |
+ for (y = 1; y < h; y++) { \ |
|
798 |
+ for (x = 0; x < size; x += 4) { \ |
|
799 |
+ score += FFABS(s[x] - s[x + stride]) + \ |
|
800 |
+ FFABS(s[x + 1] - s[x + stride + 1]) + \ |
|
801 |
+ FFABS(s[x + 2] - s[x + 2 + stride]) + \ |
|
802 |
+ FFABS(s[x + 3] - s[x + 3 + stride]); \ |
|
803 |
+ } \ |
|
804 |
+ s += stride; \ |
|
805 |
+ } \ |
|
806 |
+ \ |
|
807 |
+ return score; \ |
|
808 |
+} |
|
809 |
+VSAD_INTRA(8) |
|
810 |
+VSAD_INTRA(16) |
|
811 |
+ |
|
812 |
+#define VSAD(size) \ |
|
813 |
+static int vsad ## size ## _c(MpegEncContext *c, \ |
|
814 |
+ uint8_t *s1, uint8_t *s2, \ |
|
815 |
+ int stride, int h) \ |
|
816 |
+{ \ |
|
817 |
+ int score = 0, x, y; \ |
|
818 |
+ \ |
|
819 |
+ for (y = 1; y < h; y++) { \ |
|
820 |
+ for (x = 0; x < size; x++) \ |
|
821 |
+ score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ |
|
822 |
+ s1 += stride; \ |
|
823 |
+ s2 += stride; \ |
|
824 |
+ } \ |
|
825 |
+ \ |
|
826 |
+ return score; \ |
|
827 |
+} |
|
828 |
+VSAD(8) |
|
829 |
+VSAD(16) |
|
830 |
+ |
|
831 |
+#define SQ(a) ((a) * (a)) |
|
832 |
+#define VSSE_INTRA(size) \ |
|
833 |
+static int vsse_intra ## size ## _c(MpegEncContext *c, \ |
|
834 |
+ uint8_t *s, uint8_t *dummy, \ |
|
835 |
+ int stride, int h) \ |
|
836 |
+{ \ |
|
837 |
+ int score = 0, x, y; \ |
|
838 |
+ \ |
|
839 |
+ for (y = 1; y < h; y++) { \ |
|
840 |
+ for (x = 0; x < size; x += 4) { \ |
|
841 |
+ score += SQ(s[x] - s[x + stride]) + \ |
|
842 |
+ SQ(s[x + 1] - s[x + stride + 1]) + \ |
|
843 |
+ SQ(s[x + 2] - s[x + stride + 2]) + \ |
|
844 |
+ SQ(s[x + 3] - s[x + stride + 3]); \ |
|
845 |
+ } \ |
|
846 |
+ s += stride; \ |
|
847 |
+ } \ |
|
848 |
+ \ |
|
849 |
+ return score; \ |
|
850 |
+} |
|
851 |
+VSSE_INTRA(8) |
|
852 |
+VSSE_INTRA(16) |
|
853 |
+ |
|
854 |
+#define VSSE(size) \ |
|
855 |
+static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, \ |
|
856 |
+ int stride, int h) \ |
|
857 |
+{ \ |
|
858 |
+ int score = 0, x, y; \ |
|
859 |
+ \ |
|
860 |
+ for (y = 1; y < h; y++) { \ |
|
861 |
+ for (x = 0; x < size; x++) \ |
|
862 |
+ score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ |
|
863 |
+ s1 += stride; \ |
|
864 |
+ s2 += stride; \ |
|
865 |
+ } \ |
|
866 |
+ \ |
|
867 |
+ return score; \ |
|
868 |
+} |
|
869 |
+VSSE(8) |
|
870 |
+VSSE(16) |
|
871 |
+ |
|
872 |
+#define WRAPPER8_16_SQ(name8, name16) \ |
|
873 |
+static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ |
|
874 |
+ int stride, int h) \ |
|
875 |
+{ \ |
|
876 |
+ int score = 0; \ |
|
877 |
+ \ |
|
878 |
+ score += name8(s, dst, src, stride, 8); \ |
|
879 |
+ score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
880 |
+ if (h == 16) { \ |
|
881 |
+ dst += 8 * stride; \ |
|
882 |
+ src += 8 * stride; \ |
|
883 |
+ score += name8(s, dst, src, stride, 8); \ |
|
884 |
+ score += name8(s, dst + 8, src + 8, stride, 8); \ |
|
885 |
+ } \ |
|
886 |
+ return score; \ |
|
887 |
+} |
|
888 |
+ |
|
889 |
+WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
|
890 |
+WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
|
891 |
+WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
|
892 |
+#if CONFIG_GPL |
|
893 |
+WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
|
894 |
+#endif |
|
895 |
+WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
|
896 |
+WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
|
897 |
+WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
|
898 |
+WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
|
899 |
+ |
|
900 |
+av_cold void ff_me_cmp_init_static(void) |
|
901 |
+{ |
|
902 |
+ int i; |
|
903 |
+ |
|
904 |
+ for (i = 0; i < 512; i++) |
|
905 |
+ ff_square_tab[i] = (i - 256) * (i - 256); |
|
906 |
+} |
|
907 |
+ |
|
908 |
+int ff_check_alignment(void) |
|
909 |
+{ |
|
910 |
+ static int did_fail = 0; |
|
911 |
+ LOCAL_ALIGNED_16(int, aligned, [4]); |
|
912 |
+ |
|
913 |
+ if ((intptr_t)aligned & 15) { |
|
914 |
+ if (!did_fail) { |
|
915 |
+#if HAVE_MMX || HAVE_ALTIVEC |
|
916 |
+ av_log(NULL, AV_LOG_ERROR, |
|
917 |
+ "Compiler did not align stack variables. Libavcodec has been miscompiled\n" |
|
918 |
+ "and may be very slow or crash. This is not a bug in libavcodec,\n" |
|
919 |
+ "but in the compiler. You may try recompiling using gcc >= 4.2.\n" |
|
920 |
+ "Do not report crashes to FFmpeg developers.\n"); |
|
921 |
+#endif |
|
922 |
+ did_fail=1; |
|
923 |
+ } |
|
924 |
+ return -1; |
|
925 |
+ } |
|
926 |
+ return 0; |
|
927 |
+} |
|
928 |
+ |
|
929 |
+av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx) |
|
930 |
+{ |
|
931 |
+ ff_check_alignment(); |
|
932 |
+ |
|
933 |
+ c->sum_abs_dctelem = sum_abs_dctelem_c; |
|
934 |
+ |
|
935 |
+ /* TODO [0] 16 [1] 8 */ |
|
936 |
+ c->pix_abs[0][0] = pix_abs16_c; |
|
937 |
+ c->pix_abs[0][1] = pix_abs16_x2_c; |
|
938 |
+ c->pix_abs[0][2] = pix_abs16_y2_c; |
|
939 |
+ c->pix_abs[0][3] = pix_abs16_xy2_c; |
|
940 |
+ c->pix_abs[1][0] = pix_abs8_c; |
|
941 |
+ c->pix_abs[1][1] = pix_abs8_x2_c; |
|
942 |
+ c->pix_abs[1][2] = pix_abs8_y2_c; |
|
943 |
+ c->pix_abs[1][3] = pix_abs8_xy2_c; |
|
944 |
+ |
|
945 |
+#define SET_CMP_FUNC(name) \ |
|
946 |
+ c->name[0] = name ## 16_c; \ |
|
947 |
+ c->name[1] = name ## 8x8_c; |
|
948 |
+ |
|
949 |
+ SET_CMP_FUNC(hadamard8_diff) |
|
950 |
+ c->hadamard8_diff[4] = hadamard8_intra16_c; |
|
951 |
+ c->hadamard8_diff[5] = hadamard8_intra8x8_c; |
|
952 |
+ SET_CMP_FUNC(dct_sad) |
|
953 |
+ SET_CMP_FUNC(dct_max) |
|
954 |
+#if CONFIG_GPL |
|
955 |
+ SET_CMP_FUNC(dct264_sad) |
|
956 |
+#endif |
|
957 |
+ c->sad[0] = pix_abs16_c; |
|
958 |
+ c->sad[1] = pix_abs8_c; |
|
959 |
+ c->sse[0] = sse16_c; |
|
960 |
+ c->sse[1] = sse8_c; |
|
961 |
+ c->sse[2] = sse4_c; |
|
962 |
+ SET_CMP_FUNC(quant_psnr) |
|
963 |
+ SET_CMP_FUNC(rd) |
|
964 |
+ SET_CMP_FUNC(bit) |
|
965 |
+ c->vsad[0] = vsad16_c; |
|
966 |
+ c->vsad[1] = vsad8_c; |
|
967 |
+ c->vsad[4] = vsad_intra16_c; |
|
968 |
+ c->vsad[5] = vsad_intra8_c; |
|
969 |
+ c->vsse[0] = vsse16_c; |
|
970 |
+ c->vsse[1] = vsse8_c; |
|
971 |
+ c->vsse[4] = vsse_intra16_c; |
|
972 |
+ c->vsse[5] = vsse_intra8_c; |
|
973 |
+ c->nsse[0] = nsse16_c; |
|
974 |
+ c->nsse[1] = nsse8_c; |
|
975 |
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER |
|
976 |
+ ff_dsputil_init_dwt(c); |
|
977 |
+#endif |
|
978 |
+ |
|
979 |
+ if (ARCH_ALPHA) |
|
980 |
+ ff_me_cmp_init_alpha(c, avctx); |
|
981 |
+ if (ARCH_ARM) |
|
982 |
+ ff_me_cmp_init_arm(c, avctx); |
|
983 |
+ if (ARCH_PPC) |
|
984 |
+ ff_me_cmp_init_ppc(c, avctx); |
|
985 |
+ if (ARCH_X86) |
|
986 |
+ ff_me_cmp_init_x86(c, avctx); |
|
987 |
+} |
0 | 988 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,94 @@ |
0 |
+/* |
|
1 |
+ * This file is part of FFmpeg. |
|
2 |
+ * |
|
3 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#ifndef AVCODEC_ME_CMP_H |
|
19 |
+#define AVCODEC_ME_CMP_H |
|
20 |
+ |
|
21 |
+#include <stdint.h> |
|
22 |
+ |
|
23 |
+#include "avcodec.h" |
|
24 |
+ |
|
25 |
+extern uint32_t ff_square_tab[512]; |
|
26 |
+ |
|
27 |
+ |
|
28 |
/* Minimum alignment rules.
 * If you notice errors in the alignment notes, need more alignment for some
 * ASM code for some CPU, or need to use a function with less aligned data,
 * then send a mail to the ffmpeg-devel mailing list.
 *
 * Warning: these alignments might not match reality (an
 * attribute((align)) may be missing somewhere).
 * I (Michael) did not check them; these are just the alignments which I think
 * could be reached easily.
 *
 * Note: future video codecs might need functions with less strict alignment.
 */

struct MpegEncContext;
/* Motion estimation comparison function.
 * h is limited to { width / 2, width, 2 * width },
 * but never larger than 16 and never smaller than 2.
 * Although currently h < 4 is not used as functions with
 * width < 8 are neither used nor implemented.
 * Returns the score of comparing blk1 with blk2; some implementations ignore
 * the context and/or blk2. */
typedef int (*me_cmp_func)(struct MpegEncContext *c,
                           uint8_t *blk1 /* align width (8 or 16) */,
                           uint8_t *blk2 /* align 1 */, int line_size, int h);

/* Function tables for motion estimation and mode decision.
 * In the six-entry tables filled by ff_me_cmp_init(), index 0 holds the
 * 16-pixel-wide function and index 1 the 8-pixel-wide one; for
 * hadamard8_diff, vsad and vsse, indices 4 and 5 hold the 16- and 8-wide
 * intra variants, and sse[2] holds the 4-wide function. Entries that
 * ff_me_cmp_init() does not assign are left as the caller initialized them. */
typedef struct MECmpContext {
    /* sum of absolute values of the coefficients of a DCT block */
    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);

    /* One table per metric; ff_set_cmp() copies one of them according to the
     * FF_CMP_* selector. */
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func w53[6];
    me_cmp_func w97[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];

    /* Destination tables for ff_set_cmp(): the functions selected for each
     * stage by the corresponding user option. */
    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; // only width 16 used
    me_cmp_func frame_skip_cmp[6]; // only width 8 used

    /* SAD variants: first index 0 = 16 wide, 1 = 8 wide; second index
     * 0 = plain, 1 = x2, 2 = y2, 3 = xy2 */
    me_cmp_func pix_abs[2][4];
} MECmpContext;
|
78 |
+ |
|
79 |
/* Fill ff_square_tab: entry d + 256 is d * d for d in -256..255. */
void ff_me_cmp_init_static(void);

/* Check that 16-byte-aligned stack variables are really aligned.
 * Returns 0 if they are, -1 otherwise; the first failure may log an error. */
int ff_check_alignment(void);

/* Install the C implementations into c, then call the initializer of the
 * architecture the build targets. */
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
/* Per-architecture initializers, called from ff_me_cmp_init(). */
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);

/* Clear the six entries of cmp and fill them from the table of c selected by
 * the low byte of type (an FF_CMP_* value); an unknown value logs an error. */
void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);

/* Called from ff_me_cmp_init() when the Snow encoder or decoder is enabled;
 * presumably fills the w53/w97 tables -- defined elsewhere, TODO confirm. */
void ff_dsputil_init_dwt(MECmpContext *c);
|
92 |
+ |
|
93 |
+#endif /* AVCODEC_ME_CMP_H */ |
... | ... |
@@ -28,7 +28,7 @@ |
28 | 28 |
#include <string.h> |
29 | 29 |
|
30 | 30 |
#include "config.h" |
31 |
-#include "dsputil.h" |
|
31 |
+#include "me_cmp.h" |
|
32 | 32 |
#include "libavutil/internal.h" |
33 | 33 |
#include "libavutil/lfg.h" |
34 | 34 |
#include "libavutil/mem.h" |
... | ... |
@@ -115,7 +115,7 @@ int main(int argc, char **argv) |
115 | 115 |
{ |
116 | 116 |
AVCodecContext *ctx; |
117 | 117 |
int c; |
118 |
- DSPContext cctx, mmxctx; |
|
118 |
+ MECmpContext cctx, mmxctx; |
|
119 | 119 |
int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT }; |
120 | 120 |
int flags_size = HAVE_MMXEXT ? 2 : 1; |
121 | 121 |
|
... | ... |
@@ -130,12 +130,12 @@ int main(int argc, char **argv) |
130 | 130 |
ctx->flags |= CODEC_FLAG_BITEXACT; |
131 | 131 |
av_force_cpu_flags(0); |
132 | 132 |
memset(&cctx, 0, sizeof(cctx)); |
133 |
- ff_dsputil_init(&cctx, ctx); |
|
133 |
+ ff_me_cmp_init(&cctx, ctx); |
|
134 | 134 |
for (c = 0; c < flags_size; c++) { |
135 | 135 |
int x; |
136 | 136 |
av_force_cpu_flags(flags[c]); |
137 | 137 |
memset(&mmxctx, 0, sizeof(mmxctx)); |
138 |
- ff_dsputil_init(&mmxctx, ctx); |
|
138 |
+ ff_me_cmp_init(&mmxctx, ctx); |
|
139 | 139 |
|
140 | 140 |
for (x = 0; x < 2; x++) { |
141 | 141 |
printf("%s for %dx%d pixels\n", c ? "mmx2" : "mmx", |
... | ... |
@@ -316,10 +316,10 @@ int ff_init_me(MpegEncContext *s){ |
316 | 316 |
av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n"); |
317 | 317 |
} |
318 | 318 |
|
319 |
- ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, c->avctx->me_pre_cmp); |
|
320 |
- ff_set_cmp(&s->dsp, s->dsp.me_cmp, c->avctx->me_cmp); |
|
321 |
- ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, c->avctx->me_sub_cmp); |
|
322 |
- ff_set_cmp(&s->dsp, s->dsp.mb_cmp, c->avctx->mb_cmp); |
|
319 |
+ ff_set_cmp(&s->mecc, s->mecc.me_pre_cmp, c->avctx->me_pre_cmp); |
|
320 |
+ ff_set_cmp(&s->mecc, s->mecc.me_cmp, c->avctx->me_cmp); |
|
321 |
+ ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, c->avctx->me_sub_cmp); |
|
322 |
+ ff_set_cmp(&s->mecc, s->mecc.mb_cmp, c->avctx->mb_cmp); |
|
323 | 323 |
|
324 | 324 |
c->flags = get_flags(c, 0, c->avctx->me_cmp &FF_CMP_CHROMA); |
325 | 325 |
c->sub_flags= get_flags(c, 0, c->avctx->me_sub_cmp&FF_CMP_CHROMA); |
... | ... |
@@ -360,13 +360,11 @@ int ff_init_me(MpegEncContext *s){ |
360 | 360 |
/* 8x8 fullpel search would need a 4x4 chroma compare, which we do |
361 | 361 |
* not have yet, and even if we had, the motion estimation code |
362 | 362 |
* does not expect it. */ |
363 |
- if(s->codec_id != AV_CODEC_ID_SNOW){ |
|
364 |
- if((c->avctx->me_cmp&FF_CMP_CHROMA)/* && !s->dsp.me_cmp[2]*/){ |
|
365 |
- s->dsp.me_cmp[2]= zero_cmp; |
|
366 |
- } |
|
367 |
- if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){ |
|
368 |
- s->dsp.me_sub_cmp[2]= zero_cmp; |
|
369 |
- } |
|
363 |
+ if (s->codec_id != AV_CODEC_ID_SNOW) { |
|
364 |
+ if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */) |
|
365 |
+ s->mecc.me_cmp[2] = zero_cmp; |
|
366 |
+ if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2]) |
|
367 |
+ s->mecc.me_sub_cmp[2] = zero_cmp; |
|
370 | 368 |
c->hpel_put[2][0]= c->hpel_put[2][1]= |
371 | 369 |
c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel; |
372 | 370 |
} |
... | ... |
@@ -380,7 +378,7 @@ int ff_init_me(MpegEncContext *s){ |
380 | 380 |
|
381 | 381 |
#define CHECK_SAD_HALF_MV(suffix, x, y) \ |
382 | 382 |
{\ |
383 |
- d= s->dsp.pix_abs[size][(x?1:0)+(y?2:0)](NULL, pix, ptr+((x)>>1), stride, h);\ |
|
383 |
+ d = s->mecc.pix_abs[size][(x ? 1 : 0) + (y ? 2 : 0)](NULL, pix, ptr + ((x) >> 1), stride, h); \ |
|
384 | 384 |
d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\ |
385 | 385 |
COPY3_IF_LT(dminh, d, dx, x, dy, y)\ |
386 | 386 |
} |
... | ... |
@@ -633,7 +631,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
633 | 633 |
|
634 | 634 |
dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h); |
635 | 635 |
|
636 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
636 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
637 | 637 |
int dxy; |
638 | 638 |
const int offset= ((block&1) + (block>>1)*stride)*8; |
639 | 639 |
uint8_t *dest_y = c->scratchpad + offset; |
... | ... |
@@ -675,8 +673,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
675 | 675 |
if(same) |
676 | 676 |
return INT_MAX; |
677 | 677 |
|
678 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
679 |
- dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*16*stride, c->scratchpad, stride, 16); |
|
678 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
679 |
+ dmin_sum += s->mecc.mb_cmp[0](s, |
|
680 |
+ s->new_picture.f->data[0] + |
|
681 |
+ s->mb_x * 16 + s->mb_y * 16 * stride, |
|
682 |
+ c->scratchpad, stride, 16); |
|
680 | 683 |
} |
681 | 684 |
|
682 | 685 |
if(c->avctx->mb_cmp&FF_CMP_CHROMA){ |
... | ... |
@@ -698,8 +699,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) |
698 | 698 |
s->hdsp.put_pixels_tab [1][dxy](c->scratchpad + 8, s->last_picture.f->data[2] + offset, s->uvlinesize, 8); |
699 | 699 |
} |
700 | 700 |
|
701 |
- dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad , s->uvlinesize, 8); |
|
702 |
- dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad+8, s->uvlinesize, 8); |
|
701 |
+ dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad, s->uvlinesize, 8); |
|
702 |
+ dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad + 8, s->uvlinesize, 8); |
|
703 | 703 |
} |
704 | 704 |
|
705 | 705 |
c->pred_x= mx; |
... | ... |
@@ -795,7 +796,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, |
795 | 795 |
mv_table[xy][0]= mx_i; |
796 | 796 |
mv_table[xy][1]= my_i; |
797 | 797 |
|
798 |
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ |
|
798 |
+ if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { |
|
799 | 799 |
int dxy; |
800 | 800 |
|
801 | 801 |
//FIXME chroma ME |
... | ... |
@@ -807,7 +808,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, |
807 | 807 |
}else{ |
808 | 808 |
s->hdsp.put_pixels_tab [size][dxy](c->scratchpad, ref , stride, h); |
809 | 809 |
} |
810 |
- dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); |
|
810 |
+ dmin = s->mecc.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); |
|
811 | 811 |
dmin+= (mv_penalty[mx_i-c->pred_x] + mv_penalty[my_i-c->pred_y] + 1)*c->mb_penalty_factor; |
812 | 812 |
}else |
813 | 813 |
dmin+= c->mb_penalty_factor; //field_select bits |
... | ... |
@@ -962,7 +963,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, |
962 | 962 |
/* At this point (mx,my) are full-pell and the relative displacement */ |
963 | 963 |
ppix = c->ref[0][0] + (my * s->linesize) + mx; |
964 | 964 |
|
965 |
- vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16); |
|
965 |
+ vard = s->mecc.sse[0](NULL, pix, ppix, s->linesize, 16); |
|
966 | 966 |
|
967 | 967 |
pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8; |
968 | 968 |
c->mc_mb_var_sum_temp += (vard+128)>>8; |
... | ... |
@@ -1059,7 +1060,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, |
1059 | 1059 |
*(uint32_t*)(&c->scratchpad[i*s->linesize+12]) = mean; |
1060 | 1060 |
} |
1061 | 1061 |
|
1062 |
- intra_score= s->dsp.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); |
|
1062 |
+ intra_score= s->mecc.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); |
|
1063 | 1063 |
} |
1064 | 1064 |
intra_score += c->mb_penalty_factor*16; |
1065 | 1065 |
|
... | ... |
@@ -1259,7 +1260,7 @@ static inline int check_bidir_mv(MpegEncContext * s, |
1259 | 1259 |
|
1260 | 1260 |
fbmin = (mv_penalty_f[motion_fx-pred_fx] + mv_penalty_f[motion_fy-pred_fy])*c->mb_penalty_factor |
1261 | 1261 |
+(mv_penalty_b[motion_bx-pred_bx] + mv_penalty_b[motion_by-pred_by])*c->mb_penalty_factor |
1262 |
- + s->dsp.mb_cmp[size](s, src_data[0], dest_y, stride, h); //FIXME new_pic |
|
1262 |
+ + s->mecc.mb_cmp[size](s, src_data[0], dest_y, stride, h); // FIXME new_pic |
|
1263 | 1263 |
|
1264 | 1264 |
if(c->avctx->mb_cmp&FF_CMP_CHROMA){ |
1265 | 1265 |
} |
... | ... |
@@ -63,8 +63,8 @@ static int hpel_motion_search(MpegEncContext * s, |
63 | 63 |
|
64 | 64 |
//FIXME factorize |
65 | 65 |
|
66 |
- cmp_sub= s->dsp.me_sub_cmp[size]; |
|
67 |
- chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; |
|
66 |
+ cmp_sub = s->mecc.me_sub_cmp[size]; |
|
67 |
+ chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; |
|
68 | 68 |
|
69 | 69 |
if(c->skip){ //FIXME move out of hpel? |
70 | 70 |
*mx_ptr = 0; |
... | ... |
@@ -165,7 +165,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, |
165 | 165 |
int src_index, int ref_index, int size, |
166 | 166 |
int h, int add_rate) |
167 | 167 |
{ |
168 |
-// const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp; |
|
169 | 168 |
MotionEstContext * const c= &s->me; |
170 | 169 |
const int penalty_factor= c->mb_penalty_factor; |
171 | 170 |
const int flags= c->mb_flags; |
... | ... |
@@ -178,8 +177,8 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, |
178 | 178 |
|
179 | 179 |
//FIXME factorize |
180 | 180 |
|
181 |
- cmp_sub= s->dsp.mb_cmp[size]; |
|
182 |
- chroma_cmp_sub= s->dsp.mb_cmp[size+1]; |
|
181 |
+ cmp_sub = s->mecc.mb_cmp[size]; |
|
182 |
+ chroma_cmp_sub = s->mecc.mb_cmp[size + 1]; |
|
183 | 183 |
|
184 | 184 |
d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags); |
185 | 185 |
//FIXME check cbp before adding penalty for (0,0) vector |
... | ... |
@@ -222,12 +221,12 @@ static int qpel_motion_search(MpegEncContext * s, |
222 | 222 |
LOAD_COMMON |
223 | 223 |
int flags= c->sub_flags; |
224 | 224 |
|
225 |
- cmpf= s->dsp.me_cmp[size]; |
|
226 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME |
|
225 |
+ cmpf = s->mecc.me_cmp[size]; |
|
226 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; // FIXME: factorize |
|
227 | 227 |
//FIXME factorize |
228 | 228 |
|
229 |
- cmp_sub= s->dsp.me_sub_cmp[size]; |
|
230 |
- chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; |
|
229 |
+ cmp_sub = s->mecc.me_sub_cmp[size]; |
|
230 |
+ chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; |
|
231 | 231 |
|
232 | 232 |
if(c->skip){ //FIXME somehow move up (benchmark) |
233 | 233 |
*mx_ptr = 0; |
... | ... |
@@ -423,8 +422,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best, |
423 | 423 |
LOAD_COMMON2 |
424 | 424 |
unsigned map_generation = c->map_generation; |
425 | 425 |
|
426 |
- cmpf= s->dsp.me_cmp[size]; |
|
427 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
426 |
+ cmpf = s->mecc.me_cmp[size]; |
|
427 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
428 | 428 |
|
429 | 429 |
{ /* ensure that the best point is in the MAP as h/qpel refinement needs it */ |
430 | 430 |
const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation; |
... | ... |
@@ -464,8 +463,8 @@ static int funny_diamond_search(MpegEncContext * s, int *best, int dmin, |
464 | 464 |
LOAD_COMMON2 |
465 | 465 |
unsigned map_generation = c->map_generation; |
466 | 466 |
|
467 |
- cmpf= s->dsp.me_cmp[size]; |
|
468 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
467 |
+ cmpf = s->mecc.me_cmp[size]; |
|
468 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
469 | 469 |
|
470 | 470 |
for(dia_size=1; dia_size<=4; dia_size++){ |
471 | 471 |
int dir; |
... | ... |
@@ -507,8 +506,8 @@ static int hex_search(MpegEncContext * s, int *best, int dmin, |
507 | 507 |
int x,y,d; |
508 | 508 |
const int dec= dia_size & (dia_size-1); |
509 | 509 |
|
510 |
- cmpf= s->dsp.me_cmp[size]; |
|
511 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
510 |
+ cmpf = s->mecc.me_cmp[size]; |
|
511 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
512 | 512 |
|
513 | 513 |
for(;dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){ |
514 | 514 |
do{ |
... | ... |
@@ -544,8 +543,8 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin, |
544 | 544 |
static const int hex[8][2]={{-2, 0}, {-1,-1}, { 0,-2}, { 1,-1}, |
545 | 545 |
{ 2, 0}, { 1, 1}, { 0, 2}, {-1, 1}}; |
546 | 546 |
|
547 |
- cmpf= s->dsp.me_cmp[size]; |
|
548 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
547 |
+ cmpf = s->mecc.me_cmp[size]; |
|
548 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
549 | 549 |
|
550 | 550 |
for(; dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){ |
551 | 551 |
do{ |
... | ... |
@@ -583,8 +582,8 @@ static int umh_search(MpegEncContext * s, int *best, int dmin, |
583 | 583 |
{-2, 3}, { 0, 4}, { 2, 3}, |
584 | 584 |
{-2,-3}, { 0,-4}, { 2,-3},}; |
585 | 585 |
|
586 |
- cmpf= s->dsp.me_cmp[size]; |
|
587 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
586 |
+ cmpf = s->mecc.me_cmp[size]; |
|
587 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
588 | 588 |
|
589 | 589 |
x= best[0]; |
590 | 590 |
y= best[1]; |
... | ... |
@@ -626,8 +625,8 @@ static int full_search(MpegEncContext * s, int *best, int dmin, |
626 | 626 |
int x,y, d; |
627 | 627 |
const int dia_size= c->dia_size&0xFF; |
628 | 628 |
|
629 |
- cmpf= s->dsp.me_cmp[size]; |
|
630 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
629 |
+ cmpf = s->mecc.me_cmp[size]; |
|
630 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
631 | 631 |
|
632 | 632 |
for(y=FFMAX(-dia_size, ymin); y<=FFMIN(dia_size,ymax); y++){ |
633 | 633 |
for(x=FFMAX(-dia_size, xmin); x<=FFMIN(dia_size,xmax); x++){ |
... | ... |
@@ -692,8 +691,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin, |
692 | 692 |
|
693 | 693 |
av_assert1(minima_count <= MAX_SAB_SIZE); |
694 | 694 |
|
695 |
- cmpf= s->dsp.me_cmp[size]; |
|
696 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
695 |
+ cmpf = s->mecc.me_cmp[size]; |
|
696 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
697 | 697 |
|
698 | 698 |
/*Note j<MAX_SAB_SIZE is needed if MAX_SAB_SIZE < ME_MAP_SIZE as j can |
699 | 699 |
become larger due to MVs overflowing their ME_MAP_MV_BITS bits space in map |
... | ... |
@@ -777,8 +776,8 @@ static int var_diamond_search(MpegEncContext * s, int *best, int dmin, |
777 | 777 |
LOAD_COMMON2 |
778 | 778 |
unsigned map_generation = c->map_generation; |
779 | 779 |
|
780 |
- cmpf= s->dsp.me_cmp[size]; |
|
781 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
780 |
+ cmpf = s->mecc.me_cmp[size]; |
|
781 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
782 | 782 |
|
783 | 783 |
for(dia_size=1; dia_size<=c->dia_size; dia_size++){ |
784 | 784 |
int dir, start, end; |
... | ... |
@@ -878,12 +877,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int |
878 | 878 |
|
879 | 879 |
if(c->pre_pass){ |
880 | 880 |
penalty_factor= c->pre_penalty_factor; |
881 |
- cmpf= s->dsp.me_pre_cmp[size]; |
|
882 |
- chroma_cmpf= s->dsp.me_pre_cmp[size+1]; |
|
881 |
+ cmpf = s->mecc.me_pre_cmp[size]; |
|
882 |
+ chroma_cmpf = s->mecc.me_pre_cmp[size + 1]; |
|
883 | 883 |
}else{ |
884 | 884 |
penalty_factor= c->penalty_factor; |
885 |
- cmpf= s->dsp.me_cmp[size]; |
|
886 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
885 |
+ cmpf = s->mecc.me_cmp[size]; |
|
886 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
887 | 887 |
} |
888 | 888 |
|
889 | 889 |
map_generation= update_map_generation(c); |
... | ... |
@@ -1007,8 +1006,8 @@ static int epzs_motion_search4(MpegEncContext * s, |
1007 | 1007 |
int flags= c->flags; |
1008 | 1008 |
LOAD_COMMON2 |
1009 | 1009 |
|
1010 |
- cmpf= s->dsp.me_cmp[size]; |
|
1011 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
1010 |
+ cmpf = s->mecc.me_cmp[size]; |
|
1011 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
1012 | 1012 |
|
1013 | 1013 |
map_generation= update_map_generation(c); |
1014 | 1014 |
|
... | ... |
@@ -1066,8 +1065,8 @@ static int epzs_motion_search2(MpegEncContext * s, |
1066 | 1066 |
int flags= c->flags; |
1067 | 1067 |
LOAD_COMMON2 |
1068 | 1068 |
|
1069 |
- cmpf= s->dsp.me_cmp[size]; |
|
1070 |
- chroma_cmpf= s->dsp.me_cmp[size+1]; |
|
1069 |
+ cmpf = s->mecc.me_cmp[size]; |
|
1070 |
+ chroma_cmpf = s->mecc.me_cmp[size + 1]; |
|
1071 | 1071 |
|
1072 | 1072 |
map_generation= update_map_generation(c); |
1073 | 1073 |
|
... | ... |
@@ -698,7 +698,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64], |
698 | 698 |
} |
699 | 699 |
diff = diff * 256 / (xe * ye); |
700 | 700 |
} else { |
701 |
- diff = s->dsp.sad[0](NULL, p_pic, b_pic, s->linesize, 16); |
|
701 |
+ diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16); |
|
702 | 702 |
} |
703 | 703 |
if (diff > s->qscale * 70) { // FIXME check that 70 is optimal |
704 | 704 |
s->mb_skipped = 0; |
... | ... |
@@ -380,10 +380,10 @@ static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h) |
380 | 380 |
av_cold int ff_dct_common_init(MpegEncContext *s) |
381 | 381 |
{ |
382 | 382 |
ff_blockdsp_init(&s->bdsp, s->avctx); |
383 |
- ff_dsputil_init(&s->dsp, s->avctx); |
|
384 | 383 |
ff_h264chroma_init(&s->h264chroma, 8); //for lowres |
385 | 384 |
ff_hpeldsp_init(&s->hdsp, s->avctx->flags); |
386 | 385 |
ff_idctdsp_init(&s->idsp, s->avctx); |
386 |
+ ff_me_cmp_init(&s->mecc, s->avctx); |
|
387 | 387 |
ff_mpegvideodsp_init(&s->mdsp); |
388 | 388 |
ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample); |
389 | 389 |
|
... | ... |
@@ -1106,7 +1106,7 @@ static int init_er(MpegEncContext *s) |
1106 | 1106 |
int i; |
1107 | 1107 |
|
1108 | 1108 |
er->avctx = s->avctx; |
1109 |
- er->dsp = &s->dsp; |
|
1109 |
+ er->mecc = &s->mecc; |
|
1110 | 1110 |
|
1111 | 1111 |
er->mb_index2xy = s->mb_index2xy; |
1112 | 1112 |
er->mb_num = s->mb_num; |
... | ... |
@@ -30,7 +30,6 @@ |
30 | 30 |
|
31 | 31 |
#include "avcodec.h" |
32 | 32 |
#include "blockdsp.h" |
33 |
-#include "dsputil.h" |
|
34 | 33 |
#include "error_resilience.h" |
35 | 34 |
#include "fdctdsp.h" |
36 | 35 |
#include "get_bits.h" |
... | ... |
@@ -38,6 +37,7 @@ |
38 | 38 |
#include "h263dsp.h" |
39 | 39 |
#include "hpeldsp.h" |
40 | 40 |
#include "idctdsp.h" |
41 |
+#include "me_cmp.h" |
|
41 | 42 |
#include "mpegvideodsp.h" |
42 | 43 |
#include "mpegvideoencdsp.h" |
43 | 44 |
#include "pixblockdsp.h" |
... | ... |
@@ -365,11 +365,11 @@ typedef struct MpegEncContext { |
365 | 365 |
int h263_long_vectors; ///< use horrible h263v1 long vector mode |
366 | 366 |
|
367 | 367 |
BlockDSPContext bdsp; |
368 |
- DSPContext dsp; ///< pointers for accelerated dsp functions |
|
369 | 368 |
FDCTDSPContext fdsp; |
370 | 369 |
H264ChromaContext h264chroma; |
371 | 370 |
HpelDSPContext hdsp; |
372 | 371 |
IDCTDSPContext idsp; |
372 |
+ MECmpContext mecc; |
|
373 | 373 |
MpegVideoDSPContext mdsp; |
374 | 374 |
MpegvideoEncDSPContext mpvencdsp; |
375 | 375 |
PixblockDSPContext pdsp; |
... | ... |
@@ -836,6 +836,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) |
836 | 836 |
return -1; |
837 | 837 |
|
838 | 838 |
ff_fdctdsp_init(&s->fdsp, avctx); |
839 |
+ ff_me_cmp_init(&s->mecc, avctx); |
|
839 | 840 |
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); |
840 | 841 |
ff_pixblockdsp_init(&s->pdsp, avctx); |
841 | 842 |
ff_qpeldsp_init(&s->qdsp); |
... | ... |
@@ -872,8 +873,8 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) |
872 | 872 |
|
873 | 873 |
s->quant_precision = 5; |
874 | 874 |
|
875 |
- ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp); |
|
876 |
- ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp); |
|
875 |
+ ff_set_cmp(&s->mecc, s->mecc.ildct_cmp, s->avctx->ildct_cmp); |
|
876 |
+ ff_set_cmp(&s->mecc, s->mecc.frame_skip_cmp, s->avctx->frame_skip_cmp); |
|
877 | 877 |
|
878 | 878 |
if (CONFIG_H261_ENCODER && s->out_format == FMT_H261) |
879 | 879 |
ff_h261_encode_init(s); |
... | ... |
@@ -1027,8 +1028,8 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, |
1027 | 1027 |
for (y = 0; y < h; y += 16) { |
1028 | 1028 |
for (x = 0; x < w; x += 16) { |
1029 | 1029 |
int offset = x + y * stride; |
1030 |
- int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, |
|
1031 |
- 16); |
|
1030 |
+ int sad = s->mecc.sad[0](NULL, src + offset, ref + offset, |
|
1031 |
+ stride, 16); |
|
1032 | 1032 |
int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; |
1033 | 1033 |
int sae = get_sae(src + offset, mean, stride); |
1034 | 1034 |
|
... | ... |
@@ -1205,7 +1206,7 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref) |
1205 | 1205 |
int off = p->shared ? 0 : 16; |
1206 | 1206 |
uint8_t *dptr = p->f->data[plane] + 8 * (x + y * stride) + off; |
1207 | 1207 |
uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride); |
1208 |
- int v = s->dsp.frame_skip_cmp[1](s, dptr, rptr, stride, 8); |
|
1208 |
+ int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8); |
|
1209 | 1209 |
|
1210 | 1210 |
switch (FFABS(s->avctx->frame_skip_exp)) { |
1211 | 1211 |
case 0: score = FFMAX(score, v); break; |
... | ... |
@@ -2089,16 +2090,15 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
2089 | 2089 |
int progressive_score, interlaced_score; |
2090 | 2090 |
|
2091 | 2091 |
s->interlaced_dct = 0; |
2092 |
- progressive_score = s->dsp.ildct_cmp[4](s, ptr_y, |
|
2093 |
- NULL, wrap_y, 8) + |
|
2094 |
- s->dsp.ildct_cmp[4](s, ptr_y + wrap_y * 8, |
|
2095 |
- NULL, wrap_y, 8) - 400; |
|
2092 |
+ progressive_score = s->mecc.ildct_cmp[4](s, ptr_y, NULL, wrap_y, 8) + |
|
2093 |
+ s->mecc.ildct_cmp[4](s, ptr_y + wrap_y * 8, |
|
2094 |
+ NULL, wrap_y, 8) - 400; |
|
2096 | 2095 |
|
2097 | 2096 |
if (progressive_score > 0) { |
2098 |
- interlaced_score = s->dsp.ildct_cmp[4](s, ptr_y, |
|
2099 |
- NULL, wrap_y * 2, 8) + |
|
2100 |
- s->dsp.ildct_cmp[4](s, ptr_y + wrap_y, |
|
2101 |
- NULL, wrap_y * 2, 8); |
|
2097 |
+ interlaced_score = s->mecc.ildct_cmp[4](s, ptr_y, |
|
2098 |
+ NULL, wrap_y * 2, 8) + |
|
2099 |
+ s->mecc.ildct_cmp[4](s, ptr_y + wrap_y, |
|
2100 |
+ NULL, wrap_y * 2, 8); |
|
2102 | 2101 |
if (progressive_score > interlaced_score) { |
2103 | 2102 |
s->interlaced_dct = 1; |
2104 | 2103 |
|
... | ... |
@@ -2169,23 +2169,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
2169 | 2169 |
int progressive_score, interlaced_score; |
2170 | 2170 |
|
2171 | 2171 |
s->interlaced_dct = 0; |
2172 |
- progressive_score = s->dsp.ildct_cmp[0](s, dest_y, |
|
2173 |
- ptr_y, wrap_y, |
|
2174 |
- 8) + |
|
2175 |
- s->dsp.ildct_cmp[0](s, dest_y + wrap_y * 8, |
|
2176 |
- ptr_y + wrap_y * 8, wrap_y, |
|
2177 |
- 8) - 400; |
|
2172 |
+ progressive_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, wrap_y, 8) + |
|
2173 |
+ s->mecc.ildct_cmp[0](s, dest_y + wrap_y * 8, |
|
2174 |
+ ptr_y + wrap_y * 8, |
|
2175 |
+ wrap_y, 8) - 400; |
|
2178 | 2176 |
|
2179 | 2177 |
if (s->avctx->ildct_cmp == FF_CMP_VSSE) |
2180 | 2178 |
progressive_score -= 400; |
2181 | 2179 |
|
2182 | 2180 |
if (progressive_score > 0) { |
2183 |
- interlaced_score = s->dsp.ildct_cmp[0](s, dest_y, |
|
2184 |
- ptr_y, |
|
2185 |
- wrap_y * 2, 8) + |
|
2186 |
- s->dsp.ildct_cmp[0](s, dest_y + wrap_y, |
|
2187 |
- ptr_y + wrap_y, |
|
2188 |
- wrap_y * 2, 8); |
|
2181 |
+ interlaced_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, |
|
2182 |
+ wrap_y * 2, 8) + |
|
2183 |
+ s->mecc.ildct_cmp[0](s, dest_y + wrap_y, |
|
2184 |
+ ptr_y + wrap_y, |
|
2185 |
+ wrap_y * 2, 8); |
|
2189 | 2186 |
|
2190 | 2187 |
if (progressive_score > interlaced_score) { |
2191 | 2188 |
s->interlaced_dct = 1; |
... | ... |
@@ -2223,33 +2220,28 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
2223 | 2223 |
if (s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] < |
2224 | 2224 |
2 * s->qscale * s->qscale) { |
2225 | 2225 |
// FIXME optimize |
2226 |
- if (s->dsp.sad[1](NULL, ptr_y , dest_y, |
|
2227 |
- wrap_y, 8) < 20 * s->qscale) |
|
2226 |
+ if (s->mecc.sad[1](NULL, ptr_y, dest_y, wrap_y, 8) < 20 * s->qscale) |
|
2228 | 2227 |
skip_dct[0] = 1; |
2229 |
- if (s->dsp.sad[1](NULL, ptr_y + 8, |
|
2230 |
- dest_y + 8, wrap_y, 8) < 20 * s->qscale) |
|
2228 |
+ if (s->mecc.sad[1](NULL, ptr_y + 8, dest_y + 8, wrap_y, 8) < 20 * s->qscale) |
|
2231 | 2229 |
skip_dct[1] = 1; |
2232 |
- if (s->dsp.sad[1](NULL, ptr_y + dct_offset, |
|
2233 |
- dest_y + dct_offset, wrap_y, 8) < 20 * s->qscale) |
|
2230 |
+ if (s->mecc.sad[1](NULL, ptr_y + dct_offset, dest_y + dct_offset, |
|
2231 |
+ wrap_y, 8) < 20 * s->qscale) |
|
2234 | 2232 |
skip_dct[2] = 1; |
2235 |
- if (s->dsp.sad[1](NULL, ptr_y + dct_offset + 8, |
|
2236 |
- dest_y + dct_offset + 8, |
|
2237 |
- wrap_y, 8) < 20 * s->qscale) |
|
2233 |
+ if (s->mecc.sad[1](NULL, ptr_y + dct_offset + 8, dest_y + dct_offset + 8, |
|
2234 |
+ wrap_y, 8) < 20 * s->qscale) |
|
2238 | 2235 |
skip_dct[3] = 1; |
2239 |
- if (s->dsp.sad[1](NULL, ptr_cb, dest_cb, |
|
2240 |
- wrap_c, 8) < 20 * s->qscale) |
|
2236 |
+ if (s->mecc.sad[1](NULL, ptr_cb, dest_cb, wrap_c, 8) < 20 * s->qscale) |
|
2241 | 2237 |
skip_dct[4] = 1; |
2242 |
- if (s->dsp.sad[1](NULL, ptr_cr, dest_cr, |
|
2243 |
- wrap_c, 8) < 20 * s->qscale) |
|
2238 |
+ if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale) |
|
2244 | 2239 |
skip_dct[5] = 1; |
2245 | 2240 |
if (!s->chroma_y_shift) { /* 422 */ |
2246 |
- if (s->dsp.sad[1](NULL, ptr_cb + uv_dct_offset, |
|
2247 |
- dest_cb + uv_dct_offset, |
|
2248 |
- wrap_c, 8) < 20 * s->qscale) |
|
2241 |
+ if (s->mecc.sad[1](NULL, ptr_cb + uv_dct_offset, |
|
2242 |
+ dest_cb + uv_dct_offset, |
|
2243 |
+ wrap_c, 8) < 20 * s->qscale) |
|
2249 | 2244 |
skip_dct[6] = 1; |
2250 |
- if (s->dsp.sad[1](NULL, ptr_cr + uv_dct_offset, |
|
2251 |
- dest_cr + uv_dct_offset, |
|
2252 |
- wrap_c, 8) < 20 * s->qscale) |
|
2245 |
+ if (s->mecc.sad[1](NULL, ptr_cr + uv_dct_offset, |
|
2246 |
+ dest_cr + uv_dct_offset, |
|
2247 |
+ wrap_c, 8) < 20 * s->qscale) |
|
2253 | 2248 |
skip_dct[7] = 1; |
2254 | 2249 |
} |
2255 | 2250 |
} |
... | ... |
@@ -2522,9 +2514,9 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in |
2522 | 2522 |
int x,y; |
2523 | 2523 |
|
2524 | 2524 |
if(w==16 && h==16) |
2525 |
- return s->dsp.sse[0](NULL, src1, src2, stride, 16); |
|
2525 |
+ return s->mecc.sse[0](NULL, src1, src2, stride, 16); |
|
2526 | 2526 |
else if(w==8 && h==8) |
2527 |
- return s->dsp.sse[1](NULL, src1, src2, stride, 8); |
|
2527 |
+ return s->mecc.sse[1](NULL, src1, src2, stride, 8); |
|
2528 | 2528 |
|
2529 | 2529 |
for(y=0; y<h; y++){ |
2530 | 2530 |
for(x=0; x<w; x++){ |
... | ... |
@@ -2546,13 +2538,13 @@ static int sse_mb(MpegEncContext *s){ |
2546 | 2546 |
|
2547 | 2547 |
if(w==16 && h==16) |
2548 | 2548 |
if(s->avctx->mb_cmp == FF_CMP_NSSE){ |
2549 |
- return s->dsp.nsse[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) |
|
2550 |
- +s->dsp.nsse[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) |
|
2551 |
- +s->dsp.nsse[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); |
|
2549 |
+ return s->mecc.nsse[0](s, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + |
|
2550 |
+ s->mecc.nsse[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + |
|
2551 |
+ s->mecc.nsse[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); |
|
2552 | 2552 |
}else{ |
2553 |
- return s->dsp.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) |
|
2554 |
- +s->dsp.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) |
|
2555 |
- +s->dsp.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); |
|
2553 |
+ return s->mecc.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + |
|
2554 |
+ s->mecc.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + |
|
2555 |
+ s->mecc.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); |
|
2556 | 2556 |
} |
2557 | 2557 |
else |
2558 | 2558 |
return sse(s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize) |
... | ... |
@@ -25,8 +25,8 @@ |
25 | 25 |
#include "libavutil/attributes.h" |
26 | 26 |
#include "libavutil/imgutils.h" |
27 | 27 |
#include "avcodec.h" |
28 |
-#include "dsputil.h" |
|
29 | 28 |
#include "imgconvert.h" |
29 |
+#include "me_cmp.h" |
|
30 | 30 |
#include "mpegvideoencdsp.h" |
31 | 31 |
|
32 | 32 |
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], |
... | ... |
@@ -2,7 +2,6 @@ OBJS += ppc/fmtconvert_altivec.o \ |
2 | 2 |
|
3 | 3 |
OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o |
4 | 4 |
OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o |
5 |
-OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_altivec.o |
|
6 | 5 |
OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o |
7 | 6 |
OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o |
8 | 7 |
OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o ppc/hpeldsp_altivec.o |
... | ... |
@@ -11,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o |
11 | 11 |
OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o |
12 | 12 |
OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o |
13 | 13 |
OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o |
14 |
+OBJS-$(CONFIG_ME_CMP) += ppc/me_cmp.o |
|
14 | 15 |
OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o |
15 | 16 |
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ |
16 | 17 |
ppc/mpegvideodsp.o |
17 | 18 |
deleted file mode 100644 |
... | ... |
@@ -1,767 +0,0 @@ |
1 |
-/* |
|
2 |
- * Copyright (c) 2002 Brian Foley |
|
3 |
- * Copyright (c) 2002 Dieter Shirley |
|
4 |
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
|
5 |
- * |
|
6 |
- * This file is part of FFmpeg. |
|
7 |
- * |
|
8 |
- * FFmpeg is free software; you can redistribute it and/or |
|
9 |
- * modify it under the terms of the GNU Lesser General Public |
|
10 |
- * License as published by the Free Software Foundation; either |
|
11 |
- * version 2.1 of the License, or (at your option) any later version. |
|
12 |
- * |
|
13 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
14 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
- * Lesser General Public License for more details. |
|
17 |
- * |
|
18 |
- * You should have received a copy of the GNU Lesser General Public |
|
19 |
- * License along with FFmpeg; if not, write to the Free Software |
|
20 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
- */ |
|
22 |
- |
|
23 |
-#include "config.h" |
|
24 |
-#if HAVE_ALTIVEC_H |
|
25 |
-#include <altivec.h> |
|
26 |
-#endif |
|
27 |
- |
|
28 |
-#include "libavutil/attributes.h" |
|
29 |
-#include "libavutil/cpu.h" |
|
30 |
-#include "libavutil/ppc/cpu.h" |
|
31 |
-#include "libavutil/ppc/types_altivec.h" |
|
32 |
-#include "libavutil/ppc/util_altivec.h" |
|
33 |
-#include "libavcodec/avcodec.h" |
|
34 |
-#include "libavcodec/dsputil.h" |
|
35 |
-#include "libavcodec/mpegvideo.h" |
|
36 |
- |
|
37 |
-#if HAVE_ALTIVEC |
|
38 |
-static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
39 |
- int line_size, int h) |
|
40 |
-{ |
|
41 |
- int i, s = 0; |
|
42 |
- const vector unsigned char zero = |
|
43 |
- (const vector unsigned char) vec_splat_u8(0); |
|
44 |
- vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
45 |
- vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
46 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
47 |
- vector signed int sumdiffs; |
|
48 |
- |
|
49 |
- for (i = 0; i < h; i++) { |
|
50 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
51 |
- * pix1v: pix1[0] - pix1[15] |
|
52 |
- * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ |
|
53 |
- vector unsigned char pix1v = vec_ld(0, pix1); |
|
54 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
55 |
- vector unsigned char pix2r = vec_ld(16, pix2); |
|
56 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
57 |
- vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
58 |
- |
|
59 |
- /* Calculate the average vector. */ |
|
60 |
- vector unsigned char avgv = vec_avg(pix2v, pix2iv); |
|
61 |
- |
|
62 |
- /* Calculate a sum of abs differences vector. */ |
|
63 |
- vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), |
|
64 |
- vec_min(pix1v, avgv)); |
|
65 |
- |
|
66 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
67 |
- sad = vec_sum4s(t5, sad); |
|
68 |
- |
|
69 |
- pix1 += line_size; |
|
70 |
- pix2 += line_size; |
|
71 |
- } |
|
72 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
73 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
74 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
75 |
- vec_ste(sumdiffs, 0, &s); |
|
76 |
- |
|
77 |
- return s; |
|
78 |
-} |
|
79 |
- |
|
80 |
-static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
81 |
- int line_size, int h) |
|
82 |
-{ |
|
83 |
- int i, s = 0; |
|
84 |
- const vector unsigned char zero = |
|
85 |
- (const vector unsigned char) vec_splat_u8(0); |
|
86 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
87 |
- vector unsigned char pix1v, pix3v, avgv, t5; |
|
88 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
89 |
- vector signed int sumdiffs; |
|
90 |
- uint8_t *pix3 = pix2 + line_size; |
|
91 |
- |
|
92 |
- /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
93 |
- * iteration becomes pix2 in the next iteration. We can use this |
|
94 |
- * fact to avoid a potentially expensive unaligned read, each |
|
95 |
- * time around the loop. |
|
96 |
- * Read unaligned pixels into our vectors. The vectors are as follows: |
|
97 |
- * pix2v: pix2[0] - pix2[15] |
|
98 |
- * Split the pixel vectors into shorts. */ |
|
99 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
100 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
101 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); |
|
102 |
- |
|
103 |
- for (i = 0; i < h; i++) { |
|
104 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
105 |
- * pix1v: pix1[0] - pix1[15] |
|
106 |
- * pix3v: pix3[0] - pix3[15] */ |
|
107 |
- pix1v = vec_ld(0, pix1); |
|
108 |
- |
|
109 |
- pix2l = vec_ld(0, pix3); |
|
110 |
- pix2r = vec_ld(15, pix3); |
|
111 |
- pix3v = vec_perm(pix2l, pix2r, perm); |
|
112 |
- |
|
113 |
- /* Calculate the average vector. */ |
|
114 |
- avgv = vec_avg(pix2v, pix3v); |
|
115 |
- |
|
116 |
- /* Calculate a sum of abs differences vector. */ |
|
117 |
- t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
118 |
- |
|
119 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
120 |
- sad = vec_sum4s(t5, sad); |
|
121 |
- |
|
122 |
- pix1 += line_size; |
|
123 |
- pix2v = pix3v; |
|
124 |
- pix3 += line_size; |
|
125 |
- } |
|
126 |
- |
|
127 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
128 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
129 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
130 |
- vec_ste(sumdiffs, 0, &s); |
|
131 |
- return s; |
|
132 |
-} |
|
133 |
- |
|
134 |
-static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
135 |
- int line_size, int h) |
|
136 |
-{ |
|
137 |
- int i, s = 0; |
|
138 |
- uint8_t *pix3 = pix2 + line_size; |
|
139 |
- const vector unsigned char zero = |
|
140 |
- (const vector unsigned char) vec_splat_u8(0); |
|
141 |
- const vector unsigned short two = |
|
142 |
- (const vector unsigned short) vec_splat_u16(2); |
|
143 |
- vector unsigned char avgv, t5; |
|
144 |
- vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
145 |
- vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
146 |
- vector unsigned char pix1v, pix3v, pix3iv; |
|
147 |
- vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
|
148 |
- vector unsigned short avghv, avglv; |
|
149 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
150 |
- vector signed int sumdiffs; |
|
151 |
- |
|
152 |
- /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
153 |
- * iteration becomes pix2 in the next iteration. We can use this |
|
154 |
- * fact to avoid a potentially expensive unaligned read, as well |
|
155 |
- * as some splitting, and vector addition each time around the loop. |
|
156 |
- * Read unaligned pixels into our vectors. The vectors are as follows: |
|
157 |
- * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] |
|
158 |
- * Split the pixel vectors into shorts. */ |
|
159 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
160 |
- vector unsigned char pix2r = vec_ld(16, pix2); |
|
161 |
- vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
162 |
- vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
163 |
- |
|
164 |
- vector unsigned short pix2hv = |
|
165 |
- (vector unsigned short) vec_mergeh(zero, pix2v); |
|
166 |
- vector unsigned short pix2lv = |
|
167 |
- (vector unsigned short) vec_mergel(zero, pix2v); |
|
168 |
- vector unsigned short pix2ihv = |
|
169 |
- (vector unsigned short) vec_mergeh(zero, pix2iv); |
|
170 |
- vector unsigned short pix2ilv = |
|
171 |
- (vector unsigned short) vec_mergel(zero, pix2iv); |
|
172 |
- vector unsigned short t1 = vec_add(pix2hv, pix2ihv); |
|
173 |
- vector unsigned short t2 = vec_add(pix2lv, pix2ilv); |
|
174 |
- vector unsigned short t3, t4; |
|
175 |
- |
|
176 |
- for (i = 0; i < h; i++) { |
|
177 |
- /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
178 |
- * pix1v: pix1[0] - pix1[15] |
|
179 |
- * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ |
|
180 |
- pix1v = vec_ld(0, pix1); |
|
181 |
- |
|
182 |
- pix2l = vec_ld(0, pix3); |
|
183 |
- pix2r = vec_ld(16, pix3); |
|
184 |
- pix3v = vec_perm(pix2l, pix2r, perm1); |
|
185 |
- pix3iv = vec_perm(pix2l, pix2r, perm2); |
|
186 |
- |
|
187 |
- /* Note that AltiVec does have vec_avg, but this works on vector pairs |
|
188 |
- * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the |
|
189 |
- * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when |
|
190 |
- * it should be 1. Instead, we have to split the pixel vectors into |
|
191 |
- * vectors of shorts and do the averaging by hand. */ |
|
192 |
- |
|
193 |
- /* Split the pixel vectors into shorts. */ |
|
194 |
- pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
|
195 |
- pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
|
196 |
- pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
|
197 |
- pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
|
198 |
- |
|
199 |
- /* Do the averaging on them. */ |
|
200 |
- t3 = vec_add(pix3hv, pix3ihv); |
|
201 |
- t4 = vec_add(pix3lv, pix3ilv); |
|
202 |
- |
|
203 |
- avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
|
204 |
- avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); |
|
205 |
- |
|
206 |
- /* Pack the shorts back into a result. */ |
|
207 |
- avgv = vec_pack(avghv, avglv); |
|
208 |
- |
|
209 |
- /* Calculate a sum of abs differences vector. */ |
|
210 |
- t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
211 |
- |
|
212 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
213 |
- sad = vec_sum4s(t5, sad); |
|
214 |
- |
|
215 |
- pix1 += line_size; |
|
216 |
- pix3 += line_size; |
|
217 |
- /* Transfer the calculated values for pix3 into pix2. */ |
|
218 |
- t1 = t3; |
|
219 |
- t2 = t4; |
|
220 |
- } |
|
221 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
222 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
223 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
224 |
- vec_ste(sumdiffs, 0, &s); |
|
225 |
- |
|
226 |
- return s; |
|
227 |
-} |
|
228 |
- |
|
229 |
-static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
230 |
- int line_size, int h) |
|
231 |
-{ |
|
232 |
- int i, s; |
|
233 |
- const vector unsigned int zero = |
|
234 |
- (const vector unsigned int) vec_splat_u32(0); |
|
235 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
236 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
237 |
- vector signed int sumdiffs; |
|
238 |
- |
|
239 |
- for (i = 0; i < h; i++) { |
|
240 |
- /* Read potentially unaligned pixels into t1 and t2. */ |
|
241 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
242 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
243 |
- vector unsigned char t1 = vec_ld(0, pix1); |
|
244 |
- vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
245 |
- |
|
246 |
- /* Calculate a sum of abs differences vector. */ |
|
247 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
248 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
249 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
250 |
- |
|
251 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
252 |
- sad = vec_sum4s(t5, sad); |
|
253 |
- |
|
254 |
- pix1 += line_size; |
|
255 |
- pix2 += line_size; |
|
256 |
- } |
|
257 |
- |
|
258 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
259 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
260 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
261 |
- vec_ste(sumdiffs, 0, &s); |
|
262 |
- |
|
263 |
- return s; |
|
264 |
-} |
|
265 |
- |
|
266 |
-static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
267 |
- int line_size, int h) |
|
268 |
-{ |
|
269 |
- int i, s; |
|
270 |
- const vector unsigned int zero = |
|
271 |
- (const vector unsigned int) vec_splat_u32(0); |
|
272 |
- const vector unsigned char permclear = |
|
273 |
- (vector unsigned char) |
|
274 |
- { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
275 |
- vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
276 |
- vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
277 |
- vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
278 |
- vector signed int sumdiffs; |
|
279 |
- |
|
280 |
- for (i = 0; i < h; i++) { |
|
281 |
- /* Read potentially unaligned pixels into t1 and t2. |
|
282 |
- * Since we're reading 16 pixels, and actually only want 8, |
|
283 |
- * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
284 |
- vector unsigned char pix1l = vec_ld(0, pix1); |
|
285 |
- vector unsigned char pix1r = vec_ld(7, pix1); |
|
286 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
287 |
- vector unsigned char pix2r = vec_ld(7, pix2); |
|
288 |
- vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
289 |
- permclear); |
|
290 |
- vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
291 |
- permclear); |
|
292 |
- |
|
293 |
- /* Calculate a sum of abs differences vector. */ |
|
294 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
295 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
296 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
297 |
- |
|
298 |
- /* Add each 4 pixel group together and put 4 results into sad. */ |
|
299 |
- sad = vec_sum4s(t5, sad); |
|
300 |
- |
|
301 |
- pix1 += line_size; |
|
302 |
- pix2 += line_size; |
|
303 |
- } |
|
304 |
- |
|
305 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
306 |
- sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
307 |
- sumdiffs = vec_splat(sumdiffs, 3); |
|
308 |
- vec_ste(sumdiffs, 0, &s); |
|
309 |
- |
|
310 |
- return s; |
|
311 |
-} |
|
312 |
- |
|
313 |
-/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. |
|
314 |
- * It's the sad8_altivec code above w/ squaring added. */ |
|
315 |
-static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
316 |
- int line_size, int h) |
|
317 |
-{ |
|
318 |
- int i, s; |
|
319 |
- const vector unsigned int zero = |
|
320 |
- (const vector unsigned int) vec_splat_u32(0); |
|
321 |
- const vector unsigned char permclear = |
|
322 |
- (vector unsigned char) |
|
323 |
- { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
324 |
- vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
325 |
- vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
326 |
- vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
327 |
- vector signed int sumsqr; |
|
328 |
- |
|
329 |
- for (i = 0; i < h; i++) { |
|
330 |
- /* Read potentially unaligned pixels into t1 and t2. |
|
331 |
- * Since we're reading 16 pixels, and actually only want 8, |
|
332 |
- * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
333 |
- vector unsigned char pix1l = vec_ld(0, pix1); |
|
334 |
- vector unsigned char pix1r = vec_ld(7, pix1); |
|
335 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
336 |
- vector unsigned char pix2r = vec_ld(7, pix2); |
|
337 |
- vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
338 |
- permclear); |
|
339 |
- vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
340 |
- permclear); |
|
341 |
- |
|
342 |
- /* Since we want to use unsigned chars, we can take advantage |
|
343 |
- * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
344 |
- |
|
345 |
- /* Calculate abs differences vector. */ |
|
346 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
347 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
348 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
349 |
- |
|
350 |
- /* Square the values and add them to our sum. */ |
|
351 |
- sum = vec_msum(t5, t5, sum); |
|
352 |
- |
|
353 |
- pix1 += line_size; |
|
354 |
- pix2 += line_size; |
|
355 |
- } |
|
356 |
- |
|
357 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
358 |
- sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
359 |
- sumsqr = vec_splat(sumsqr, 3); |
|
360 |
- vec_ste(sumsqr, 0, &s); |
|
361 |
- |
|
362 |
- return s; |
|
363 |
-} |
|
364 |
- |
|
365 |
-/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. |
|
366 |
- * It's the sad16_altivec code above w/ squaring added. */ |
|
367 |
-static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
368 |
- int line_size, int h) |
|
369 |
-{ |
|
370 |
- int i, s; |
|
371 |
- const vector unsigned int zero = |
|
372 |
- (const vector unsigned int) vec_splat_u32(0); |
|
373 |
- vector unsigned char perm = vec_lvsl(0, pix2); |
|
374 |
- vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
375 |
- vector signed int sumsqr; |
|
376 |
- |
|
377 |
- for (i = 0; i < h; i++) { |
|
378 |
- /* Read potentially unaligned pixels into t1 and t2. */ |
|
379 |
- vector unsigned char pix2l = vec_ld(0, pix2); |
|
380 |
- vector unsigned char pix2r = vec_ld(15, pix2); |
|
381 |
- vector unsigned char t1 = vec_ld(0, pix1); |
|
382 |
- vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
383 |
- |
|
384 |
- /* Since we want to use unsigned chars, we can take advantage |
|
385 |
- * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
386 |
- |
|
387 |
- /* Calculate abs differences vector. */ |
|
388 |
- vector unsigned char t3 = vec_max(t1, t2); |
|
389 |
- vector unsigned char t4 = vec_min(t1, t2); |
|
390 |
- vector unsigned char t5 = vec_sub(t3, t4); |
|
391 |
- |
|
392 |
- /* Square the values and add them to our sum. */ |
|
393 |
- sum = vec_msum(t5, t5, sum); |
|
394 |
- |
|
395 |
- pix1 += line_size; |
|
396 |
- pix2 += line_size; |
|
397 |
- } |
|
398 |
- |
|
399 |
- /* Sum up the four partial sums, and put the result into s. */ |
|
400 |
- sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
401 |
- sumsqr = vec_splat(sumsqr, 3); |
|
402 |
- vec_ste(sumsqr, 0, &s); |
|
403 |
- |
|
404 |
- return s; |
|
405 |
-} |
|
406 |
- |
|
407 |
-static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
408 |
- uint8_t *src, int stride, int h) |
|
409 |
-{ |
|
410 |
- int sum; |
|
411 |
- register const vector unsigned char vzero = |
|
412 |
- (const vector unsigned char) vec_splat_u8(0); |
|
413 |
- register vector signed short temp0, temp1, temp2, temp3, temp4, |
|
414 |
- temp5, temp6, temp7; |
|
415 |
- { |
|
416 |
- register const vector signed short vprod1 = |
|
417 |
- (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
418 |
- register const vector signed short vprod2 = |
|
419 |
- (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
420 |
- register const vector signed short vprod3 = |
|
421 |
- (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
422 |
- register const vector unsigned char perm1 = |
|
423 |
- (const vector unsigned char) |
|
424 |
- { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
425 |
- 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
426 |
- register const vector unsigned char perm2 = |
|
427 |
- (const vector unsigned char) |
|
428 |
- { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
429 |
- 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
430 |
- register const vector unsigned char perm3 = |
|
431 |
- (const vector unsigned char) |
|
432 |
- { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
433 |
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
434 |
- |
|
435 |
-#define ONEITERBUTTERFLY(i, res) \ |
|
436 |
- { \ |
|
437 |
- register vector unsigned char src1 = vec_ld(stride * i, src); \ |
|
438 |
- register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ |
|
439 |
- register vector unsigned char srcO = \ |
|
440 |
- vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
441 |
- register vector unsigned char dst1 = vec_ld(stride * i, dst); \ |
|
442 |
- register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ |
|
443 |
- register vector unsigned char dstO = \ |
|
444 |
- vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
445 |
- \ |
|
446 |
- /* Promote the unsigned chars to signed shorts. */ \ |
|
447 |
- /* We're in the 8x8 function, we only care for the first 8. */ \ |
|
448 |
- register vector signed short srcV = \ |
|
449 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
450 |
- (vector signed char) srcO); \ |
|
451 |
- register vector signed short dstV = \ |
|
452 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
453 |
- (vector signed char) dstO); \ |
|
454 |
- \ |
|
455 |
- /* subtractions inside the first butterfly */ \ |
|
456 |
- register vector signed short but0 = vec_sub(srcV, dstV); \ |
|
457 |
- register vector signed short op1 = vec_perm(but0, but0, perm1); \ |
|
458 |
- register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ |
|
459 |
- register vector signed short op2 = vec_perm(but1, but1, perm2); \ |
|
460 |
- register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ |
|
461 |
- register vector signed short op3 = vec_perm(but2, but2, perm3); \ |
|
462 |
- res = vec_mladd(but2, vprod3, op3); \ |
|
463 |
- } |
|
464 |
- ONEITERBUTTERFLY(0, temp0); |
|
465 |
- ONEITERBUTTERFLY(1, temp1); |
|
466 |
- ONEITERBUTTERFLY(2, temp2); |
|
467 |
- ONEITERBUTTERFLY(3, temp3); |
|
468 |
- ONEITERBUTTERFLY(4, temp4); |
|
469 |
- ONEITERBUTTERFLY(5, temp5); |
|
470 |
- ONEITERBUTTERFLY(6, temp6); |
|
471 |
- ONEITERBUTTERFLY(7, temp7); |
|
472 |
- } |
|
473 |
-#undef ONEITERBUTTERFLY |
|
474 |
- { |
|
475 |
- register vector signed int vsum; |
|
476 |
- register vector signed short line0 = vec_add(temp0, temp1); |
|
477 |
- register vector signed short line1 = vec_sub(temp0, temp1); |
|
478 |
- register vector signed short line2 = vec_add(temp2, temp3); |
|
479 |
- register vector signed short line3 = vec_sub(temp2, temp3); |
|
480 |
- register vector signed short line4 = vec_add(temp4, temp5); |
|
481 |
- register vector signed short line5 = vec_sub(temp4, temp5); |
|
482 |
- register vector signed short line6 = vec_add(temp6, temp7); |
|
483 |
- register vector signed short line7 = vec_sub(temp6, temp7); |
|
484 |
- |
|
485 |
- register vector signed short line0B = vec_add(line0, line2); |
|
486 |
- register vector signed short line2B = vec_sub(line0, line2); |
|
487 |
- register vector signed short line1B = vec_add(line1, line3); |
|
488 |
- register vector signed short line3B = vec_sub(line1, line3); |
|
489 |
- register vector signed short line4B = vec_add(line4, line6); |
|
490 |
- register vector signed short line6B = vec_sub(line4, line6); |
|
491 |
- register vector signed short line5B = vec_add(line5, line7); |
|
492 |
- register vector signed short line7B = vec_sub(line5, line7); |
|
493 |
- |
|
494 |
- register vector signed short line0C = vec_add(line0B, line4B); |
|
495 |
- register vector signed short line4C = vec_sub(line0B, line4B); |
|
496 |
- register vector signed short line1C = vec_add(line1B, line5B); |
|
497 |
- register vector signed short line5C = vec_sub(line1B, line5B); |
|
498 |
- register vector signed short line2C = vec_add(line2B, line6B); |
|
499 |
- register vector signed short line6C = vec_sub(line2B, line6B); |
|
500 |
- register vector signed short line3C = vec_add(line3B, line7B); |
|
501 |
- register vector signed short line7C = vec_sub(line3B, line7B); |
|
502 |
- |
|
503 |
- vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
|
504 |
- vsum = vec_sum4s(vec_abs(line1C), vsum); |
|
505 |
- vsum = vec_sum4s(vec_abs(line2C), vsum); |
|
506 |
- vsum = vec_sum4s(vec_abs(line3C), vsum); |
|
507 |
- vsum = vec_sum4s(vec_abs(line4C), vsum); |
|
508 |
- vsum = vec_sum4s(vec_abs(line5C), vsum); |
|
509 |
- vsum = vec_sum4s(vec_abs(line6C), vsum); |
|
510 |
- vsum = vec_sum4s(vec_abs(line7C), vsum); |
|
511 |
- vsum = vec_sums(vsum, (vector signed int) vzero); |
|
512 |
- vsum = vec_splat(vsum, 3); |
|
513 |
- vec_ste(vsum, 0, &sum); |
|
514 |
- } |
|
515 |
- return sum; |
|
516 |
-} |
|
517 |
- |
|
518 |
-/* |
|
519 |
- * 16x8 works with 16 elements; it allows to avoid replicating loads, and |
|
520 |
- * gives the compiler more room for scheduling. It's only used from |
|
521 |
- * inside hadamard8_diff16_altivec. |
|
522 |
- * |
|
523 |
- * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has |
|
524 |
- * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in |
|
525 |
- * registers by itself. The following code includes hand-made register |
|
526 |
- * allocation. It's not clean, but on a 7450 the resulting code is much faster |
|
527 |
- * (best case falls from 700+ cycles to 550). |
|
528 |
- * |
|
529 |
- * xlc doesn't add spill code, but it doesn't know how to schedule for the |
|
530 |
- * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses |
|
531 |
- * 25% fewer instructions...) |
|
532 |
- * |
|
533 |
- * On the 970, the hand-made RA is still a win (around 690 vs. around 780), |
|
534 |
- * but xlc goes to around 660 on the regular C code... |
|
535 |
- */ |
|
536 |
-static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
537 |
- uint8_t *src, int stride, int h) |
|
538 |
-{ |
|
539 |
- int sum; |
|
540 |
- register vector signed short |
|
541 |
- temp0 __asm__ ("v0"), |
|
542 |
- temp1 __asm__ ("v1"), |
|
543 |
- temp2 __asm__ ("v2"), |
|
544 |
- temp3 __asm__ ("v3"), |
|
545 |
- temp4 __asm__ ("v4"), |
|
546 |
- temp5 __asm__ ("v5"), |
|
547 |
- temp6 __asm__ ("v6"), |
|
548 |
- temp7 __asm__ ("v7"); |
|
549 |
- register vector signed short |
|
550 |
- temp0S __asm__ ("v8"), |
|
551 |
- temp1S __asm__ ("v9"), |
|
552 |
- temp2S __asm__ ("v10"), |
|
553 |
- temp3S __asm__ ("v11"), |
|
554 |
- temp4S __asm__ ("v12"), |
|
555 |
- temp5S __asm__ ("v13"), |
|
556 |
- temp6S __asm__ ("v14"), |
|
557 |
- temp7S __asm__ ("v15"); |
|
558 |
- register const vector unsigned char vzero __asm__ ("v31") = |
|
559 |
- (const vector unsigned char) vec_splat_u8(0); |
|
560 |
- { |
|
561 |
- register const vector signed short vprod1 __asm__ ("v16") = |
|
562 |
- (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
563 |
- |
|
564 |
- register const vector signed short vprod2 __asm__ ("v17") = |
|
565 |
- (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
566 |
- |
|
567 |
- register const vector signed short vprod3 __asm__ ("v18") = |
|
568 |
- (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
569 |
- |
|
570 |
- register const vector unsigned char perm1 __asm__ ("v19") = |
|
571 |
- (const vector unsigned char) |
|
572 |
- { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
573 |
- 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
574 |
- |
|
575 |
- register const vector unsigned char perm2 __asm__ ("v20") = |
|
576 |
- (const vector unsigned char) |
|
577 |
- { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
578 |
- 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
579 |
- |
|
580 |
- register const vector unsigned char perm3 __asm__ ("v21") = |
|
581 |
- (const vector unsigned char) |
|
582 |
- { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
583 |
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
584 |
- |
|
585 |
-#define ONEITERBUTTERFLY(i, res1, res2) \ |
|
586 |
- { \ |
|
587 |
- register vector unsigned char src1 __asm__ ("v22") = \ |
|
588 |
- vec_ld(stride * i, src); \ |
|
589 |
- register vector unsigned char src2 __asm__ ("v23") = \ |
|
590 |
- vec_ld(stride * i + 16, src); \ |
|
591 |
- register vector unsigned char srcO __asm__ ("v22") = \ |
|
592 |
- vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
593 |
- register vector unsigned char dst1 __asm__ ("v24") = \ |
|
594 |
- vec_ld(stride * i, dst); \ |
|
595 |
- register vector unsigned char dst2 __asm__ ("v25") = \ |
|
596 |
- vec_ld(stride * i + 16, dst); \ |
|
597 |
- register vector unsigned char dstO __asm__ ("v23") = \ |
|
598 |
- vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
599 |
- \ |
|
600 |
- /* Promote the unsigned chars to signed shorts. */ \ |
|
601 |
- register vector signed short srcV __asm__ ("v24") = \ |
|
602 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
603 |
- (vector signed char) srcO); \ |
|
604 |
- register vector signed short dstV __asm__ ("v25") = \ |
|
605 |
- (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
606 |
- (vector signed char) dstO); \ |
|
607 |
- register vector signed short srcW __asm__ ("v26") = \ |
|
608 |
- (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
609 |
- (vector signed char) srcO); \ |
|
610 |
- register vector signed short dstW __asm__ ("v27") = \ |
|
611 |
- (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
612 |
- (vector signed char) dstO); \ |
|
613 |
- \ |
|
614 |
- /* subtractions inside the first butterfly */ \ |
|
615 |
- register vector signed short but0 __asm__ ("v28") = \ |
|
616 |
- vec_sub(srcV, dstV); \ |
|
617 |
- register vector signed short but0S __asm__ ("v29") = \ |
|
618 |
- vec_sub(srcW, dstW); \ |
|
619 |
- register vector signed short op1 __asm__ ("v30") = \ |
|
620 |
- vec_perm(but0, but0, perm1); \ |
|
621 |
- register vector signed short but1 __asm__ ("v22") = \ |
|
622 |
- vec_mladd(but0, vprod1, op1); \ |
|
623 |
- register vector signed short op1S __asm__ ("v23") = \ |
|
624 |
- vec_perm(but0S, but0S, perm1); \ |
|
625 |
- register vector signed short but1S __asm__ ("v24") = \ |
|
626 |
- vec_mladd(but0S, vprod1, op1S); \ |
|
627 |
- register vector signed short op2 __asm__ ("v25") = \ |
|
628 |
- vec_perm(but1, but1, perm2); \ |
|
629 |
- register vector signed short but2 __asm__ ("v26") = \ |
|
630 |
- vec_mladd(but1, vprod2, op2); \ |
|
631 |
- register vector signed short op2S __asm__ ("v27") = \ |
|
632 |
- vec_perm(but1S, but1S, perm2); \ |
|
633 |
- register vector signed short but2S __asm__ ("v28") = \ |
|
634 |
- vec_mladd(but1S, vprod2, op2S); \ |
|
635 |
- register vector signed short op3 __asm__ ("v29") = \ |
|
636 |
- vec_perm(but2, but2, perm3); \ |
|
637 |
- register vector signed short op3S __asm__ ("v30") = \ |
|
638 |
- vec_perm(but2S, but2S, perm3); \ |
|
639 |
- res1 = vec_mladd(but2, vprod3, op3); \ |
|
640 |
- res2 = vec_mladd(but2S, vprod3, op3S); \ |
|
641 |
- } |
|
642 |
- ONEITERBUTTERFLY(0, temp0, temp0S); |
|
643 |
- ONEITERBUTTERFLY(1, temp1, temp1S); |
|
644 |
- ONEITERBUTTERFLY(2, temp2, temp2S); |
|
645 |
- ONEITERBUTTERFLY(3, temp3, temp3S); |
|
646 |
- ONEITERBUTTERFLY(4, temp4, temp4S); |
|
647 |
- ONEITERBUTTERFLY(5, temp5, temp5S); |
|
648 |
- ONEITERBUTTERFLY(6, temp6, temp6S); |
|
649 |
- ONEITERBUTTERFLY(7, temp7, temp7S); |
|
650 |
- } |
|
651 |
-#undef ONEITERBUTTERFLY |
|
652 |
- { |
|
653 |
- register vector signed int vsum; |
|
654 |
- |
|
655 |
- register vector signed short line0 = vec_add(temp0, temp1); |
|
656 |
- register vector signed short line1 = vec_sub(temp0, temp1); |
|
657 |
- register vector signed short line2 = vec_add(temp2, temp3); |
|
658 |
- register vector signed short line3 = vec_sub(temp2, temp3); |
|
659 |
- register vector signed short line4 = vec_add(temp4, temp5); |
|
660 |
- register vector signed short line5 = vec_sub(temp4, temp5); |
|
661 |
- register vector signed short line6 = vec_add(temp6, temp7); |
|
662 |
- register vector signed short line7 = vec_sub(temp6, temp7); |
|
663 |
- |
|
664 |
- register vector signed short line0B = vec_add(line0, line2); |
|
665 |
- register vector signed short line2B = vec_sub(line0, line2); |
|
666 |
- register vector signed short line1B = vec_add(line1, line3); |
|
667 |
- register vector signed short line3B = vec_sub(line1, line3); |
|
668 |
- register vector signed short line4B = vec_add(line4, line6); |
|
669 |
- register vector signed short line6B = vec_sub(line4, line6); |
|
670 |
- register vector signed short line5B = vec_add(line5, line7); |
|
671 |
- register vector signed short line7B = vec_sub(line5, line7); |
|
672 |
- |
|
673 |
- register vector signed short line0C = vec_add(line0B, line4B); |
|
674 |
- register vector signed short line4C = vec_sub(line0B, line4B); |
|
675 |
- register vector signed short line1C = vec_add(line1B, line5B); |
|
676 |
- register vector signed short line5C = vec_sub(line1B, line5B); |
|
677 |
- register vector signed short line2C = vec_add(line2B, line6B); |
|
678 |
- register vector signed short line6C = vec_sub(line2B, line6B); |
|
679 |
- register vector signed short line3C = vec_add(line3B, line7B); |
|
680 |
- register vector signed short line7C = vec_sub(line3B, line7B); |
|
681 |
- |
|
682 |
- register vector signed short line0S = vec_add(temp0S, temp1S); |
|
683 |
- register vector signed short line1S = vec_sub(temp0S, temp1S); |
|
684 |
- register vector signed short line2S = vec_add(temp2S, temp3S); |
|
685 |
- register vector signed short line3S = vec_sub(temp2S, temp3S); |
|
686 |
- register vector signed short line4S = vec_add(temp4S, temp5S); |
|
687 |
- register vector signed short line5S = vec_sub(temp4S, temp5S); |
|
688 |
- register vector signed short line6S = vec_add(temp6S, temp7S); |
|
689 |
- register vector signed short line7S = vec_sub(temp6S, temp7S); |
|
690 |
- |
|
691 |
- register vector signed short line0BS = vec_add(line0S, line2S); |
|
692 |
- register vector signed short line2BS = vec_sub(line0S, line2S); |
|
693 |
- register vector signed short line1BS = vec_add(line1S, line3S); |
|
694 |
- register vector signed short line3BS = vec_sub(line1S, line3S); |
|
695 |
- register vector signed short line4BS = vec_add(line4S, line6S); |
|
696 |
- register vector signed short line6BS = vec_sub(line4S, line6S); |
|
697 |
- register vector signed short line5BS = vec_add(line5S, line7S); |
|
698 |
- register vector signed short line7BS = vec_sub(line5S, line7S); |
|
699 |
- |
|
700 |
- register vector signed short line0CS = vec_add(line0BS, line4BS); |
|
701 |
- register vector signed short line4CS = vec_sub(line0BS, line4BS); |
|
702 |
- register vector signed short line1CS = vec_add(line1BS, line5BS); |
|
703 |
- register vector signed short line5CS = vec_sub(line1BS, line5BS); |
|
704 |
- register vector signed short line2CS = vec_add(line2BS, line6BS); |
|
705 |
- register vector signed short line6CS = vec_sub(line2BS, line6BS); |
|
706 |
- register vector signed short line3CS = vec_add(line3BS, line7BS); |
|
707 |
- register vector signed short line7CS = vec_sub(line3BS, line7BS); |
|
708 |
- |
|
709 |
- vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
|
710 |
- vsum = vec_sum4s(vec_abs(line1C), vsum); |
|
711 |
- vsum = vec_sum4s(vec_abs(line2C), vsum); |
|
712 |
- vsum = vec_sum4s(vec_abs(line3C), vsum); |
|
713 |
- vsum = vec_sum4s(vec_abs(line4C), vsum); |
|
714 |
- vsum = vec_sum4s(vec_abs(line5C), vsum); |
|
715 |
- vsum = vec_sum4s(vec_abs(line6C), vsum); |
|
716 |
- vsum = vec_sum4s(vec_abs(line7C), vsum); |
|
717 |
- |
|
718 |
- vsum = vec_sum4s(vec_abs(line0CS), vsum); |
|
719 |
- vsum = vec_sum4s(vec_abs(line1CS), vsum); |
|
720 |
- vsum = vec_sum4s(vec_abs(line2CS), vsum); |
|
721 |
- vsum = vec_sum4s(vec_abs(line3CS), vsum); |
|
722 |
- vsum = vec_sum4s(vec_abs(line4CS), vsum); |
|
723 |
- vsum = vec_sum4s(vec_abs(line5CS), vsum); |
|
724 |
- vsum = vec_sum4s(vec_abs(line6CS), vsum); |
|
725 |
- vsum = vec_sum4s(vec_abs(line7CS), vsum); |
|
726 |
- vsum = vec_sums(vsum, (vector signed int) vzero); |
|
727 |
- vsum = vec_splat(vsum, 3); |
|
728 |
- vec_ste(vsum, 0, &sum); |
|
729 |
- } |
|
730 |
- return sum; |
|
731 |
-} |
|
732 |
- |
|
733 |
-static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, |
|
734 |
- uint8_t *src, int stride, int h) |
|
735 |
-{ |
|
736 |
- int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
|
737 |
- |
|
738 |
- if (h == 16) { |
|
739 |
- dst += 8 * stride; |
|
740 |
- src += 8 * stride; |
|
741 |
- score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
|
742 |
- } |
|
743 |
- return score; |
|
744 |
-} |
|
745 |
-#endif /* HAVE_ALTIVEC */ |
|
746 |
- |
|
747 |
-av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx) |
|
748 |
-{ |
|
749 |
-#if HAVE_ALTIVEC |
|
750 |
- if (!PPC_ALTIVEC(av_get_cpu_flags())) |
|
751 |
- return; |
|
752 |
- |
|
753 |
- c->pix_abs[0][1] = sad16_x2_altivec; |
|
754 |
- c->pix_abs[0][2] = sad16_y2_altivec; |
|
755 |
- c->pix_abs[0][3] = sad16_xy2_altivec; |
|
756 |
- c->pix_abs[0][0] = sad16_altivec; |
|
757 |
- c->pix_abs[1][0] = sad8_altivec; |
|
758 |
- |
|
759 |
- c->sad[0] = sad16_altivec; |
|
760 |
- c->sad[1] = sad8_altivec; |
|
761 |
- c->sse[0] = sse16_altivec; |
|
762 |
- c->sse[1] = sse8_altivec; |
|
763 |
- |
|
764 |
- c->hadamard8_diff[0] = hadamard8_diff16_altivec; |
|
765 |
- c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; |
|
766 |
-#endif /* HAVE_ALTIVEC */ |
|
767 |
-} |
768 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,767 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2002 Brian Foley |
|
2 |
+ * Copyright (c) 2002 Dieter Shirley |
|
3 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
|
4 |
+ * |
|
5 |
+ * This file is part of FFmpeg. |
|
6 |
+ * |
|
7 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
8 |
+ * modify it under the terms of the GNU Lesser General Public |
|
9 |
+ * License as published by the Free Software Foundation; either |
|
10 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
11 |
+ * |
|
12 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+ * Lesser General Public License for more details. |
|
16 |
+ * |
|
17 |
+ * You should have received a copy of the GNU Lesser General Public |
|
18 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
19 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#include "config.h" |
|
23 |
+#if HAVE_ALTIVEC_H |
|
24 |
+#include <altivec.h> |
|
25 |
+#endif |
|
26 |
+ |
|
27 |
+#include "libavutil/attributes.h" |
|
28 |
+#include "libavutil/cpu.h" |
|
29 |
+#include "libavutil/ppc/cpu.h" |
|
30 |
+#include "libavutil/ppc/types_altivec.h" |
|
31 |
+#include "libavutil/ppc/util_altivec.h" |
|
32 |
+#include "libavcodec/avcodec.h" |
|
33 |
+#include "libavcodec/mpegvideo.h" |
|
34 |
+#include "libavcodec/me_cmp.h" |
|
35 |
+ |
|
36 |
+#if HAVE_ALTIVEC |
|
37 |
+static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
38 |
+ int line_size, int h) |
|
39 |
+{ |
|
40 |
+ int i, s = 0; |
|
41 |
+ const vector unsigned char zero = |
|
42 |
+ (const vector unsigned char) vec_splat_u8(0); |
|
43 |
+ vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
44 |
+ vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
45 |
+ vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
46 |
+ vector signed int sumdiffs; |
|
47 |
+ |
|
48 |
+ for (i = 0; i < h; i++) { |
|
49 |
+ /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
50 |
+ * pix1v: pix1[0] - pix1[15] |
|
51 |
+ * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ |
|
52 |
+ vector unsigned char pix1v = vec_ld(0, pix1); |
|
53 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
54 |
+ vector unsigned char pix2r = vec_ld(16, pix2); |
|
55 |
+ vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
56 |
+ vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
57 |
+ |
|
58 |
+ /* Calculate the average vector. */ |
|
59 |
+ vector unsigned char avgv = vec_avg(pix2v, pix2iv); |
|
60 |
+ |
|
61 |
+ /* Calculate a sum of abs differences vector. */ |
|
62 |
+ vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), |
|
63 |
+ vec_min(pix1v, avgv)); |
|
64 |
+ |
|
65 |
+ /* Add each 4 pixel group together and put 4 results into sad. */ |
|
66 |
+ sad = vec_sum4s(t5, sad); |
|
67 |
+ |
|
68 |
+ pix1 += line_size; |
|
69 |
+ pix2 += line_size; |
|
70 |
+ } |
|
71 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
72 |
+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
73 |
+ sumdiffs = vec_splat(sumdiffs, 3); |
|
74 |
+ vec_ste(sumdiffs, 0, &s); |
|
75 |
+ |
|
76 |
+ return s; |
|
77 |
+} |
|
78 |
+ |
|
79 |
+static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
80 |
+ int line_size, int h) |
|
81 |
+{ |
|
82 |
+ int i, s = 0; |
|
83 |
+ const vector unsigned char zero = |
|
84 |
+ (const vector unsigned char) vec_splat_u8(0); |
|
85 |
+ vector unsigned char perm = vec_lvsl(0, pix2); |
|
86 |
+ vector unsigned char pix1v, pix3v, avgv, t5; |
|
87 |
+ vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
88 |
+ vector signed int sumdiffs; |
|
89 |
+ uint8_t *pix3 = pix2 + line_size; |
|
90 |
+ |
|
91 |
+ /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
92 |
+ * iteration becomes pix2 in the next iteration. We can use this |
|
93 |
+ * fact to avoid a potentially expensive unaligned read, each |
|
94 |
+ * time around the loop. |
|
95 |
+ * Read unaligned pixels into our vectors. The vectors are as follows: |
|
96 |
+ * pix2v: pix2[0] - pix2[15] |
|
97 |
+ * Split the pixel vectors into shorts. */ |
|
98 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
99 |
+ vector unsigned char pix2r = vec_ld(15, pix2); |
|
100 |
+ vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); |
|
101 |
+ |
|
102 |
+ for (i = 0; i < h; i++) { |
|
103 |
+ /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
104 |
+ * pix1v: pix1[0] - pix1[15] |
|
105 |
+ * pix3v: pix3[0] - pix3[15] */ |
|
106 |
+ pix1v = vec_ld(0, pix1); |
|
107 |
+ |
|
108 |
+ pix2l = vec_ld(0, pix3); |
|
109 |
+ pix2r = vec_ld(15, pix3); |
|
110 |
+ pix3v = vec_perm(pix2l, pix2r, perm); |
|
111 |
+ |
|
112 |
+ /* Calculate the average vector. */ |
|
113 |
+ avgv = vec_avg(pix2v, pix3v); |
|
114 |
+ |
|
115 |
+ /* Calculate a sum of abs differences vector. */ |
|
116 |
+ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
117 |
+ |
|
118 |
+ /* Add each 4 pixel group together and put 4 results into sad. */ |
|
119 |
+ sad = vec_sum4s(t5, sad); |
|
120 |
+ |
|
121 |
+ pix1 += line_size; |
|
122 |
+ pix2v = pix3v; |
|
123 |
+ pix3 += line_size; |
|
124 |
+ } |
|
125 |
+ |
|
126 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
127 |
+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
128 |
+ sumdiffs = vec_splat(sumdiffs, 3); |
|
129 |
+ vec_ste(sumdiffs, 0, &s); |
|
130 |
+ return s; |
|
131 |
+} |
|
132 |
+ |
|
133 |
+static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
134 |
+ int line_size, int h) |
|
135 |
+{ |
|
136 |
+ int i, s = 0; |
|
137 |
+ uint8_t *pix3 = pix2 + line_size; |
|
138 |
+ const vector unsigned char zero = |
|
139 |
+ (const vector unsigned char) vec_splat_u8(0); |
|
140 |
+ const vector unsigned short two = |
|
141 |
+ (const vector unsigned short) vec_splat_u16(2); |
|
142 |
+ vector unsigned char avgv, t5; |
|
143 |
+ vector unsigned char perm1 = vec_lvsl(0, pix2); |
|
144 |
+ vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
|
145 |
+ vector unsigned char pix1v, pix3v, pix3iv; |
|
146 |
+ vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
|
147 |
+ vector unsigned short avghv, avglv; |
|
148 |
+ vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
149 |
+ vector signed int sumdiffs; |
|
150 |
+ |
|
151 |
+ /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
|
152 |
+ * iteration becomes pix2 in the next iteration. We can use this |
|
153 |
+ * fact to avoid a potentially expensive unaligned read, as well |
|
154 |
+ * as some splitting, and vector addition each time around the loop. |
|
155 |
+ * Read unaligned pixels into our vectors. The vectors are as follows: |
|
156 |
+ * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] |
|
157 |
+ * Split the pixel vectors into shorts. */ |
|
158 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
159 |
+ vector unsigned char pix2r = vec_ld(16, pix2); |
|
160 |
+ vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
|
161 |
+ vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
|
162 |
+ |
|
163 |
+ vector unsigned short pix2hv = |
|
164 |
+ (vector unsigned short) vec_mergeh(zero, pix2v); |
|
165 |
+ vector unsigned short pix2lv = |
|
166 |
+ (vector unsigned short) vec_mergel(zero, pix2v); |
|
167 |
+ vector unsigned short pix2ihv = |
|
168 |
+ (vector unsigned short) vec_mergeh(zero, pix2iv); |
|
169 |
+ vector unsigned short pix2ilv = |
|
170 |
+ (vector unsigned short) vec_mergel(zero, pix2iv); |
|
171 |
+ vector unsigned short t1 = vec_add(pix2hv, pix2ihv); |
|
172 |
+ vector unsigned short t2 = vec_add(pix2lv, pix2ilv); |
|
173 |
+ vector unsigned short t3, t4; |
|
174 |
+ |
|
175 |
+ for (i = 0; i < h; i++) { |
|
176 |
+ /* Read unaligned pixels into our vectors. The vectors are as follows: |
|
177 |
+ * pix1v: pix1[0] - pix1[15] |
|
178 |
+ * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ |
|
179 |
+ pix1v = vec_ld(0, pix1); |
|
180 |
+ |
|
181 |
+ pix2l = vec_ld(0, pix3); |
|
182 |
+ pix2r = vec_ld(16, pix3); |
|
183 |
+ pix3v = vec_perm(pix2l, pix2r, perm1); |
|
184 |
+ pix3iv = vec_perm(pix2l, pix2r, perm2); |
|
185 |
+ |
|
186 |
+ /* Note that AltiVec does have vec_avg, but this works on vector pairs |
|
187 |
+ * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the |
|
188 |
+ * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when |
|
189 |
+ * it should be 1. Instead, we have to split the pixel vectors into |
|
190 |
+ * vectors of shorts and do the averaging by hand. */ |
|
191 |
+ |
|
192 |
+ /* Split the pixel vectors into shorts. */ |
|
193 |
+ pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
|
194 |
+ pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
|
195 |
+ pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
|
196 |
+ pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
|
197 |
+ |
|
198 |
+ /* Do the averaging on them. */ |
|
199 |
+ t3 = vec_add(pix3hv, pix3ihv); |
|
200 |
+ t4 = vec_add(pix3lv, pix3ilv); |
|
201 |
+ |
|
202 |
+ avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
|
203 |
+ avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); |
|
204 |
+ |
|
205 |
+ /* Pack the shorts back into a result. */ |
|
206 |
+ avgv = vec_pack(avghv, avglv); |
|
207 |
+ |
|
208 |
+ /* Calculate a sum of abs differences vector. */ |
|
209 |
+ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
|
210 |
+ |
|
211 |
+ /* Add each 4 pixel group together and put 4 results into sad. */ |
|
212 |
+ sad = vec_sum4s(t5, sad); |
|
213 |
+ |
|
214 |
+ pix1 += line_size; |
|
215 |
+ pix3 += line_size; |
|
216 |
+ /* Transfer the calculated values for pix3 into pix2. */ |
|
217 |
+ t1 = t3; |
|
218 |
+ t2 = t4; |
|
219 |
+ } |
|
220 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
221 |
+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
222 |
+ sumdiffs = vec_splat(sumdiffs, 3); |
|
223 |
+ vec_ste(sumdiffs, 0, &s); |
|
224 |
+ |
|
225 |
+ return s; |
|
226 |
+} |
|
227 |
+ |
|
228 |
+static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
229 |
+ int line_size, int h) |
|
230 |
+{ |
|
231 |
+ int i, s; |
|
232 |
+ const vector unsigned int zero = |
|
233 |
+ (const vector unsigned int) vec_splat_u32(0); |
|
234 |
+ vector unsigned char perm = vec_lvsl(0, pix2); |
|
235 |
+ vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
236 |
+ vector signed int sumdiffs; |
|
237 |
+ |
|
238 |
+ for (i = 0; i < h; i++) { |
|
239 |
+ /* Read potentially unaligned pixels into t1 and t2. */ |
|
240 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
241 |
+ vector unsigned char pix2r = vec_ld(15, pix2); |
|
242 |
+ vector unsigned char t1 = vec_ld(0, pix1); |
|
243 |
+ vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
244 |
+ |
|
245 |
+ /* Calculate a sum of abs differences vector. */ |
|
246 |
+ vector unsigned char t3 = vec_max(t1, t2); |
|
247 |
+ vector unsigned char t4 = vec_min(t1, t2); |
|
248 |
+ vector unsigned char t5 = vec_sub(t3, t4); |
|
249 |
+ |
|
250 |
+ /* Add each 4 pixel group together and put 4 results into sad. */ |
|
251 |
+ sad = vec_sum4s(t5, sad); |
|
252 |
+ |
|
253 |
+ pix1 += line_size; |
|
254 |
+ pix2 += line_size; |
|
255 |
+ } |
|
256 |
+ |
|
257 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
258 |
+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
259 |
+ sumdiffs = vec_splat(sumdiffs, 3); |
|
260 |
+ vec_ste(sumdiffs, 0, &s); |
|
261 |
+ |
|
262 |
+ return s; |
|
263 |
+} |
|
264 |
+ |
|
265 |
+static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
266 |
+ int line_size, int h) |
|
267 |
+{ |
|
268 |
+ int i, s; |
|
269 |
+ const vector unsigned int zero = |
|
270 |
+ (const vector unsigned int) vec_splat_u32(0); |
|
271 |
+ const vector unsigned char permclear = |
|
272 |
+ (vector unsigned char) |
|
273 |
+ { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
274 |
+ vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
275 |
+ vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
276 |
+ vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
|
277 |
+ vector signed int sumdiffs; |
|
278 |
+ |
|
279 |
+ for (i = 0; i < h; i++) { |
|
280 |
+ /* Read potentially unaligned pixels into t1 and t2. |
|
281 |
+ * Since we're reading 16 pixels, and actually only want 8, |
|
282 |
+ * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
283 |
+ vector unsigned char pix1l = vec_ld(0, pix1); |
|
284 |
+ vector unsigned char pix1r = vec_ld(7, pix1); |
|
285 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
286 |
+ vector unsigned char pix2r = vec_ld(7, pix2); |
|
287 |
+ vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
288 |
+ permclear); |
|
289 |
+ vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
290 |
+ permclear); |
|
291 |
+ |
|
292 |
+ /* Calculate a sum of abs differences vector. */ |
|
293 |
+ vector unsigned char t3 = vec_max(t1, t2); |
|
294 |
+ vector unsigned char t4 = vec_min(t1, t2); |
|
295 |
+ vector unsigned char t5 = vec_sub(t3, t4); |
|
296 |
+ |
|
297 |
+ /* Add each 4 pixel group together and put 4 results into sad. */ |
|
298 |
+ sad = vec_sum4s(t5, sad); |
|
299 |
+ |
|
300 |
+ pix1 += line_size; |
|
301 |
+ pix2 += line_size; |
|
302 |
+ } |
|
303 |
+ |
|
304 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
305 |
+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
|
306 |
+ sumdiffs = vec_splat(sumdiffs, 3); |
|
307 |
+ vec_ste(sumdiffs, 0, &s); |
|
308 |
+ |
|
309 |
+ return s; |
|
310 |
+} |
|
311 |
+ |
|
312 |
+/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. |
|
313 |
+ * It's the sad8_altivec code above w/ squaring added. */ |
|
314 |
+static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
315 |
+ int line_size, int h) |
|
316 |
+{ |
|
317 |
+ int i, s; |
|
318 |
+ const vector unsigned int zero = |
|
319 |
+ (const vector unsigned int) vec_splat_u32(0); |
|
320 |
+ const vector unsigned char permclear = |
|
321 |
+ (vector unsigned char) |
|
322 |
+ { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
|
323 |
+ vector unsigned char perm1 = vec_lvsl(0, pix1); |
|
324 |
+ vector unsigned char perm2 = vec_lvsl(0, pix2); |
|
325 |
+ vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
326 |
+ vector signed int sumsqr; |
|
327 |
+ |
|
328 |
+ for (i = 0; i < h; i++) { |
|
329 |
+ /* Read potentially unaligned pixels into t1 and t2. |
|
330 |
+ * Since we're reading 16 pixels, and actually only want 8, |
|
331 |
+ * mask out the last 8 pixels. The 0s don't change the sum. */ |
|
332 |
+ vector unsigned char pix1l = vec_ld(0, pix1); |
|
333 |
+ vector unsigned char pix1r = vec_ld(7, pix1); |
|
334 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
335 |
+ vector unsigned char pix2r = vec_ld(7, pix2); |
|
336 |
+ vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
|
337 |
+ permclear); |
|
338 |
+ vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
|
339 |
+ permclear); |
|
340 |
+ |
|
341 |
+ /* Since we want to use unsigned chars, we can take advantage |
|
342 |
+ * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
343 |
+ |
|
344 |
+ /* Calculate abs differences vector. */ |
|
345 |
+ vector unsigned char t3 = vec_max(t1, t2); |
|
346 |
+ vector unsigned char t4 = vec_min(t1, t2); |
|
347 |
+ vector unsigned char t5 = vec_sub(t3, t4); |
|
348 |
+ |
|
349 |
+ /* Square the values and add them to our sum. */ |
|
350 |
+ sum = vec_msum(t5, t5, sum); |
|
351 |
+ |
|
352 |
+ pix1 += line_size; |
|
353 |
+ pix2 += line_size; |
|
354 |
+ } |
|
355 |
+ |
|
356 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
357 |
+ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
358 |
+ sumsqr = vec_splat(sumsqr, 3); |
|
359 |
+ vec_ste(sumsqr, 0, &s); |
|
360 |
+ |
|
361 |
+ return s; |
|
362 |
+} |
|
363 |
+ |
|
364 |
+/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. |
|
365 |
+ * It's the sad16_altivec code above w/ squaring added. */ |
|
366 |
+static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
367 |
+ int line_size, int h) |
|
368 |
+{ |
|
369 |
+ int i, s; |
|
370 |
+ const vector unsigned int zero = |
|
371 |
+ (const vector unsigned int) vec_splat_u32(0); |
|
372 |
+ vector unsigned char perm = vec_lvsl(0, pix2); |
|
373 |
+ vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
|
374 |
+ vector signed int sumsqr; |
|
375 |
+ |
|
376 |
+ for (i = 0; i < h; i++) { |
|
377 |
+ /* Read potentially unaligned pixels into t1 and t2. */ |
|
378 |
+ vector unsigned char pix2l = vec_ld(0, pix2); |
|
379 |
+ vector unsigned char pix2r = vec_ld(15, pix2); |
|
380 |
+ vector unsigned char t1 = vec_ld(0, pix1); |
|
381 |
+ vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
|
382 |
+ |
|
383 |
+ /* Since we want to use unsigned chars, we can take advantage |
|
384 |
+ * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ |
|
385 |
+ |
|
386 |
+ /* Calculate abs differences vector. */ |
|
387 |
+ vector unsigned char t3 = vec_max(t1, t2); |
|
388 |
+ vector unsigned char t4 = vec_min(t1, t2); |
|
389 |
+ vector unsigned char t5 = vec_sub(t3, t4); |
|
390 |
+ |
|
391 |
+ /* Square the values and add them to our sum. */ |
|
392 |
+ sum = vec_msum(t5, t5, sum); |
|
393 |
+ |
|
394 |
+ pix1 += line_size; |
|
395 |
+ pix2 += line_size; |
|
396 |
+ } |
|
397 |
+ |
|
398 |
+ /* Sum up the four partial sums, and put the result into s. */ |
|
399 |
+ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
|
400 |
+ sumsqr = vec_splat(sumsqr, 3); |
|
401 |
+ vec_ste(sumsqr, 0, &s); |
|
402 |
+ |
|
403 |
+ return s; |
|
404 |
+} |
|
405 |
+ |
|
406 |
+static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
407 |
+ uint8_t *src, int stride, int h) |
|
408 |
+{ |
|
409 |
+ int sum; |
|
410 |
+ register const vector unsigned char vzero = |
|
411 |
+ (const vector unsigned char) vec_splat_u8(0); |
|
412 |
+ register vector signed short temp0, temp1, temp2, temp3, temp4, |
|
413 |
+ temp5, temp6, temp7; |
|
414 |
+ { |
|
415 |
+ register const vector signed short vprod1 = |
|
416 |
+ (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
417 |
+ register const vector signed short vprod2 = |
|
418 |
+ (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
419 |
+ register const vector signed short vprod3 = |
|
420 |
+ (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
421 |
+ register const vector unsigned char perm1 = |
|
422 |
+ (const vector unsigned char) |
|
423 |
+ { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
424 |
+ 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
425 |
+ register const vector unsigned char perm2 = |
|
426 |
+ (const vector unsigned char) |
|
427 |
+ { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
428 |
+ 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
429 |
+ register const vector unsigned char perm3 = |
|
430 |
+ (const vector unsigned char) |
|
431 |
+ { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
432 |
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
433 |
+ |
|
434 |
+#define ONEITERBUTTERFLY(i, res) \ |
|
435 |
+ { \ |
|
436 |
+ register vector unsigned char src1 = vec_ld(stride * i, src); \ |
|
437 |
+ register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ |
|
438 |
+ register vector unsigned char srcO = \ |
|
439 |
+ vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
440 |
+ register vector unsigned char dst1 = vec_ld(stride * i, dst); \ |
|
441 |
+ register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ |
|
442 |
+ register vector unsigned char dstO = \ |
|
443 |
+ vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
444 |
+ \ |
|
445 |
+ /* Promote the unsigned chars to signed shorts. */ \ |
|
446 |
+ /* We're in the 8x8 function, we only care for the first 8. */ \ |
|
447 |
+ register vector signed short srcV = \ |
|
448 |
+ (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
449 |
+ (vector signed char) srcO); \ |
|
450 |
+ register vector signed short dstV = \ |
|
451 |
+ (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
452 |
+ (vector signed char) dstO); \ |
|
453 |
+ \ |
|
454 |
+ /* subtractions inside the first butterfly */ \ |
|
455 |
+ register vector signed short but0 = vec_sub(srcV, dstV); \ |
|
456 |
+ register vector signed short op1 = vec_perm(but0, but0, perm1); \ |
|
457 |
+ register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ |
|
458 |
+ register vector signed short op2 = vec_perm(but1, but1, perm2); \ |
|
459 |
+ register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ |
|
460 |
+ register vector signed short op3 = vec_perm(but2, but2, perm3); \ |
|
461 |
+ res = vec_mladd(but2, vprod3, op3); \ |
|
462 |
+ } |
|
463 |
+ ONEITERBUTTERFLY(0, temp0); |
|
464 |
+ ONEITERBUTTERFLY(1, temp1); |
|
465 |
+ ONEITERBUTTERFLY(2, temp2); |
|
466 |
+ ONEITERBUTTERFLY(3, temp3); |
|
467 |
+ ONEITERBUTTERFLY(4, temp4); |
|
468 |
+ ONEITERBUTTERFLY(5, temp5); |
|
469 |
+ ONEITERBUTTERFLY(6, temp6); |
|
470 |
+ ONEITERBUTTERFLY(7, temp7); |
|
471 |
+ } |
|
472 |
+#undef ONEITERBUTTERFLY |
|
473 |
+ { |
|
474 |
+ register vector signed int vsum; |
|
475 |
+ register vector signed short line0 = vec_add(temp0, temp1); |
|
476 |
+ register vector signed short line1 = vec_sub(temp0, temp1); |
|
477 |
+ register vector signed short line2 = vec_add(temp2, temp3); |
|
478 |
+ register vector signed short line3 = vec_sub(temp2, temp3); |
|
479 |
+ register vector signed short line4 = vec_add(temp4, temp5); |
|
480 |
+ register vector signed short line5 = vec_sub(temp4, temp5); |
|
481 |
+ register vector signed short line6 = vec_add(temp6, temp7); |
|
482 |
+ register vector signed short line7 = vec_sub(temp6, temp7); |
|
483 |
+ |
|
484 |
+ register vector signed short line0B = vec_add(line0, line2); |
|
485 |
+ register vector signed short line2B = vec_sub(line0, line2); |
|
486 |
+ register vector signed short line1B = vec_add(line1, line3); |
|
487 |
+ register vector signed short line3B = vec_sub(line1, line3); |
|
488 |
+ register vector signed short line4B = vec_add(line4, line6); |
|
489 |
+ register vector signed short line6B = vec_sub(line4, line6); |
|
490 |
+ register vector signed short line5B = vec_add(line5, line7); |
|
491 |
+ register vector signed short line7B = vec_sub(line5, line7); |
|
492 |
+ |
|
493 |
+ register vector signed short line0C = vec_add(line0B, line4B); |
|
494 |
+ register vector signed short line4C = vec_sub(line0B, line4B); |
|
495 |
+ register vector signed short line1C = vec_add(line1B, line5B); |
|
496 |
+ register vector signed short line5C = vec_sub(line1B, line5B); |
|
497 |
+ register vector signed short line2C = vec_add(line2B, line6B); |
|
498 |
+ register vector signed short line6C = vec_sub(line2B, line6B); |
|
499 |
+ register vector signed short line3C = vec_add(line3B, line7B); |
|
500 |
+ register vector signed short line7C = vec_sub(line3B, line7B); |
|
501 |
+ |
|
502 |
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
|
503 |
+ vsum = vec_sum4s(vec_abs(line1C), vsum); |
|
504 |
+ vsum = vec_sum4s(vec_abs(line2C), vsum); |
|
505 |
+ vsum = vec_sum4s(vec_abs(line3C), vsum); |
|
506 |
+ vsum = vec_sum4s(vec_abs(line4C), vsum); |
|
507 |
+ vsum = vec_sum4s(vec_abs(line5C), vsum); |
|
508 |
+ vsum = vec_sum4s(vec_abs(line6C), vsum); |
|
509 |
+ vsum = vec_sum4s(vec_abs(line7C), vsum); |
|
510 |
+ vsum = vec_sums(vsum, (vector signed int) vzero); |
|
511 |
+ vsum = vec_splat(vsum, 3); |
|
512 |
+ vec_ste(vsum, 0, &sum); |
|
513 |
+ } |
|
514 |
+ return sum; |
|
515 |
+} |
|
516 |
+ |
|
517 |
+/* |
|
518 |
+ * 16x8 works with 16 elements; it allows to avoid replicating loads, and |
|
519 |
+ * gives the compiler more room for scheduling. It's only used from |
|
520 |
+ * inside hadamard8_diff16_altivec. |
|
521 |
+ * |
|
522 |
+ * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has |
|
523 |
+ * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in |
|
524 |
+ * registers by itself. The following code includes hand-made register |
|
525 |
+ * allocation. It's not clean, but on a 7450 the resulting code is much faster |
|
526 |
+ * (best case falls from 700+ cycles to 550). |
|
527 |
+ * |
|
528 |
+ * xlc doesn't add spill code, but it doesn't know how to schedule for the |
|
529 |
+ * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses |
|
530 |
+ * 25% fewer instructions...) |
|
531 |
+ * |
|
532 |
+ * On the 970, the hand-made RA is still a win (around 690 vs. around 780), |
|
533 |
+ * but xlc goes to around 660 on the regular C code... |
|
534 |
+ */ |
|
535 |
+static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, |
|
536 |
+ uint8_t *src, int stride, int h) |
|
537 |
+{ |
|
538 |
+ int sum; |
|
539 |
+ register vector signed short |
|
540 |
+ temp0 __asm__ ("v0"), |
|
541 |
+ temp1 __asm__ ("v1"), |
|
542 |
+ temp2 __asm__ ("v2"), |
|
543 |
+ temp3 __asm__ ("v3"), |
|
544 |
+ temp4 __asm__ ("v4"), |
|
545 |
+ temp5 __asm__ ("v5"), |
|
546 |
+ temp6 __asm__ ("v6"), |
|
547 |
+ temp7 __asm__ ("v7"); |
|
548 |
+ register vector signed short |
|
549 |
+ temp0S __asm__ ("v8"), |
|
550 |
+ temp1S __asm__ ("v9"), |
|
551 |
+ temp2S __asm__ ("v10"), |
|
552 |
+ temp3S __asm__ ("v11"), |
|
553 |
+ temp4S __asm__ ("v12"), |
|
554 |
+ temp5S __asm__ ("v13"), |
|
555 |
+ temp6S __asm__ ("v14"), |
|
556 |
+ temp7S __asm__ ("v15"); |
|
557 |
+ register const vector unsigned char vzero __asm__ ("v31") = |
|
558 |
+ (const vector unsigned char) vec_splat_u8(0); |
|
559 |
+ { |
|
560 |
+ register const vector signed short vprod1 __asm__ ("v16") = |
|
561 |
+ (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
|
562 |
+ |
|
563 |
+ register const vector signed short vprod2 __asm__ ("v17") = |
|
564 |
+ (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
|
565 |
+ |
|
566 |
+ register const vector signed short vprod3 __asm__ ("v18") = |
|
567 |
+ (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
|
568 |
+ |
|
569 |
+ register const vector unsigned char perm1 __asm__ ("v19") = |
|
570 |
+ (const vector unsigned char) |
|
571 |
+ { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
|
572 |
+ 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
|
573 |
+ |
|
574 |
+ register const vector unsigned char perm2 __asm__ ("v20") = |
|
575 |
+ (const vector unsigned char) |
|
576 |
+ { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
|
577 |
+ 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
|
578 |
+ |
|
579 |
+ register const vector unsigned char perm3 __asm__ ("v21") = |
|
580 |
+ (const vector unsigned char) |
|
581 |
+ { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
|
582 |
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
|
583 |
+ |
|
584 |
+#define ONEITERBUTTERFLY(i, res1, res2) \ |
|
585 |
+ { \ |
|
586 |
+ register vector unsigned char src1 __asm__ ("v22") = \ |
|
587 |
+ vec_ld(stride * i, src); \ |
|
588 |
+ register vector unsigned char src2 __asm__ ("v23") = \ |
|
589 |
+ vec_ld(stride * i + 16, src); \ |
|
590 |
+ register vector unsigned char srcO __asm__ ("v22") = \ |
|
591 |
+ vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
|
592 |
+ register vector unsigned char dst1 __asm__ ("v24") = \ |
|
593 |
+ vec_ld(stride * i, dst); \ |
|
594 |
+ register vector unsigned char dst2 __asm__ ("v25") = \ |
|
595 |
+ vec_ld(stride * i + 16, dst); \ |
|
596 |
+ register vector unsigned char dstO __asm__ ("v23") = \ |
|
597 |
+ vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
|
598 |
+ \ |
|
599 |
+ /* Promote the unsigned chars to signed shorts. */ \ |
|
600 |
+ register vector signed short srcV __asm__ ("v24") = \ |
|
601 |
+ (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
602 |
+ (vector signed char) srcO); \ |
|
603 |
+ register vector signed short dstV __asm__ ("v25") = \ |
|
604 |
+ (vector signed short) vec_mergeh((vector signed char) vzero, \ |
|
605 |
+ (vector signed char) dstO); \ |
|
606 |
+ register vector signed short srcW __asm__ ("v26") = \ |
|
607 |
+ (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
608 |
+ (vector signed char) srcO); \ |
|
609 |
+ register vector signed short dstW __asm__ ("v27") = \ |
|
610 |
+ (vector signed short) vec_mergel((vector signed char) vzero, \ |
|
611 |
+ (vector signed char) dstO); \ |
|
612 |
+ \ |
|
613 |
+ /* subtractions inside the first butterfly */ \ |
|
614 |
+ register vector signed short but0 __asm__ ("v28") = \ |
|
615 |
+ vec_sub(srcV, dstV); \ |
|
616 |
+ register vector signed short but0S __asm__ ("v29") = \ |
|
617 |
+ vec_sub(srcW, dstW); \ |
|
618 |
+ register vector signed short op1 __asm__ ("v30") = \ |
|
619 |
+ vec_perm(but0, but0, perm1); \ |
|
620 |
+ register vector signed short but1 __asm__ ("v22") = \ |
|
621 |
+ vec_mladd(but0, vprod1, op1); \ |
|
622 |
+ register vector signed short op1S __asm__ ("v23") = \ |
|
623 |
+ vec_perm(but0S, but0S, perm1); \ |
|
624 |
+ register vector signed short but1S __asm__ ("v24") = \ |
|
625 |
+ vec_mladd(but0S, vprod1, op1S); \ |
|
626 |
+ register vector signed short op2 __asm__ ("v25") = \ |
|
627 |
+ vec_perm(but1, but1, perm2); \ |
|
628 |
+ register vector signed short but2 __asm__ ("v26") = \ |
|
629 |
+ vec_mladd(but1, vprod2, op2); \ |
|
630 |
+ register vector signed short op2S __asm__ ("v27") = \ |
|
631 |
+ vec_perm(but1S, but1S, perm2); \ |
|
632 |
+ register vector signed short but2S __asm__ ("v28") = \ |
|
633 |
+ vec_mladd(but1S, vprod2, op2S); \ |
|
634 |
+ register vector signed short op3 __asm__ ("v29") = \ |
|
635 |
+ vec_perm(but2, but2, perm3); \ |
|
636 |
+ register vector signed short op3S __asm__ ("v30") = \ |
|
637 |
+ vec_perm(but2S, but2S, perm3); \ |
|
638 |
+ res1 = vec_mladd(but2, vprod3, op3); \ |
|
639 |
+ res2 = vec_mladd(but2S, vprod3, op3S); \ |
|
640 |
+ } |
|
641 |
+ ONEITERBUTTERFLY(0, temp0, temp0S); |
|
642 |
+ ONEITERBUTTERFLY(1, temp1, temp1S); |
|
643 |
+ ONEITERBUTTERFLY(2, temp2, temp2S); |
|
644 |
+ ONEITERBUTTERFLY(3, temp3, temp3S); |
|
645 |
+ ONEITERBUTTERFLY(4, temp4, temp4S); |
|
646 |
+ ONEITERBUTTERFLY(5, temp5, temp5S); |
|
647 |
+ ONEITERBUTTERFLY(6, temp6, temp6S); |
|
648 |
+ ONEITERBUTTERFLY(7, temp7, temp7S); |
|
649 |
+ } |
|
650 |
+#undef ONEITERBUTTERFLY |
|
651 |
+ { |
|
652 |
+ register vector signed int vsum; |
|
653 |
+ |
|
654 |
+ register vector signed short line0 = vec_add(temp0, temp1); |
|
655 |
+ register vector signed short line1 = vec_sub(temp0, temp1); |
|
656 |
+ register vector signed short line2 = vec_add(temp2, temp3); |
|
657 |
+ register vector signed short line3 = vec_sub(temp2, temp3); |
|
658 |
+ register vector signed short line4 = vec_add(temp4, temp5); |
|
659 |
+ register vector signed short line5 = vec_sub(temp4, temp5); |
|
660 |
+ register vector signed short line6 = vec_add(temp6, temp7); |
|
661 |
+ register vector signed short line7 = vec_sub(temp6, temp7); |
|
662 |
+ |
|
663 |
+ register vector signed short line0B = vec_add(line0, line2); |
|
664 |
+ register vector signed short line2B = vec_sub(line0, line2); |
|
665 |
+ register vector signed short line1B = vec_add(line1, line3); |
|
666 |
+ register vector signed short line3B = vec_sub(line1, line3); |
|
667 |
+ register vector signed short line4B = vec_add(line4, line6); |
|
668 |
+ register vector signed short line6B = vec_sub(line4, line6); |
|
669 |
+ register vector signed short line5B = vec_add(line5, line7); |
|
670 |
+ register vector signed short line7B = vec_sub(line5, line7); |
|
671 |
+ |
|
672 |
+ register vector signed short line0C = vec_add(line0B, line4B); |
|
673 |
+ register vector signed short line4C = vec_sub(line0B, line4B); |
|
674 |
+ register vector signed short line1C = vec_add(line1B, line5B); |
|
675 |
+ register vector signed short line5C = vec_sub(line1B, line5B); |
|
676 |
+ register vector signed short line2C = vec_add(line2B, line6B); |
|
677 |
+ register vector signed short line6C = vec_sub(line2B, line6B); |
|
678 |
+ register vector signed short line3C = vec_add(line3B, line7B); |
|
679 |
+ register vector signed short line7C = vec_sub(line3B, line7B); |
|
680 |
+ |
|
681 |
+ register vector signed short line0S = vec_add(temp0S, temp1S); |
|
682 |
+ register vector signed short line1S = vec_sub(temp0S, temp1S); |
|
683 |
+ register vector signed short line2S = vec_add(temp2S, temp3S); |
|
684 |
+ register vector signed short line3S = vec_sub(temp2S, temp3S); |
|
685 |
+ register vector signed short line4S = vec_add(temp4S, temp5S); |
|
686 |
+ register vector signed short line5S = vec_sub(temp4S, temp5S); |
|
687 |
+ register vector signed short line6S = vec_add(temp6S, temp7S); |
|
688 |
+ register vector signed short line7S = vec_sub(temp6S, temp7S); |
|
689 |
+ |
|
690 |
+ register vector signed short line0BS = vec_add(line0S, line2S); |
|
691 |
+ register vector signed short line2BS = vec_sub(line0S, line2S); |
|
692 |
+ register vector signed short line1BS = vec_add(line1S, line3S); |
|
693 |
+ register vector signed short line3BS = vec_sub(line1S, line3S); |
|
694 |
+ register vector signed short line4BS = vec_add(line4S, line6S); |
|
695 |
+ register vector signed short line6BS = vec_sub(line4S, line6S); |
|
696 |
+ register vector signed short line5BS = vec_add(line5S, line7S); |
|
697 |
+ register vector signed short line7BS = vec_sub(line5S, line7S); |
|
698 |
+ |
|
699 |
+ register vector signed short line0CS = vec_add(line0BS, line4BS); |
|
700 |
+ register vector signed short line4CS = vec_sub(line0BS, line4BS); |
|
701 |
+ register vector signed short line1CS = vec_add(line1BS, line5BS); |
|
702 |
+ register vector signed short line5CS = vec_sub(line1BS, line5BS); |
|
703 |
+ register vector signed short line2CS = vec_add(line2BS, line6BS); |
|
704 |
+ register vector signed short line6CS = vec_sub(line2BS, line6BS); |
|
705 |
+ register vector signed short line3CS = vec_add(line3BS, line7BS); |
|
706 |
+ register vector signed short line7CS = vec_sub(line3BS, line7BS); |
|
707 |
+ |
|
708 |
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
|
709 |
+ vsum = vec_sum4s(vec_abs(line1C), vsum); |
|
710 |
+ vsum = vec_sum4s(vec_abs(line2C), vsum); |
|
711 |
+ vsum = vec_sum4s(vec_abs(line3C), vsum); |
|
712 |
+ vsum = vec_sum4s(vec_abs(line4C), vsum); |
|
713 |
+ vsum = vec_sum4s(vec_abs(line5C), vsum); |
|
714 |
+ vsum = vec_sum4s(vec_abs(line6C), vsum); |
|
715 |
+ vsum = vec_sum4s(vec_abs(line7C), vsum); |
|
716 |
+ |
|
717 |
+ vsum = vec_sum4s(vec_abs(line0CS), vsum); |
|
718 |
+ vsum = vec_sum4s(vec_abs(line1CS), vsum); |
|
719 |
+ vsum = vec_sum4s(vec_abs(line2CS), vsum); |
|
720 |
+ vsum = vec_sum4s(vec_abs(line3CS), vsum); |
|
721 |
+ vsum = vec_sum4s(vec_abs(line4CS), vsum); |
|
722 |
+ vsum = vec_sum4s(vec_abs(line5CS), vsum); |
|
723 |
+ vsum = vec_sum4s(vec_abs(line6CS), vsum); |
|
724 |
+ vsum = vec_sum4s(vec_abs(line7CS), vsum); |
|
725 |
+ vsum = vec_sums(vsum, (vector signed int) vzero); |
|
726 |
+ vsum = vec_splat(vsum, 3); |
|
727 |
+ vec_ste(vsum, 0, &sum); |
|
728 |
+ } |
|
729 |
+ return sum; |
|
730 |
+} |
|
731 |
+ |
|
732 |
+static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, |
|
733 |
+ uint8_t *src, int stride, int h) |
|
734 |
+{ |
|
735 |
+ int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
|
736 |
+ |
|
737 |
+ if (h == 16) { |
|
738 |
+ dst += 8 * stride; |
|
739 |
+ src += 8 * stride; |
|
740 |
+ score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
|
741 |
+ } |
|
742 |
+ return score; |
|
743 |
+} |
|
744 |
+#endif /* HAVE_ALTIVEC */ |
|
745 |
+ |
|
746 |
+av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx) |
|
747 |
+{ |
|
748 |
+#if HAVE_ALTIVEC |
|
749 |
+ if (!PPC_ALTIVEC(av_get_cpu_flags())) |
|
750 |
+ return; |
|
751 |
+ |
|
752 |
+ c->pix_abs[0][1] = sad16_x2_altivec; |
|
753 |
+ c->pix_abs[0][2] = sad16_y2_altivec; |
|
754 |
+ c->pix_abs[0][3] = sad16_xy2_altivec; |
|
755 |
+ c->pix_abs[0][0] = sad16_altivec; |
|
756 |
+ c->pix_abs[1][0] = sad8_altivec; |
|
757 |
+ |
|
758 |
+ c->sad[0] = sad16_altivec; |
|
759 |
+ c->sad[1] = sad8_altivec; |
|
760 |
+ c->sse[0] = sse16_altivec; |
|
761 |
+ c->sse[1] = sse8_altivec; |
|
762 |
+ |
|
763 |
+ c->hadamard8_diff[0] = hadamard8_diff16_altivec; |
|
764 |
+ c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; |
|
765 |
+#endif /* HAVE_ALTIVEC */ |
|
766 |
+} |
... | ... |
@@ -22,7 +22,6 @@ |
22 | 22 |
#ifndef AVCODEC_PRORESDEC_H |
23 | 23 |
#define AVCODEC_PRORESDEC_H |
24 | 24 |
|
25 |
-#include "dsputil.h" |
|
26 | 25 |
#include "blockdsp.h" |
27 | 26 |
#include "proresdsp.h" |
28 | 27 |
|
... | ... |
@@ -36,7 +35,6 @@ typedef struct { |
36 | 36 |
} SliceContext; |
37 | 37 |
|
38 | 38 |
typedef struct { |
39 |
- DSPContext dsp; |
|
40 | 39 |
BlockDSPContext bdsp; |
41 | 40 |
ProresDSPContext prodsp; |
42 | 41 |
AVFrame *frame; |
... | ... |
@@ -22,7 +22,7 @@ |
22 | 22 |
#include "libavutil/log.h" |
23 | 23 |
#include "libavutil/opt.h" |
24 | 24 |
#include "avcodec.h" |
25 |
-#include "dsputil.h" |
|
25 |
+#include "me_cmp.h" |
|
26 | 26 |
#include "snow_dwt.h" |
27 | 27 |
#include "internal.h" |
28 | 28 |
#include "snow.h" |
... | ... |
@@ -435,7 +435,7 @@ av_cold int ff_snow_common_init(AVCodecContext *avctx){ |
435 | 435 |
s->avctx= avctx; |
436 | 436 |
s->max_ref_frames=1; //just make sure it's not an invalid value in case of no initial keyframe |
437 | 437 |
|
438 |
- ff_dsputil_init(&s->dsp, avctx); |
|
438 |
+ ff_me_cmp_init(&s->mecc, avctx); |
|
439 | 439 |
ff_hpeldsp_init(&s->hdsp, avctx->flags); |
440 | 440 |
ff_videodsp_init(&s->vdsp, 8); |
441 | 441 |
ff_dwt_init(&s->dwt); |
... | ... |
@@ -22,8 +22,8 @@ |
22 | 22 |
#ifndef AVCODEC_SNOW_H |
23 | 23 |
#define AVCODEC_SNOW_H |
24 | 24 |
|
25 |
-#include "dsputil.h" |
|
26 | 25 |
#include "hpeldsp.h" |
26 |
+#include "me_cmp.h" |
|
27 | 27 |
#include "qpeldsp.h" |
28 | 28 |
#include "snow_dwt.h" |
29 | 29 |
|
... | ... |
@@ -110,7 +110,7 @@ typedef struct SnowContext{ |
110 | 110 |
AVClass *class; |
111 | 111 |
AVCodecContext *avctx; |
112 | 112 |
RangeCoder c; |
113 |
- DSPContext dsp; |
|
113 |
+ MECmpContext mecc; |
|
114 | 114 |
HpelDSPContext hdsp; |
115 | 115 |
QpelDSPContext qdsp; |
116 | 116 |
VideoDSPContext vdsp; |
... | ... |
@@ -22,7 +22,7 @@ |
22 | 22 |
#include "libavutil/attributes.h" |
23 | 23 |
#include "libavutil/avassert.h" |
24 | 24 |
#include "libavutil/common.h" |
25 |
-#include "dsputil.h" |
|
25 |
+#include "me_cmp.h" |
|
26 | 26 |
#include "snow_dwt.h" |
27 | 27 |
|
28 | 28 |
int ff_slice_buffer_init(slice_buffer *buf, int line_count, |
... | ... |
@@ -844,7 +844,7 @@ int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line |
844 | 844 |
return w_c(v, pix1, pix2, line_size, 32, h, 0); |
845 | 845 |
} |
846 | 846 |
|
847 |
-void ff_dsputil_init_dwt(DSPContext *c) |
|
847 |
+void ff_dsputil_init_dwt(MECmpContext *c) |
|
848 | 848 |
{ |
849 | 849 |
c->w53[0] = w53_16_c; |
850 | 850 |
c->w53[1] = w53_8_c; |
... | ... |
@@ -22,7 +22,6 @@ |
22 | 22 |
#include "libavutil/log.h" |
23 | 23 |
#include "libavutil/opt.h" |
24 | 24 |
#include "avcodec.h" |
25 |
-#include "dsputil.h" |
|
26 | 25 |
#include "internal.h" |
27 | 26 |
#include "snow_dwt.h" |
28 | 27 |
#include "snow.h" |
... | ... |
@@ -121,8 +120,8 @@ static av_cold int encode_init(AVCodecContext *avctx) |
121 | 121 |
} |
122 | 122 |
avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift); |
123 | 123 |
|
124 |
- ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp); |
|
125 |
- ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp); |
|
124 |
+ ff_set_cmp(&s->mecc, s->mecc.me_cmp, s->avctx->me_cmp); |
|
125 |
+ ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, s->avctx->me_sub_cmp); |
|
126 | 126 |
|
127 | 127 |
s->input_picture = av_frame_alloc(); |
128 | 128 |
if (!s->input_picture) |
... | ... |
@@ -669,12 +668,12 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uin |
669 | 669 |
distortion = 0; |
670 | 670 |
for(i=0; i<4; i++){ |
671 | 671 |
int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride; |
672 |
- distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16); |
|
672 |
+ distortion += s->mecc.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16); |
|
673 | 673 |
} |
674 | 674 |
} |
675 | 675 |
}else{ |
676 | 676 |
av_assert2(block_w==8); |
677 |
- distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2); |
|
677 |
+ distortion = s->mecc.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2); |
|
678 | 678 |
} |
679 | 679 |
|
680 | 680 |
if(plane_index==0){ |
... | ... |
@@ -738,7 +737,7 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){ |
738 | 738 |
} |
739 | 739 |
|
740 | 740 |
av_assert1(block_w== 8 || block_w==16); |
741 |
- distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h); |
|
741 |
+ distortion += s->mecc.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h); |
|
742 | 742 |
} |
743 | 743 |
|
744 | 744 |
if(plane_index==0){ |
... | ... |
@@ -1660,12 +1659,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, |
1660 | 1660 |
s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7); |
1661 | 1661 |
s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT; |
1662 | 1662 |
|
1663 |
- s->m.dsp= s->dsp; //move |
|
1663 |
+ s->m.mecc= s->mecc; //move |
|
1664 | 1664 |
s->m.qdsp= s->qdsp; //move |
1665 | 1665 |
s->m.hdsp = s->hdsp; |
1666 | 1666 |
ff_init_me(&s->m); |
1667 | 1667 |
s->hdsp = s->m.hdsp; |
1668 |
- s->dsp= s->m.dsp; |
|
1668 |
+ s->mecc= s->m.mecc; |
|
1669 | 1669 |
} |
1670 | 1670 |
|
1671 | 1671 |
if(s->pass1_rc){ |
... | ... |
@@ -27,8 +27,8 @@ |
27 | 27 |
*/ |
28 | 28 |
|
29 | 29 |
#include "avcodec.h" |
30 |
-#include "dsputil.h" |
|
31 | 30 |
#include "hpeldsp.h" |
31 |
+#include "me_cmp.h" |
|
32 | 32 |
#include "mpegvideo.h" |
33 | 33 |
#include "h263.h" |
34 | 34 |
#include "internal.h" |
... | ... |
@@ -314,7 +314,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane, |
314 | 314 |
s->m.current_picture.motion_val[0] = s->motion_val8[plane] + 2; |
315 | 315 |
s->m.p_mv_table = s->motion_val16[plane] + |
316 | 316 |
s->m.mb_stride + 1; |
317 |
- s->m.dsp = s->dsp; // move |
|
317 |
+ s->m.mecc = s->mecc; // move |
|
318 | 318 |
ff_init_me(&s->m); |
319 | 319 |
|
320 | 320 |
s->m.me.dia_size = s->avctx->dia_size; |
... | ... |
@@ -437,8 +437,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane, |
437 | 437 |
best = score[1] <= score[0]; |
438 | 438 |
|
439 | 439 |
vlc = ff_svq1_block_type_vlc[SVQ1_BLOCK_SKIP]; |
440 |
- score[2] = s->dsp.sse[0](NULL, src + 16 * x, ref, |
|
441 |
- stride, 16); |
|
440 |
+ score[2] = s->mecc.sse[0](NULL, src + 16 * x, ref, |
|
441 |
+ stride, 16); |
|
442 | 442 |
score[2] += vlc[1] * lambda; |
443 | 443 |
if (score[2] < score[best] && mx == 0 && my == 0) { |
444 | 444 |
best = 2; |
... | ... |
@@ -515,8 +515,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) |
515 | 515 |
SVQ1EncContext *const s = avctx->priv_data; |
516 | 516 |
int ret; |
517 | 517 |
|
518 |
- ff_dsputil_init(&s->dsp, avctx); |
|
519 | 518 |
ff_hpeldsp_init(&s->hdsp, avctx->flags); |
519 |
+ ff_me_cmp_init(&s->mecc, avctx); |
|
520 | 520 |
ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx); |
521 | 521 |
|
522 | 522 |
avctx->coded_frame = av_frame_alloc(); |
... | ... |
@@ -25,9 +25,9 @@ |
25 | 25 |
|
26 | 26 |
#include "libavutil/frame.h" |
27 | 27 |
#include "avcodec.h" |
28 |
-#include "dsputil.h" |
|
29 | 28 |
#include "get_bits.h" |
30 | 29 |
#include "hpeldsp.h" |
30 |
+#include "me_cmp.h" |
|
31 | 31 |
#include "mpegvideo.h" |
32 | 32 |
#include "put_bits.h" |
33 | 33 |
|
... | ... |
@@ -37,7 +37,7 @@ typedef struct SVQ1EncContext { |
37 | 37 |
* of MpegEncContext, so this will be removed then. */ |
38 | 38 |
MpegEncContext m; |
39 | 39 |
AVCodecContext *avctx; |
40 |
- DSPContext dsp; |
|
40 |
+ MECmpContext mecc; |
|
41 | 41 |
HpelDSPContext hdsp; |
42 | 42 |
AVFrame *current_picture; |
43 | 43 |
AVFrame *last_picture; |
... | ... |
@@ -41,8 +41,8 @@ |
41 | 41 |
#include "libavutil/samplefmt.h" |
42 | 42 |
#include "libavutil/dict.h" |
43 | 43 |
#include "avcodec.h" |
44 |
-#include "dsputil.h" |
|
45 | 44 |
#include "libavutil/opt.h" |
45 |
+#include "me_cmp.h" |
|
46 | 46 |
#include "mpegvideo.h" |
47 | 47 |
#include "thread.h" |
48 | 48 |
#include "frame_thread_encoder.h" |
... | ... |
@@ -195,8 +195,8 @@ static av_cold void avcodec_init(void) |
195 | 195 |
return; |
196 | 196 |
initialized = 1; |
197 | 197 |
|
198 |
- if (CONFIG_DSPUTIL) |
|
199 |
- ff_dsputil_static_init(); |
|
198 |
+ if (CONFIG_ME_CMP) |
|
199 |
+ ff_me_cmp_init_static(); |
|
200 | 200 |
} |
201 | 201 |
|
202 | 202 |
int av_codec_is_encoder(const AVCodec *codec) |
... | ... |
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o |
6 | 6 |
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o |
7 | 7 |
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o |
8 | 8 |
OBJS-$(CONFIG_DCT) += x86/dct_init.o |
9 |
-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o |
|
10 | 9 |
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o |
11 | 10 |
OBJS-$(CONFIG_FFT) += x86/fft_init.o |
12 | 11 |
OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp_init.o |
... | ... |
@@ -24,6 +23,7 @@ OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o |
24 | 24 |
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o |
25 | 25 |
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o |
26 | 26 |
OBJS-$(CONFIG_LPC) += x86/lpc.o |
27 |
+OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o |
|
27 | 28 |
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o |
28 | 29 |
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \ |
29 | 30 |
x86/mpegvideodsp.o |
... | ... |
@@ -80,7 +80,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o |
80 | 80 |
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\ |
81 | 81 |
x86/dwt_yasm.o |
82 | 82 |
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o |
83 |
-YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputilenc.o |
|
84 | 83 |
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o |
85 | 84 |
YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o |
86 | 85 |
YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o |
... | ... |
@@ -107,6 +106,7 @@ YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o |
107 | 107 |
YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o |
108 | 108 |
YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o |
109 | 109 |
YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o |
110 |
+YASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o |
|
110 | 111 |
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o |
111 | 112 |
YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o |
112 | 113 |
YASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o |
113 | 114 |
deleted file mode 100644 |
... | ... |
@@ -1,845 +0,0 @@ |
1 |
-/* |
|
2 |
- * MMX optimized DSP utils |
|
3 |
- * Copyright (c) 2000, 2001 Fabrice Bellard |
|
4 |
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
- * |
|
6 |
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
7 |
- * |
|
8 |
- * This file is part of FFmpeg. |
|
9 |
- * |
|
10 |
- * FFmpeg is free software; you can redistribute it and/or |
|
11 |
- * modify it under the terms of the GNU Lesser General Public |
|
12 |
- * License as published by the Free Software Foundation; either |
|
13 |
- * version 2.1 of the License, or (at your option) any later version. |
|
14 |
- * |
|
15 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
16 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
17 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
18 |
- * Lesser General Public License for more details. |
|
19 |
- * |
|
20 |
- * You should have received a copy of the GNU Lesser General Public |
|
21 |
- * License along with FFmpeg; if not, write to the Free Software |
|
22 |
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
23 |
- */ |
|
24 |
- |
|
25 |
-#include "libavutil/attributes.h" |
|
26 |
-#include "libavutil/cpu.h" |
|
27 |
-#include "libavutil/x86/asm.h" |
|
28 |
-#include "libavutil/x86/cpu.h" |
|
29 |
-#include "libavcodec/dsputil.h" |
|
30 |
-#include "libavcodec/mpegvideo.h" |
|
31 |
- |
|
32 |
-int ff_sum_abs_dctelem_mmx(int16_t *block); |
|
33 |
-int ff_sum_abs_dctelem_mmxext(int16_t *block); |
|
34 |
-int ff_sum_abs_dctelem_sse2(int16_t *block); |
|
35 |
-int ff_sum_abs_dctelem_ssse3(int16_t *block); |
|
36 |
-int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
37 |
- int line_size, int h); |
|
38 |
-int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
39 |
- int line_size, int h); |
|
40 |
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
41 |
- int line_size, int h); |
|
42 |
-int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h); |
|
43 |
-int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h); |
|
44 |
- |
|
45 |
-#define hadamard_func(cpu) \ |
|
46 |
- int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ |
|
47 |
- uint8_t *src2, int stride, int h); \ |
|
48 |
- int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ |
|
49 |
- uint8_t *src2, int stride, int h); |
|
50 |
- |
|
51 |
-hadamard_func(mmx) |
|
52 |
-hadamard_func(mmxext) |
|
53 |
-hadamard_func(sse2) |
|
54 |
-hadamard_func(ssse3) |
|
55 |
- |
|
56 |
-#if HAVE_YASM |
|
57 |
-static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, |
|
58 |
- int line_size, int h) |
|
59 |
-{ |
|
60 |
- int score1, score2; |
|
61 |
- |
|
62 |
- if (c) |
|
63 |
- score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); |
|
64 |
- else |
|
65 |
- score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); |
|
66 |
- score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h) |
|
67 |
- - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h); |
|
68 |
- |
|
69 |
- if (c) |
|
70 |
- return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
71 |
- else |
|
72 |
- return score1 + FFABS(score2) * 8; |
|
73 |
-} |
|
74 |
- |
|
75 |
-static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, |
|
76 |
- int line_size, int h) |
|
77 |
-{ |
|
78 |
- int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); |
|
79 |
- int score2 = ff_hf_noise8_mmx(pix1, line_size, h) - |
|
80 |
- ff_hf_noise8_mmx(pix2, line_size, h); |
|
81 |
- |
|
82 |
- if (c) |
|
83 |
- return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
84 |
- else |
|
85 |
- return score1 + FFABS(score2) * 8; |
|
86 |
-} |
|
87 |
- |
|
88 |
-#endif /* HAVE_YASM */ |
|
89 |
- |
|
90 |
-#if HAVE_INLINE_ASM |
|
91 |
- |
|
92 |
-static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, |
|
93 |
- int line_size, int h) |
|
94 |
-{ |
|
95 |
- int tmp; |
|
96 |
- |
|
97 |
- av_assert2((((int) pix) & 7) == 0); |
|
98 |
- av_assert2((line_size & 7) == 0); |
|
99 |
- |
|
100 |
-#define SUM(in0, in1, out0, out1) \ |
|
101 |
- "movq (%0), %%mm2\n" \ |
|
102 |
- "movq 8(%0), %%mm3\n" \ |
|
103 |
- "add %2,%0\n" \ |
|
104 |
- "movq %%mm2, " #out0 "\n" \ |
|
105 |
- "movq %%mm3, " #out1 "\n" \ |
|
106 |
- "psubusb " #in0 ", %%mm2\n" \ |
|
107 |
- "psubusb " #in1 ", %%mm3\n" \ |
|
108 |
- "psubusb " #out0 ", " #in0 "\n" \ |
|
109 |
- "psubusb " #out1 ", " #in1 "\n" \ |
|
110 |
- "por %%mm2, " #in0 "\n" \ |
|
111 |
- "por %%mm3, " #in1 "\n" \ |
|
112 |
- "movq " #in0 ", %%mm2\n" \ |
|
113 |
- "movq " #in1 ", %%mm3\n" \ |
|
114 |
- "punpcklbw %%mm7, " #in0 "\n" \ |
|
115 |
- "punpcklbw %%mm7, " #in1 "\n" \ |
|
116 |
- "punpckhbw %%mm7, %%mm2\n" \ |
|
117 |
- "punpckhbw %%mm7, %%mm3\n" \ |
|
118 |
- "paddw " #in1 ", " #in0 "\n" \ |
|
119 |
- "paddw %%mm3, %%mm2\n" \ |
|
120 |
- "paddw %%mm2, " #in0 "\n" \ |
|
121 |
- "paddw " #in0 ", %%mm6\n" |
|
122 |
- |
|
123 |
- |
|
124 |
- __asm__ volatile ( |
|
125 |
- "movl %3, %%ecx\n" |
|
126 |
- "pxor %%mm6, %%mm6\n" |
|
127 |
- "pxor %%mm7, %%mm7\n" |
|
128 |
- "movq (%0), %%mm0\n" |
|
129 |
- "movq 8(%0), %%mm1\n" |
|
130 |
- "add %2, %0\n" |
|
131 |
- "jmp 2f\n" |
|
132 |
- "1:\n" |
|
133 |
- |
|
134 |
- SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
135 |
- "2:\n" |
|
136 |
- SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
137 |
- |
|
138 |
- "subl $2, %%ecx\n" |
|
139 |
- "jnz 1b\n" |
|
140 |
- |
|
141 |
- "movq %%mm6, %%mm0\n" |
|
142 |
- "psrlq $32, %%mm6\n" |
|
143 |
- "paddw %%mm6, %%mm0\n" |
|
144 |
- "movq %%mm0, %%mm6\n" |
|
145 |
- "psrlq $16, %%mm0\n" |
|
146 |
- "paddw %%mm6, %%mm0\n" |
|
147 |
- "movd %%mm0, %1\n" |
|
148 |
- : "+r" (pix), "=r" (tmp) |
|
149 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
150 |
- : "%ecx"); |
|
151 |
- |
|
152 |
- return tmp & 0xFFFF; |
|
153 |
-} |
|
154 |
-#undef SUM |
|
155 |
- |
|
156 |
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, |
|
157 |
- int line_size, int h) |
|
158 |
-{ |
|
159 |
- int tmp; |
|
160 |
- |
|
161 |
- av_assert2((((int) pix) & 7) == 0); |
|
162 |
- av_assert2((line_size & 7) == 0); |
|
163 |
- |
|
164 |
-#define SUM(in0, in1, out0, out1) \ |
|
165 |
- "movq (%0), " #out0 "\n" \ |
|
166 |
- "movq 8(%0), " #out1 "\n" \ |
|
167 |
- "add %2, %0\n" \ |
|
168 |
- "psadbw " #out0 ", " #in0 "\n" \ |
|
169 |
- "psadbw " #out1 ", " #in1 "\n" \ |
|
170 |
- "paddw " #in1 ", " #in0 "\n" \ |
|
171 |
- "paddw " #in0 ", %%mm6\n" |
|
172 |
- |
|
173 |
- __asm__ volatile ( |
|
174 |
- "movl %3, %%ecx\n" |
|
175 |
- "pxor %%mm6, %%mm6\n" |
|
176 |
- "pxor %%mm7, %%mm7\n" |
|
177 |
- "movq (%0), %%mm0\n" |
|
178 |
- "movq 8(%0), %%mm1\n" |
|
179 |
- "add %2, %0\n" |
|
180 |
- "jmp 2f\n" |
|
181 |
- "1:\n" |
|
182 |
- |
|
183 |
- SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
184 |
- "2:\n" |
|
185 |
- SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
186 |
- |
|
187 |
- "subl $2, %%ecx\n" |
|
188 |
- "jnz 1b\n" |
|
189 |
- |
|
190 |
- "movd %%mm6, %1\n" |
|
191 |
- : "+r" (pix), "=r" (tmp) |
|
192 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
193 |
- : "%ecx"); |
|
194 |
- |
|
195 |
- return tmp; |
|
196 |
-} |
|
197 |
-#undef SUM |
|
198 |
- |
|
199 |
-static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
200 |
- int line_size, int h) |
|
201 |
-{ |
|
202 |
- int tmp; |
|
203 |
- |
|
204 |
- av_assert2((((int) pix1) & 7) == 0); |
|
205 |
- av_assert2((((int) pix2) & 7) == 0); |
|
206 |
- av_assert2((line_size & 7) == 0); |
|
207 |
- |
|
208 |
-#define SUM(in0, in1, out0, out1) \ |
|
209 |
- "movq (%0), %%mm2\n" \ |
|
210 |
- "movq (%1), " #out0 "\n" \ |
|
211 |
- "movq 8(%0), %%mm3\n" \ |
|
212 |
- "movq 8(%1), " #out1 "\n" \ |
|
213 |
- "add %3, %0\n" \ |
|
214 |
- "add %3, %1\n" \ |
|
215 |
- "psubb " #out0 ", %%mm2\n" \ |
|
216 |
- "psubb " #out1 ", %%mm3\n" \ |
|
217 |
- "pxor %%mm7, %%mm2\n" \ |
|
218 |
- "pxor %%mm7, %%mm3\n" \ |
|
219 |
- "movq %%mm2, " #out0 "\n" \ |
|
220 |
- "movq %%mm3, " #out1 "\n" \ |
|
221 |
- "psubusb " #in0 ", %%mm2\n" \ |
|
222 |
- "psubusb " #in1 ", %%mm3\n" \ |
|
223 |
- "psubusb " #out0 ", " #in0 "\n" \ |
|
224 |
- "psubusb " #out1 ", " #in1 "\n" \ |
|
225 |
- "por %%mm2, " #in0 "\n" \ |
|
226 |
- "por %%mm3, " #in1 "\n" \ |
|
227 |
- "movq " #in0 ", %%mm2\n" \ |
|
228 |
- "movq " #in1 ", %%mm3\n" \ |
|
229 |
- "punpcklbw %%mm7, " #in0 "\n" \ |
|
230 |
- "punpcklbw %%mm7, " #in1 "\n" \ |
|
231 |
- "punpckhbw %%mm7, %%mm2\n" \ |
|
232 |
- "punpckhbw %%mm7, %%mm3\n" \ |
|
233 |
- "paddw " #in1 ", " #in0 "\n" \ |
|
234 |
- "paddw %%mm3, %%mm2\n" \ |
|
235 |
- "paddw %%mm2, " #in0 "\n" \ |
|
236 |
- "paddw " #in0 ", %%mm6\n" |
|
237 |
- |
|
238 |
- |
|
239 |
- __asm__ volatile ( |
|
240 |
- "movl %4, %%ecx\n" |
|
241 |
- "pxor %%mm6, %%mm6\n" |
|
242 |
- "pcmpeqw %%mm7, %%mm7\n" |
|
243 |
- "psllw $15, %%mm7\n" |
|
244 |
- "packsswb %%mm7, %%mm7\n" |
|
245 |
- "movq (%0), %%mm0\n" |
|
246 |
- "movq (%1), %%mm2\n" |
|
247 |
- "movq 8(%0), %%mm1\n" |
|
248 |
- "movq 8(%1), %%mm3\n" |
|
249 |
- "add %3, %0\n" |
|
250 |
- "add %3, %1\n" |
|
251 |
- "psubb %%mm2, %%mm0\n" |
|
252 |
- "psubb %%mm3, %%mm1\n" |
|
253 |
- "pxor %%mm7, %%mm0\n" |
|
254 |
- "pxor %%mm7, %%mm1\n" |
|
255 |
- "jmp 2f\n" |
|
256 |
- "1:\n" |
|
257 |
- |
|
258 |
- SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
259 |
- "2:\n" |
|
260 |
- SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
261 |
- |
|
262 |
- "subl $2, %%ecx\n" |
|
263 |
- "jnz 1b\n" |
|
264 |
- |
|
265 |
- "movq %%mm6, %%mm0\n" |
|
266 |
- "psrlq $32, %%mm6\n" |
|
267 |
- "paddw %%mm6, %%mm0\n" |
|
268 |
- "movq %%mm0, %%mm6\n" |
|
269 |
- "psrlq $16, %%mm0\n" |
|
270 |
- "paddw %%mm6, %%mm0\n" |
|
271 |
- "movd %%mm0, %2\n" |
|
272 |
- : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
273 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
274 |
- : "%ecx"); |
|
275 |
- |
|
276 |
- return tmp & 0x7FFF; |
|
277 |
-} |
|
278 |
-#undef SUM |
|
279 |
- |
|
280 |
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
281 |
- int line_size, int h) |
|
282 |
-{ |
|
283 |
- int tmp; |
|
284 |
- |
|
285 |
- av_assert2((((int) pix1) & 7) == 0); |
|
286 |
- av_assert2((((int) pix2) & 7) == 0); |
|
287 |
- av_assert2((line_size & 7) == 0); |
|
288 |
- |
|
289 |
-#define SUM(in0, in1, out0, out1) \ |
|
290 |
- "movq (%0), " #out0 "\n" \ |
|
291 |
- "movq (%1), %%mm2\n" \ |
|
292 |
- "movq 8(%0), " #out1 "\n" \ |
|
293 |
- "movq 8(%1), %%mm3\n" \ |
|
294 |
- "add %3, %0\n" \ |
|
295 |
- "add %3, %1\n" \ |
|
296 |
- "psubb %%mm2, " #out0 "\n" \ |
|
297 |
- "psubb %%mm3, " #out1 "\n" \ |
|
298 |
- "pxor %%mm7, " #out0 "\n" \ |
|
299 |
- "pxor %%mm7, " #out1 "\n" \ |
|
300 |
- "psadbw " #out0 ", " #in0 "\n" \ |
|
301 |
- "psadbw " #out1 ", " #in1 "\n" \ |
|
302 |
- "paddw " #in1 ", " #in0 "\n" \ |
|
303 |
- "paddw " #in0 ", %%mm6\n " |
|
304 |
- |
|
305 |
- __asm__ volatile ( |
|
306 |
- "movl %4, %%ecx\n" |
|
307 |
- "pxor %%mm6, %%mm6\n" |
|
308 |
- "pcmpeqw %%mm7, %%mm7\n" |
|
309 |
- "psllw $15, %%mm7\n" |
|
310 |
- "packsswb %%mm7, %%mm7\n" |
|
311 |
- "movq (%0), %%mm0\n" |
|
312 |
- "movq (%1), %%mm2\n" |
|
313 |
- "movq 8(%0), %%mm1\n" |
|
314 |
- "movq 8(%1), %%mm3\n" |
|
315 |
- "add %3, %0\n" |
|
316 |
- "add %3, %1\n" |
|
317 |
- "psubb %%mm2, %%mm0\n" |
|
318 |
- "psubb %%mm3, %%mm1\n" |
|
319 |
- "pxor %%mm7, %%mm0\n" |
|
320 |
- "pxor %%mm7, %%mm1\n" |
|
321 |
- "jmp 2f\n" |
|
322 |
- "1:\n" |
|
323 |
- |
|
324 |
- SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
325 |
- "2:\n" |
|
326 |
- SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
327 |
- |
|
328 |
- "subl $2, %%ecx\n" |
|
329 |
- "jnz 1b\n" |
|
330 |
- |
|
331 |
- "movd %%mm6, %2\n" |
|
332 |
- : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
333 |
- : "r" ((x86_reg) line_size), "m" (h) |
|
334 |
- : "%ecx"); |
|
335 |
- |
|
336 |
- return tmp; |
|
337 |
-} |
|
338 |
-#undef SUM |
|
339 |
- |
|
340 |
- |
|
341 |
- |
|
342 |
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { |
|
343 |
- 0x0000000000000000ULL, |
|
344 |
- 0x0001000100010001ULL, |
|
345 |
- 0x0002000200020002ULL, |
|
346 |
-}; |
|
347 |
- |
|
348 |
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; |
|
349 |
- |
|
350 |
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
351 |
-{ |
|
352 |
- x86_reg len = -(x86_reg)stride * h; |
|
353 |
- __asm__ volatile ( |
|
354 |
- ".p2align 4 \n\t" |
|
355 |
- "1: \n\t" |
|
356 |
- "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
357 |
- "movq (%2, %%"REG_a"), %%mm2 \n\t" |
|
358 |
- "movq (%2, %%"REG_a"), %%mm4 \n\t" |
|
359 |
- "add %3, %%"REG_a" \n\t" |
|
360 |
- "psubusb %%mm0, %%mm2 \n\t" |
|
361 |
- "psubusb %%mm4, %%mm0 \n\t" |
|
362 |
- "movq (%1, %%"REG_a"), %%mm1 \n\t" |
|
363 |
- "movq (%2, %%"REG_a"), %%mm3 \n\t" |
|
364 |
- "movq (%2, %%"REG_a"), %%mm5 \n\t" |
|
365 |
- "psubusb %%mm1, %%mm3 \n\t" |
|
366 |
- "psubusb %%mm5, %%mm1 \n\t" |
|
367 |
- "por %%mm2, %%mm0 \n\t" |
|
368 |
- "por %%mm1, %%mm3 \n\t" |
|
369 |
- "movq %%mm0, %%mm1 \n\t" |
|
370 |
- "movq %%mm3, %%mm2 \n\t" |
|
371 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
372 |
- "punpckhbw %%mm7, %%mm1 \n\t" |
|
373 |
- "punpcklbw %%mm7, %%mm3 \n\t" |
|
374 |
- "punpckhbw %%mm7, %%mm2 \n\t" |
|
375 |
- "paddw %%mm1, %%mm0 \n\t" |
|
376 |
- "paddw %%mm3, %%mm2 \n\t" |
|
377 |
- "paddw %%mm2, %%mm0 \n\t" |
|
378 |
- "paddw %%mm0, %%mm6 \n\t" |
|
379 |
- "add %3, %%"REG_a" \n\t" |
|
380 |
- " js 1b \n\t" |
|
381 |
- : "+a" (len) |
|
382 |
- : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride)); |
|
383 |
-} |
|
384 |
- |
|
385 |
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
386 |
- int stride, int h) |
|
387 |
-{ |
|
388 |
- __asm__ volatile ( |
|
389 |
- ".p2align 4 \n\t" |
|
390 |
- "1: \n\t" |
|
391 |
- "movq (%1), %%mm0 \n\t" |
|
392 |
- "movq (%1, %3), %%mm1 \n\t" |
|
393 |
- "psadbw (%2), %%mm0 \n\t" |
|
394 |
- "psadbw (%2, %3), %%mm1 \n\t" |
|
395 |
- "paddw %%mm0, %%mm6 \n\t" |
|
396 |
- "paddw %%mm1, %%mm6 \n\t" |
|
397 |
- "lea (%1,%3,2), %1 \n\t" |
|
398 |
- "lea (%2,%3,2), %2 \n\t" |
|
399 |
- "sub $2, %0 \n\t" |
|
400 |
- " jg 1b \n\t" |
|
401 |
- : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
402 |
- : "r" ((x86_reg) stride)); |
|
403 |
-} |
|
404 |
- |
|
405 |
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, |
|
406 |
- int stride, int h) |
|
407 |
-{ |
|
408 |
- int ret; |
|
409 |
- __asm__ volatile ( |
|
410 |
- "pxor %%xmm2, %%xmm2 \n\t" |
|
411 |
- ".p2align 4 \n\t" |
|
412 |
- "1: \n\t" |
|
413 |
- "movdqu (%1), %%xmm0 \n\t" |
|
414 |
- "movdqu (%1, %4), %%xmm1 \n\t" |
|
415 |
- "psadbw (%2), %%xmm0 \n\t" |
|
416 |
- "psadbw (%2, %4), %%xmm1 \n\t" |
|
417 |
- "paddw %%xmm0, %%xmm2 \n\t" |
|
418 |
- "paddw %%xmm1, %%xmm2 \n\t" |
|
419 |
- "lea (%1,%4,2), %1 \n\t" |
|
420 |
- "lea (%2,%4,2), %2 \n\t" |
|
421 |
- "sub $2, %0 \n\t" |
|
422 |
- " jg 1b \n\t" |
|
423 |
- "movhlps %%xmm2, %%xmm0 \n\t" |
|
424 |
- "paddw %%xmm0, %%xmm2 \n\t" |
|
425 |
- "movd %%xmm2, %3 \n\t" |
|
426 |
- : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) |
|
427 |
- : "r" ((x86_reg) stride)); |
|
428 |
- return ret; |
|
429 |
-} |
|
430 |
- |
|
431 |
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
432 |
- int stride, int h) |
|
433 |
-{ |
|
434 |
- __asm__ volatile ( |
|
435 |
- ".p2align 4 \n\t" |
|
436 |
- "1: \n\t" |
|
437 |
- "movq (%1), %%mm0 \n\t" |
|
438 |
- "movq (%1, %3), %%mm1 \n\t" |
|
439 |
- "pavgb 1(%1), %%mm0 \n\t" |
|
440 |
- "pavgb 1(%1, %3), %%mm1 \n\t" |
|
441 |
- "psadbw (%2), %%mm0 \n\t" |
|
442 |
- "psadbw (%2, %3), %%mm1 \n\t" |
|
443 |
- "paddw %%mm0, %%mm6 \n\t" |
|
444 |
- "paddw %%mm1, %%mm6 \n\t" |
|
445 |
- "lea (%1,%3,2), %1 \n\t" |
|
446 |
- "lea (%2,%3,2), %2 \n\t" |
|
447 |
- "sub $2, %0 \n\t" |
|
448 |
- " jg 1b \n\t" |
|
449 |
- : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
450 |
- : "r" ((x86_reg) stride)); |
|
451 |
-} |
|
452 |
- |
|
453 |
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
454 |
- int stride, int h) |
|
455 |
-{ |
|
456 |
- __asm__ volatile ( |
|
457 |
- "movq (%1), %%mm0 \n\t" |
|
458 |
- "add %3, %1 \n\t" |
|
459 |
- ".p2align 4 \n\t" |
|
460 |
- "1: \n\t" |
|
461 |
- "movq (%1), %%mm1 \n\t" |
|
462 |
- "movq (%1, %3), %%mm2 \n\t" |
|
463 |
- "pavgb %%mm1, %%mm0 \n\t" |
|
464 |
- "pavgb %%mm2, %%mm1 \n\t" |
|
465 |
- "psadbw (%2), %%mm0 \n\t" |
|
466 |
- "psadbw (%2, %3), %%mm1 \n\t" |
|
467 |
- "paddw %%mm0, %%mm6 \n\t" |
|
468 |
- "paddw %%mm1, %%mm6 \n\t" |
|
469 |
- "movq %%mm2, %%mm0 \n\t" |
|
470 |
- "lea (%1,%3,2), %1 \n\t" |
|
471 |
- "lea (%2,%3,2), %2 \n\t" |
|
472 |
- "sub $2, %0 \n\t" |
|
473 |
- " jg 1b \n\t" |
|
474 |
- : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
475 |
- : "r" ((x86_reg) stride)); |
|
476 |
-} |
|
477 |
- |
|
478 |
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
479 |
- int stride, int h) |
|
480 |
-{ |
|
481 |
- __asm__ volatile ( |
|
482 |
- "movq "MANGLE(bone)", %%mm5 \n\t" |
|
483 |
- "movq (%1), %%mm0 \n\t" |
|
484 |
- "pavgb 1(%1), %%mm0 \n\t" |
|
485 |
- "add %3, %1 \n\t" |
|
486 |
- ".p2align 4 \n\t" |
|
487 |
- "1: \n\t" |
|
488 |
- "movq (%1), %%mm1 \n\t" |
|
489 |
- "movq (%1,%3), %%mm2 \n\t" |
|
490 |
- "pavgb 1(%1), %%mm1 \n\t" |
|
491 |
- "pavgb 1(%1,%3), %%mm2 \n\t" |
|
492 |
- "psubusb %%mm5, %%mm1 \n\t" |
|
493 |
- "pavgb %%mm1, %%mm0 \n\t" |
|
494 |
- "pavgb %%mm2, %%mm1 \n\t" |
|
495 |
- "psadbw (%2), %%mm0 \n\t" |
|
496 |
- "psadbw (%2,%3), %%mm1 \n\t" |
|
497 |
- "paddw %%mm0, %%mm6 \n\t" |
|
498 |
- "paddw %%mm1, %%mm6 \n\t" |
|
499 |
- "movq %%mm2, %%mm0 \n\t" |
|
500 |
- "lea (%1,%3,2), %1 \n\t" |
|
501 |
- "lea (%2,%3,2), %2 \n\t" |
|
502 |
- "sub $2, %0 \n\t" |
|
503 |
- " jg 1b \n\t" |
|
504 |
- : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
505 |
- : "r" ((x86_reg) stride) |
|
506 |
- NAMED_CONSTRAINTS_ADD(bone)); |
|
507 |
-} |
|
508 |
- |
|
509 |
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, |
|
510 |
- int stride, int h) |
|
511 |
-{ |
|
512 |
- x86_reg len = -(x86_reg)stride * h; |
|
513 |
- __asm__ volatile ( |
|
514 |
- ".p2align 4 \n\t" |
|
515 |
- "1: \n\t" |
|
516 |
- "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
517 |
- "movq (%2, %%"REG_a"), %%mm1 \n\t" |
|
518 |
- "movq (%1, %%"REG_a"), %%mm2 \n\t" |
|
519 |
- "movq (%2, %%"REG_a"), %%mm3 \n\t" |
|
520 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
521 |
- "punpcklbw %%mm7, %%mm1 \n\t" |
|
522 |
- "punpckhbw %%mm7, %%mm2 \n\t" |
|
523 |
- "punpckhbw %%mm7, %%mm3 \n\t" |
|
524 |
- "paddw %%mm0, %%mm1 \n\t" |
|
525 |
- "paddw %%mm2, %%mm3 \n\t" |
|
526 |
- "movq (%3, %%"REG_a"), %%mm4 \n\t" |
|
527 |
- "movq (%3, %%"REG_a"), %%mm2 \n\t" |
|
528 |
- "paddw %%mm5, %%mm1 \n\t" |
|
529 |
- "paddw %%mm5, %%mm3 \n\t" |
|
530 |
- "psrlw $1, %%mm1 \n\t" |
|
531 |
- "psrlw $1, %%mm3 \n\t" |
|
532 |
- "packuswb %%mm3, %%mm1 \n\t" |
|
533 |
- "psubusb %%mm1, %%mm4 \n\t" |
|
534 |
- "psubusb %%mm2, %%mm1 \n\t" |
|
535 |
- "por %%mm4, %%mm1 \n\t" |
|
536 |
- "movq %%mm1, %%mm0 \n\t" |
|
537 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
538 |
- "punpckhbw %%mm7, %%mm1 \n\t" |
|
539 |
- "paddw %%mm1, %%mm0 \n\t" |
|
540 |
- "paddw %%mm0, %%mm6 \n\t" |
|
541 |
- "add %4, %%"REG_a" \n\t" |
|
542 |
- " js 1b \n\t" |
|
543 |
- : "+a" (len) |
|
544 |
- : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), |
|
545 |
- "r" ((x86_reg) stride)); |
|
546 |
-} |
|
547 |
- |
|
548 |
-static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
549 |
-{ |
|
550 |
- x86_reg len = -(x86_reg)stride * h; |
|
551 |
- __asm__ volatile ( |
|
552 |
- "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
553 |
- "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
|
554 |
- "movq %%mm0, %%mm1 \n\t" |
|
555 |
- "movq %%mm2, %%mm3 \n\t" |
|
556 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
557 |
- "punpckhbw %%mm7, %%mm1 \n\t" |
|
558 |
- "punpcklbw %%mm7, %%mm2 \n\t" |
|
559 |
- "punpckhbw %%mm7, %%mm3 \n\t" |
|
560 |
- "paddw %%mm2, %%mm0 \n\t" |
|
561 |
- "paddw %%mm3, %%mm1 \n\t" |
|
562 |
- ".p2align 4 \n\t" |
|
563 |
- "1: \n\t" |
|
564 |
- "movq (%2, %%"REG_a"), %%mm2 \n\t" |
|
565 |
- "movq 1(%2, %%"REG_a"), %%mm4 \n\t" |
|
566 |
- "movq %%mm2, %%mm3 \n\t" |
|
567 |
- "movq %%mm4, %%mm5 \n\t" |
|
568 |
- "punpcklbw %%mm7, %%mm2 \n\t" |
|
569 |
- "punpckhbw %%mm7, %%mm3 \n\t" |
|
570 |
- "punpcklbw %%mm7, %%mm4 \n\t" |
|
571 |
- "punpckhbw %%mm7, %%mm5 \n\t" |
|
572 |
- "paddw %%mm4, %%mm2 \n\t" |
|
573 |
- "paddw %%mm5, %%mm3 \n\t" |
|
574 |
- "movq %5, %%mm5 \n\t" |
|
575 |
- "paddw %%mm2, %%mm0 \n\t" |
|
576 |
- "paddw %%mm3, %%mm1 \n\t" |
|
577 |
- "paddw %%mm5, %%mm0 \n\t" |
|
578 |
- "paddw %%mm5, %%mm1 \n\t" |
|
579 |
- "movq (%3, %%"REG_a"), %%mm4 \n\t" |
|
580 |
- "movq (%3, %%"REG_a"), %%mm5 \n\t" |
|
581 |
- "psrlw $2, %%mm0 \n\t" |
|
582 |
- "psrlw $2, %%mm1 \n\t" |
|
583 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
584 |
- "psubusb %%mm0, %%mm4 \n\t" |
|
585 |
- "psubusb %%mm5, %%mm0 \n\t" |
|
586 |
- "por %%mm4, %%mm0 \n\t" |
|
587 |
- "movq %%mm0, %%mm4 \n\t" |
|
588 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
589 |
- "punpckhbw %%mm7, %%mm4 \n\t" |
|
590 |
- "paddw %%mm0, %%mm6 \n\t" |
|
591 |
- "paddw %%mm4, %%mm6 \n\t" |
|
592 |
- "movq %%mm2, %%mm0 \n\t" |
|
593 |
- "movq %%mm3, %%mm1 \n\t" |
|
594 |
- "add %4, %%"REG_a" \n\t" |
|
595 |
- " js 1b \n\t" |
|
596 |
- : "+a" (len) |
|
597 |
- : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), |
|
598 |
- "r" ((x86_reg) stride), "m" (round_tab[2])); |
|
599 |
-} |
|
600 |
- |
|
601 |
-static inline int sum_mmx(void) |
|
602 |
-{ |
|
603 |
- int ret; |
|
604 |
- __asm__ volatile ( |
|
605 |
- "movq %%mm6, %%mm0 \n\t" |
|
606 |
- "psrlq $32, %%mm6 \n\t" |
|
607 |
- "paddw %%mm0, %%mm6 \n\t" |
|
608 |
- "movq %%mm6, %%mm0 \n\t" |
|
609 |
- "psrlq $16, %%mm6 \n\t" |
|
610 |
- "paddw %%mm0, %%mm6 \n\t" |
|
611 |
- "movd %%mm6, %0 \n\t" |
|
612 |
- : "=r" (ret)); |
|
613 |
- return ret & 0xFFFF; |
|
614 |
-} |
|
615 |
- |
|
616 |
-static inline int sum_mmxext(void) |
|
617 |
-{ |
|
618 |
- int ret; |
|
619 |
- __asm__ volatile ( |
|
620 |
- "movd %%mm6, %0 \n\t" |
|
621 |
- : "=r" (ret)); |
|
622 |
- return ret; |
|
623 |
-} |
|
624 |
- |
|
625 |
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
626 |
-{ |
|
627 |
- sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h); |
|
628 |
-} |
|
629 |
- |
|
630 |
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
631 |
-{ |
|
632 |
- sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h); |
|
633 |
-} |
|
634 |
- |
|
635 |
-#define PIX_SAD(suf) \ |
|
636 |
-static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
637 |
- uint8_t *blk1, int stride, int h) \ |
|
638 |
-{ \ |
|
639 |
- av_assert2(h == 8); \ |
|
640 |
- __asm__ volatile ( \ |
|
641 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
642 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
643 |
- :); \ |
|
644 |
- \ |
|
645 |
- sad8_1_ ## suf(blk1, blk2, stride, 8); \ |
|
646 |
- \ |
|
647 |
- return sum_ ## suf(); \ |
|
648 |
-} \ |
|
649 |
- \ |
|
650 |
-static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
651 |
- uint8_t *blk1, int stride, int h) \ |
|
652 |
-{ \ |
|
653 |
- av_assert2(h == 8); \ |
|
654 |
- __asm__ volatile ( \ |
|
655 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
656 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
657 |
- "movq %0, %%mm5 \n\t" \ |
|
658 |
- :: "m" (round_tab[1])); \ |
|
659 |
- \ |
|
660 |
- sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ |
|
661 |
- \ |
|
662 |
- return sum_ ## suf(); \ |
|
663 |
-} \ |
|
664 |
- \ |
|
665 |
-static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
666 |
- uint8_t *blk1, int stride, int h) \ |
|
667 |
-{ \ |
|
668 |
- av_assert2(h == 8); \ |
|
669 |
- __asm__ volatile ( \ |
|
670 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
671 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
672 |
- "movq %0, %%mm5 \n\t" \ |
|
673 |
- :: "m" (round_tab[1])); \ |
|
674 |
- \ |
|
675 |
- sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ |
|
676 |
- \ |
|
677 |
- return sum_ ## suf(); \ |
|
678 |
-} \ |
|
679 |
- \ |
|
680 |
-static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
681 |
- uint8_t *blk1, int stride, int h) \ |
|
682 |
-{ \ |
|
683 |
- av_assert2(h == 8); \ |
|
684 |
- __asm__ volatile ( \ |
|
685 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
686 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
687 |
- ::); \ |
|
688 |
- \ |
|
689 |
- sad8_4_ ## suf(blk1, blk2, stride, 8); \ |
|
690 |
- \ |
|
691 |
- return sum_ ## suf(); \ |
|
692 |
-} \ |
|
693 |
- \ |
|
694 |
-static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
695 |
- uint8_t *blk1, int stride, int h) \ |
|
696 |
-{ \ |
|
697 |
- __asm__ volatile ( \ |
|
698 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
699 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
700 |
- :); \ |
|
701 |
- \ |
|
702 |
- sad8_1_ ## suf(blk1, blk2, stride, h); \ |
|
703 |
- sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
704 |
- \ |
|
705 |
- return sum_ ## suf(); \ |
|
706 |
-} \ |
|
707 |
- \ |
|
708 |
-static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
709 |
- uint8_t *blk1, int stride, int h) \ |
|
710 |
-{ \ |
|
711 |
- __asm__ volatile ( \ |
|
712 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
713 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
714 |
- "movq %0, %%mm5 \n\t" \ |
|
715 |
- :: "m" (round_tab[1])); \ |
|
716 |
- \ |
|
717 |
- sad8_x2a_ ## suf(blk1, blk2, stride, h); \ |
|
718 |
- sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
719 |
- \ |
|
720 |
- return sum_ ## suf(); \ |
|
721 |
-} \ |
|
722 |
- \ |
|
723 |
-static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
724 |
- uint8_t *blk1, int stride, int h) \ |
|
725 |
-{ \ |
|
726 |
- __asm__ volatile ( \ |
|
727 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
728 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
729 |
- "movq %0, %%mm5 \n\t" \ |
|
730 |
- :: "m" (round_tab[1])); \ |
|
731 |
- \ |
|
732 |
- sad8_y2a_ ## suf(blk1, blk2, stride, h); \ |
|
733 |
- sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
734 |
- \ |
|
735 |
- return sum_ ## suf(); \ |
|
736 |
-} \ |
|
737 |
- \ |
|
738 |
-static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
739 |
- uint8_t *blk1, int stride, int h) \ |
|
740 |
-{ \ |
|
741 |
- __asm__ volatile ( \ |
|
742 |
- "pxor %%mm7, %%mm7 \n\t" \ |
|
743 |
- "pxor %%mm6, %%mm6 \n\t" \ |
|
744 |
- ::); \ |
|
745 |
- \ |
|
746 |
- sad8_4_ ## suf(blk1, blk2, stride, h); \ |
|
747 |
- sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
748 |
- \ |
|
749 |
- return sum_ ## suf(); \ |
|
750 |
-} \ |
|
751 |
- |
|
752 |
-PIX_SAD(mmx) |
|
753 |
-PIX_SAD(mmxext) |
|
754 |
- |
|
755 |
-#endif /* HAVE_INLINE_ASM */ |
|
756 |
- |
|
757 |
-av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx) |
|
758 |
-{ |
|
759 |
- int cpu_flags = av_get_cpu_flags(); |
|
760 |
- |
|
761 |
-#if HAVE_INLINE_ASM |
|
762 |
- if (INLINE_MMX(cpu_flags)) { |
|
763 |
- c->pix_abs[0][0] = sad16_mmx; |
|
764 |
- c->pix_abs[0][1] = sad16_x2_mmx; |
|
765 |
- c->pix_abs[0][2] = sad16_y2_mmx; |
|
766 |
- c->pix_abs[0][3] = sad16_xy2_mmx; |
|
767 |
- c->pix_abs[1][0] = sad8_mmx; |
|
768 |
- c->pix_abs[1][1] = sad8_x2_mmx; |
|
769 |
- c->pix_abs[1][2] = sad8_y2_mmx; |
|
770 |
- c->pix_abs[1][3] = sad8_xy2_mmx; |
|
771 |
- |
|
772 |
- c->sad[0] = sad16_mmx; |
|
773 |
- c->sad[1] = sad8_mmx; |
|
774 |
- |
|
775 |
- c->vsad[4] = vsad_intra16_mmx; |
|
776 |
- |
|
777 |
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
778 |
- c->vsad[0] = vsad16_mmx; |
|
779 |
- } |
|
780 |
- } |
|
781 |
- |
|
782 |
- if (INLINE_MMXEXT(cpu_flags)) { |
|
783 |
- c->vsad[4] = vsad_intra16_mmxext; |
|
784 |
- |
|
785 |
- c->pix_abs[0][0] = sad16_mmxext; |
|
786 |
- c->pix_abs[1][0] = sad8_mmxext; |
|
787 |
- |
|
788 |
- c->sad[0] = sad16_mmxext; |
|
789 |
- c->sad[1] = sad8_mmxext; |
|
790 |
- |
|
791 |
- c->pix_abs[0][1] = sad16_x2_mmxext; |
|
792 |
- c->pix_abs[0][2] = sad16_y2_mmxext; |
|
793 |
- c->pix_abs[1][1] = sad8_x2_mmxext; |
|
794 |
- c->pix_abs[1][2] = sad8_y2_mmxext; |
|
795 |
- |
|
796 |
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
797 |
- c->pix_abs[0][3] = sad16_xy2_mmxext; |
|
798 |
- c->pix_abs[1][3] = sad8_xy2_mmxext; |
|
799 |
- |
|
800 |
- c->vsad[0] = vsad16_mmxext; |
|
801 |
- } |
|
802 |
- } |
|
803 |
- |
|
804 |
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { |
|
805 |
- c->sad[0] = sad16_sse2; |
|
806 |
- } |
|
807 |
- |
|
808 |
-#endif /* HAVE_INLINE_ASM */ |
|
809 |
- |
|
810 |
- if (EXTERNAL_MMX(cpu_flags)) { |
|
811 |
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; |
|
812 |
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; |
|
813 |
- c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; |
|
814 |
- c->sse[0] = ff_sse16_mmx; |
|
815 |
- c->sse[1] = ff_sse8_mmx; |
|
816 |
-#if HAVE_YASM |
|
817 |
- c->nsse[0] = nsse16_mmx; |
|
818 |
- c->nsse[1] = nsse8_mmx; |
|
819 |
-#endif |
|
820 |
- } |
|
821 |
- |
|
822 |
- if (EXTERNAL_MMXEXT(cpu_flags)) { |
|
823 |
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; |
|
824 |
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; |
|
825 |
- c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext; |
|
826 |
- } |
|
827 |
- |
|
828 |
- if (EXTERNAL_SSE2(cpu_flags)) { |
|
829 |
- c->sse[0] = ff_sse16_sse2; |
|
830 |
- c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; |
|
831 |
- |
|
832 |
-#if HAVE_ALIGNED_STACK |
|
833 |
- c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; |
|
834 |
- c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; |
|
835 |
-#endif |
|
836 |
- } |
|
837 |
- |
|
838 |
- if (EXTERNAL_SSSE3(cpu_flags)) { |
|
839 |
- c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; |
|
840 |
-#if HAVE_ALIGNED_STACK |
|
841 |
- c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; |
|
842 |
- c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; |
|
843 |
-#endif |
|
844 |
- } |
|
845 |
-} |
846 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,467 +0,0 @@ |
1 |
-;***************************************************************************** |
|
2 |
-;* MMX optimized DSP utils |
|
3 |
-;***************************************************************************** |
|
4 |
-;* Copyright (c) 2000, 2001 Fabrice Bellard |
|
5 |
-;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
6 |
-;* |
|
7 |
-;* This file is part of FFmpeg. |
|
8 |
-;* |
|
9 |
-;* FFmpeg is free software; you can redistribute it and/or |
|
10 |
-;* modify it under the terms of the GNU Lesser General Public |
|
11 |
-;* License as published by the Free Software Foundation; either |
|
12 |
-;* version 2.1 of the License, or (at your option) any later version. |
|
13 |
-;* |
|
14 |
-;* FFmpeg is distributed in the hope that it will be useful, |
|
15 |
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
16 |
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
17 |
-;* Lesser General Public License for more details. |
|
18 |
-;* |
|
19 |
-;* You should have received a copy of the GNU Lesser General Public |
|
20 |
-;* License along with FFmpeg; if not, write to the Free Software |
|
21 |
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
22 |
-;***************************************************************************** |
|
23 |
- |
|
24 |
-%include "libavutil/x86/x86util.asm" |
|
25 |
- |
|
26 |
-SECTION .text |
|
27 |
- |
|
28 |
-%macro DIFF_PIXELS_1 4 |
|
29 |
- movh %1, %3 |
|
30 |
- movh %2, %4 |
|
31 |
- punpcklbw %2, %1 |
|
32 |
- punpcklbw %1, %1 |
|
33 |
- psubw %1, %2 |
|
34 |
-%endmacro |
|
35 |
- |
|
36 |
-; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 |
|
37 |
-; %6=temporary storage location |
|
38 |
-; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) |
|
39 |
-%macro DIFF_PIXELS_8 6 |
|
40 |
- DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] |
|
41 |
- DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
42 |
- DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
43 |
- add %1, %5 |
|
44 |
- add %2, %5 |
|
45 |
- DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] |
|
46 |
- DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
47 |
- DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
48 |
- DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] |
|
49 |
-%ifdef m8 |
|
50 |
- DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] |
|
51 |
-%else |
|
52 |
- mova [%6], m0 |
|
53 |
- DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] |
|
54 |
- mova m0, [%6] |
|
55 |
-%endif |
|
56 |
- sub %1, %5 |
|
57 |
- sub %2, %5 |
|
58 |
-%endmacro |
|
59 |
- |
|
60 |
-%macro HADAMARD8 0 |
|
61 |
- SUMSUB_BADC w, 0, 1, 2, 3 |
|
62 |
- SUMSUB_BADC w, 4, 5, 6, 7 |
|
63 |
- SUMSUB_BADC w, 0, 2, 1, 3 |
|
64 |
- SUMSUB_BADC w, 4, 6, 5, 7 |
|
65 |
- SUMSUB_BADC w, 0, 4, 1, 5 |
|
66 |
- SUMSUB_BADC w, 2, 6, 3, 7 |
|
67 |
-%endmacro |
|
68 |
- |
|
69 |
-%macro ABS1_SUM 3 |
|
70 |
- ABS1 %1, %2 |
|
71 |
- paddusw %3, %1 |
|
72 |
-%endmacro |
|
73 |
- |
|
74 |
-%macro ABS2_SUM 6 |
|
75 |
- ABS2 %1, %2, %3, %4 |
|
76 |
- paddusw %5, %1 |
|
77 |
- paddusw %6, %2 |
|
78 |
-%endmacro |
|
79 |
- |
|
80 |
-%macro ABS_SUM_8x8_64 1 |
|
81 |
- ABS2 m0, m1, m8, m9 |
|
82 |
- ABS2_SUM m2, m3, m8, m9, m0, m1 |
|
83 |
- ABS2_SUM m4, m5, m8, m9, m0, m1 |
|
84 |
- ABS2_SUM m6, m7, m8, m9, m0, m1 |
|
85 |
- paddusw m0, m1 |
|
86 |
-%endmacro |
|
87 |
- |
|
88 |
-%macro ABS_SUM_8x8_32 1 |
|
89 |
- mova [%1], m7 |
|
90 |
- ABS1 m0, m7 |
|
91 |
- ABS1 m1, m7 |
|
92 |
- ABS1_SUM m2, m7, m0 |
|
93 |
- ABS1_SUM m3, m7, m1 |
|
94 |
- ABS1_SUM m4, m7, m0 |
|
95 |
- ABS1_SUM m5, m7, m1 |
|
96 |
- ABS1_SUM m6, m7, m0 |
|
97 |
- mova m2, [%1] |
|
98 |
- ABS1_SUM m2, m7, m1 |
|
99 |
- paddusw m0, m1 |
|
100 |
-%endmacro |
|
101 |
- |
|
102 |
-; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to |
|
103 |
-; about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
|
104 |
-; and it's even more unlikely to not have any alternative mvs/modes with lower cost. |
|
105 |
-%macro HSUM 3 |
|
106 |
-%if cpuflag(sse2) |
|
107 |
- movhlps %2, %1 |
|
108 |
- paddusw %1, %2 |
|
109 |
- pshuflw %2, %1, 0xE |
|
110 |
- paddusw %1, %2 |
|
111 |
- pshuflw %2, %1, 0x1 |
|
112 |
- paddusw %1, %2 |
|
113 |
- movd %3, %1 |
|
114 |
-%elif cpuflag(mmxext) |
|
115 |
- pshufw %2, %1, 0xE |
|
116 |
- paddusw %1, %2 |
|
117 |
- pshufw %2, %1, 0x1 |
|
118 |
- paddusw %1, %2 |
|
119 |
- movd %3, %1 |
|
120 |
-%elif cpuflag(mmx) |
|
121 |
- mova %2, %1 |
|
122 |
- psrlq %1, 32 |
|
123 |
- paddusw %1, %2 |
|
124 |
- mova %2, %1 |
|
125 |
- psrlq %1, 16 |
|
126 |
- paddusw %1, %2 |
|
127 |
- movd %3, %1 |
|
128 |
-%endif |
|
129 |
-%endmacro |
|
130 |
- |
|
131 |
-%macro STORE4 5 |
|
132 |
- mova [%1+mmsize*0], %2 |
|
133 |
- mova [%1+mmsize*1], %3 |
|
134 |
- mova [%1+mmsize*2], %4 |
|
135 |
- mova [%1+mmsize*3], %5 |
|
136 |
-%endmacro |
|
137 |
- |
|
138 |
-%macro LOAD4 5 |
|
139 |
- mova %2, [%1+mmsize*0] |
|
140 |
- mova %3, [%1+mmsize*1] |
|
141 |
- mova %4, [%1+mmsize*2] |
|
142 |
- mova %5, [%1+mmsize*3] |
|
143 |
-%endmacro |
|
144 |
- |
|
145 |
-%macro hadamard8_16_wrapper 2 |
|
146 |
-cglobal hadamard8_diff, 4, 4, %1 |
|
147 |
-%ifndef m8 |
|
148 |
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
149 |
- SUB rsp, pad |
|
150 |
-%endif |
|
151 |
- call hadamard8x8_diff %+ SUFFIX |
|
152 |
-%ifndef m8 |
|
153 |
- ADD rsp, pad |
|
154 |
-%endif |
|
155 |
- RET |
|
156 |
- |
|
157 |
-cglobal hadamard8_diff16, 5, 6, %1 |
|
158 |
-%ifndef m8 |
|
159 |
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
160 |
- SUB rsp, pad |
|
161 |
-%endif |
|
162 |
- |
|
163 |
- call hadamard8x8_diff %+ SUFFIX |
|
164 |
- mov r5d, eax |
|
165 |
- |
|
166 |
- add r1, 8 |
|
167 |
- add r2, 8 |
|
168 |
- call hadamard8x8_diff %+ SUFFIX |
|
169 |
- add r5d, eax |
|
170 |
- |
|
171 |
- cmp r4d, 16 |
|
172 |
- jne .done |
|
173 |
- |
|
174 |
- lea r1, [r1+r3*8-8] |
|
175 |
- lea r2, [r2+r3*8-8] |
|
176 |
- call hadamard8x8_diff %+ SUFFIX |
|
177 |
- add r5d, eax |
|
178 |
- |
|
179 |
- add r1, 8 |
|
180 |
- add r2, 8 |
|
181 |
- call hadamard8x8_diff %+ SUFFIX |
|
182 |
- add r5d, eax |
|
183 |
- |
|
184 |
-.done: |
|
185 |
- mov eax, r5d |
|
186 |
-%ifndef m8 |
|
187 |
- ADD rsp, pad |
|
188 |
-%endif |
|
189 |
- RET |
|
190 |
-%endmacro |
|
191 |
- |
|
192 |
-%macro HADAMARD8_DIFF 0-1 |
|
193 |
-%if cpuflag(sse2) |
|
194 |
-hadamard8x8_diff %+ SUFFIX: |
|
195 |
- lea r0, [r3*3] |
|
196 |
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize |
|
197 |
- HADAMARD8 |
|
198 |
-%if ARCH_X86_64 |
|
199 |
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
|
200 |
-%else |
|
201 |
- TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] |
|
202 |
-%endif |
|
203 |
- HADAMARD8 |
|
204 |
- ABS_SUM_8x8 rsp+gprsize |
|
205 |
- HSUM m0, m1, eax |
|
206 |
- and eax, 0xFFFF |
|
207 |
- ret |
|
208 |
- |
|
209 |
-hadamard8_16_wrapper %1, 3 |
|
210 |
-%elif cpuflag(mmx) |
|
211 |
-ALIGN 16 |
|
212 |
-; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, |
|
213 |
-; uint8_t *src2, int stride, int h) |
|
214 |
-; r0 = void *s = unused, int h = unused (always 8) |
|
215 |
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16 |
|
216 |
-; can simply call this 2x2x (and that's why we access rsp+gprsize |
|
217 |
-; everywhere, which is rsp of calling func) |
|
218 |
-hadamard8x8_diff %+ SUFFIX: |
|
219 |
- lea r0, [r3*3] |
|
220 |
- |
|
221 |
- ; first 4x8 pixels |
|
222 |
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 |
|
223 |
- HADAMARD8 |
|
224 |
- mova [rsp+gprsize+0x60], m7 |
|
225 |
- TRANSPOSE4x4W 0, 1, 2, 3, 7 |
|
226 |
- STORE4 rsp+gprsize, m0, m1, m2, m3 |
|
227 |
- mova m7, [rsp+gprsize+0x60] |
|
228 |
- TRANSPOSE4x4W 4, 5, 6, 7, 0 |
|
229 |
- STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 |
|
230 |
- |
|
231 |
- ; second 4x8 pixels |
|
232 |
- DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 |
|
233 |
- HADAMARD8 |
|
234 |
- mova [rsp+gprsize+0x60], m7 |
|
235 |
- TRANSPOSE4x4W 0, 1, 2, 3, 7 |
|
236 |
- STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 |
|
237 |
- mova m7, [rsp+gprsize+0x60] |
|
238 |
- TRANSPOSE4x4W 4, 5, 6, 7, 0 |
|
239 |
- |
|
240 |
- LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 |
|
241 |
- HADAMARD8 |
|
242 |
- ABS_SUM_8x8_32 rsp+gprsize+0x60 |
|
243 |
- mova [rsp+gprsize+0x60], m0 |
|
244 |
- |
|
245 |
- LOAD4 rsp+gprsize , m0, m1, m2, m3 |
|
246 |
- LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 |
|
247 |
- HADAMARD8 |
|
248 |
- ABS_SUM_8x8_32 rsp+gprsize |
|
249 |
- paddusw m0, [rsp+gprsize+0x60] |
|
250 |
- |
|
251 |
- HSUM m0, m1, eax |
|
252 |
- and rax, 0xFFFF |
|
253 |
- ret |
|
254 |
- |
|
255 |
-hadamard8_16_wrapper 0, 14 |
|
256 |
-%endif |
|
257 |
-%endmacro |
|
258 |
- |
|
259 |
-INIT_MMX mmx |
|
260 |
-HADAMARD8_DIFF |
|
261 |
- |
|
262 |
-INIT_MMX mmxext |
|
263 |
-HADAMARD8_DIFF |
|
264 |
- |
|
265 |
-INIT_XMM sse2 |
|
266 |
-%if ARCH_X86_64 |
|
267 |
-%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
|
268 |
-%else |
|
269 |
-%define ABS_SUM_8x8 ABS_SUM_8x8_32 |
|
270 |
-%endif |
|
271 |
-HADAMARD8_DIFF 10 |
|
272 |
- |
|
273 |
-INIT_XMM ssse3 |
|
274 |
-%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
|
275 |
-HADAMARD8_DIFF 9 |
|
276 |
- |
|
277 |
-; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
278 |
-; int line_size, int h) |
|
279 |
- |
|
280 |
-%macro SUM_SQUARED_ERRORS 1 |
|
281 |
-cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h |
|
282 |
-%if %1 == mmsize |
|
283 |
- shr hd, 1 |
|
284 |
-%endif |
|
285 |
- pxor m0, m0 ; mm0 = 0 |
|
286 |
- pxor m7, m7 ; mm7 holds the sum |
|
287 |
- |
|
288 |
-.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned |
|
289 |
- movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx |
|
290 |
- movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx |
|
291 |
-%if %1 == mmsize |
|
292 |
- movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx |
|
293 |
- movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx |
|
294 |
-%else ; %1 / 2 == mmsize; mmx only |
|
295 |
- mova m3, [pix1q+8] ; m3 = pix1[0][8-15] |
|
296 |
- mova m4, [pix2q+8] ; m4 = pix2[0][8-15] |
|
297 |
-%endif |
|
298 |
- |
|
299 |
- ; todo: mm1-mm2, mm3-mm4 |
|
300 |
- ; algo: subtract mm1 from mm2 with saturation and vice versa |
|
301 |
- ; OR the result to get the absolute difference |
|
302 |
- mova m5, m1 |
|
303 |
- mova m6, m3 |
|
304 |
- psubusb m1, m2 |
|
305 |
- psubusb m3, m4 |
|
306 |
- psubusb m2, m5 |
|
307 |
- psubusb m4, m6 |
|
308 |
- |
|
309 |
- por m2, m1 |
|
310 |
- por m4, m3 |
|
311 |
- |
|
312 |
- ; now convert to 16-bit vectors so we can square them |
|
313 |
- mova m1, m2 |
|
314 |
- mova m3, m4 |
|
315 |
- |
|
316 |
- punpckhbw m2, m0 |
|
317 |
- punpckhbw m4, m0 |
|
318 |
- punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2) |
|
319 |
- punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4) |
|
320 |
- |
|
321 |
- pmaddwd m2, m2 |
|
322 |
- pmaddwd m4, m4 |
|
323 |
- pmaddwd m1, m1 |
|
324 |
- pmaddwd m3, m3 |
|
325 |
- |
|
326 |
- paddd m1, m2 |
|
327 |
- paddd m3, m4 |
|
328 |
- paddd m7, m1 |
|
329 |
- paddd m7, m3 |
|
330 |
- |
|
331 |
-%if %1 == mmsize |
|
332 |
- lea pix1q, [pix1q + 2*lsizeq] |
|
333 |
- lea pix2q, [pix2q + 2*lsizeq] |
|
334 |
-%else |
|
335 |
- add pix1q, lsizeq |
|
336 |
- add pix2q, lsizeq |
|
337 |
-%endif |
|
338 |
- dec hd |
|
339 |
- jnz .next2lines |
|
340 |
- |
|
341 |
- HADDD m7, m1 |
|
342 |
- movd eax, m7 ; return value |
|
343 |
- RET |
|
344 |
-%endmacro |
|
345 |
- |
|
346 |
-INIT_MMX mmx |
|
347 |
-SUM_SQUARED_ERRORS 8 |
|
348 |
- |
|
349 |
-INIT_MMX mmx |
|
350 |
-SUM_SQUARED_ERRORS 16 |
|
351 |
- |
|
352 |
-INIT_XMM sse2 |
|
353 |
-SUM_SQUARED_ERRORS 16 |
|
354 |
- |
|
355 |
-;----------------------------------------------- |
|
356 |
-;int ff_sum_abs_dctelem(int16_t *block) |
|
357 |
-;----------------------------------------------- |
|
358 |
-; %1 = number of xmm registers used |
|
359 |
-; %2 = number of inline loops |
|
360 |
- |
|
361 |
-%macro SUM_ABS_DCTELEM 2 |
|
362 |
-cglobal sum_abs_dctelem, 1, 1, %1, block |
|
363 |
- pxor m0, m0 |
|
364 |
- pxor m1, m1 |
|
365 |
-%assign %%i 0 |
|
366 |
-%rep %2 |
|
367 |
- mova m2, [blockq+mmsize*(0+%%i)] |
|
368 |
- mova m3, [blockq+mmsize*(1+%%i)] |
|
369 |
- mova m4, [blockq+mmsize*(2+%%i)] |
|
370 |
- mova m5, [blockq+mmsize*(3+%%i)] |
|
371 |
- ABS1_SUM m2, m6, m0 |
|
372 |
- ABS1_SUM m3, m6, m1 |
|
373 |
- ABS1_SUM m4, m6, m0 |
|
374 |
- ABS1_SUM m5, m6, m1 |
|
375 |
-%assign %%i %%i+4 |
|
376 |
-%endrep |
|
377 |
- paddusw m0, m1 |
|
378 |
- HSUM m0, m1, eax |
|
379 |
- and eax, 0xFFFF |
|
380 |
- RET |
|
381 |
-%endmacro |
|
382 |
- |
|
383 |
-INIT_MMX mmx |
|
384 |
-SUM_ABS_DCTELEM 0, 4 |
|
385 |
-INIT_MMX mmxext |
|
386 |
-SUM_ABS_DCTELEM 0, 4 |
|
387 |
-INIT_XMM sse2 |
|
388 |
-SUM_ABS_DCTELEM 7, 2 |
|
389 |
-INIT_XMM ssse3 |
|
390 |
-SUM_ABS_DCTELEM 6, 2 |
|
391 |
- |
|
392 |
-;------------------------------------------------------------------------------ |
|
393 |
-; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h) |
|
394 |
-;------------------------------------------------------------------------------ |
|
395 |
-; %1 = 8/16. %2-5=m# |
|
396 |
-%macro HF_NOISE_PART1 5 |
|
397 |
- mova m%2, [pix1q] |
|
398 |
-%if %1 == 8 |
|
399 |
- mova m%3, m%2 |
|
400 |
- psllq m%2, 8 |
|
401 |
- psrlq m%3, 8 |
|
402 |
- psrlq m%2, 8 |
|
403 |
-%else |
|
404 |
- mova m%3, [pix1q+1] |
|
405 |
-%endif |
|
406 |
- mova m%4, m%2 |
|
407 |
- mova m%5, m%3 |
|
408 |
- punpcklbw m%2, m7 |
|
409 |
- punpcklbw m%3, m7 |
|
410 |
- punpckhbw m%4, m7 |
|
411 |
- punpckhbw m%5, m7 |
|
412 |
- psubw m%2, m%3 |
|
413 |
- psubw m%4, m%5 |
|
414 |
-%endmacro |
|
415 |
- |
|
416 |
-; %1-2 = m# |
|
417 |
-%macro HF_NOISE_PART2 4 |
|
418 |
- psubw m%1, m%3 |
|
419 |
- psubw m%2, m%4 |
|
420 |
- pxor m3, m3 |
|
421 |
- pxor m1, m1 |
|
422 |
- pcmpgtw m3, m%1 |
|
423 |
- pcmpgtw m1, m%2 |
|
424 |
- pxor m%1, m3 |
|
425 |
- pxor m%2, m1 |
|
426 |
- psubw m%1, m3 |
|
427 |
- psubw m%2, m1 |
|
428 |
- paddw m%2, m%1 |
|
429 |
- paddw m6, m%2 |
|
430 |
-%endmacro |
|
431 |
- |
|
432 |
-; %1 = 8/16 |
|
433 |
-%macro HF_NOISE 1 |
|
434 |
-cglobal hf_noise%1, 3,3,0, pix1, lsize, h |
|
435 |
- movsxdifnidn lsizeq, lsized |
|
436 |
- sub hd, 2 |
|
437 |
- pxor m7, m7 |
|
438 |
- pxor m6, m6 |
|
439 |
- HF_NOISE_PART1 %1, 0, 1, 2, 3 |
|
440 |
- add pix1q, lsizeq |
|
441 |
- HF_NOISE_PART1 %1, 4, 1, 5, 3 |
|
442 |
- HF_NOISE_PART2 0, 2, 4, 5 |
|
443 |
- add pix1q, lsizeq |
|
444 |
-.loop: |
|
445 |
- HF_NOISE_PART1 %1, 0, 1, 2, 3 |
|
446 |
- HF_NOISE_PART2 4, 5, 0, 2 |
|
447 |
- add pix1q, lsizeq |
|
448 |
- HF_NOISE_PART1 %1, 4, 1, 5, 3 |
|
449 |
- HF_NOISE_PART2 0, 2, 4, 5 |
|
450 |
- add pix1q, lsizeq |
|
451 |
- sub hd, 2 |
|
452 |
- jne .loop |
|
453 |
- |
|
454 |
- mova m0, m6 |
|
455 |
- punpcklwd m0, m7 |
|
456 |
- punpckhwd m6, m7 |
|
457 |
- paddd m6, m0 |
|
458 |
- mova m0, m6 |
|
459 |
- psrlq m6, 32 |
|
460 |
- paddd m0, m6 |
|
461 |
- movd eax, m0 ; eax = result of hf_noise8; |
|
462 |
- REP_RET ; return eax; |
|
463 |
-%endmacro |
|
464 |
- |
|
465 |
-INIT_MMX mmx |
|
466 |
-HF_NOISE 8 |
|
467 |
-HF_NOISE 16 |
468 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,467 @@ |
0 |
+;***************************************************************************** |
|
1 |
+;* SIMD-optimized motion estimation |
|
2 |
+;***************************************************************************** |
|
3 |
+;* Copyright (c) 2000, 2001 Fabrice Bellard |
|
4 |
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
5 |
+;* |
|
6 |
+;* This file is part of FFmpeg. |
|
7 |
+;* |
|
8 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
9 |
+;* modify it under the terms of the GNU Lesser General Public |
|
10 |
+;* License as published by the Free Software Foundation; either |
|
11 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
12 |
+;* |
|
13 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
14 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
16 |
+;* Lesser General Public License for more details. |
|
17 |
+;* |
|
18 |
+;* You should have received a copy of the GNU Lesser General Public |
|
19 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
20 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
21 |
+;***************************************************************************** |
|
22 |
+ |
|
23 |
+%include "libavutil/x86/x86util.asm" |
|
24 |
+ |
|
25 |
+SECTION .text |
|
26 |
+ |
|
27 |
+%macro DIFF_PIXELS_1 4 |
|
28 |
+ movh %1, %3 |
|
29 |
+ movh %2, %4 |
|
30 |
+ punpcklbw %2, %1 |
|
31 |
+ punpcklbw %1, %1 |
|
32 |
+ psubw %1, %2 |
|
33 |
+%endmacro |
|
34 |
+ |
|
35 |
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 |
|
36 |
+; %6=temporary storage location |
|
37 |
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) |
|
38 |
+%macro DIFF_PIXELS_8 6 |
|
39 |
+ DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] |
|
40 |
+ DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
41 |
+ DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
42 |
+ add %1, %5 |
|
43 |
+ add %2, %5 |
|
44 |
+ DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] |
|
45 |
+ DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] |
|
46 |
+ DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
|
47 |
+ DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] |
|
48 |
+%ifdef m8 |
|
49 |
+ DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] |
|
50 |
+%else |
|
51 |
+ mova [%6], m0 |
|
52 |
+ DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] |
|
53 |
+ mova m0, [%6] |
|
54 |
+%endif |
|
55 |
+ sub %1, %5 |
|
56 |
+ sub %2, %5 |
|
57 |
+%endmacro |
|
58 |
+ |
|
59 |
+%macro HADAMARD8 0 |
|
60 |
+ SUMSUB_BADC w, 0, 1, 2, 3 |
|
61 |
+ SUMSUB_BADC w, 4, 5, 6, 7 |
|
62 |
+ SUMSUB_BADC w, 0, 2, 1, 3 |
|
63 |
+ SUMSUB_BADC w, 4, 6, 5, 7 |
|
64 |
+ SUMSUB_BADC w, 0, 4, 1, 5 |
|
65 |
+ SUMSUB_BADC w, 2, 6, 3, 7 |
|
66 |
+%endmacro |
|
67 |
+ |
|
68 |
+%macro ABS1_SUM 3 |
|
69 |
+ ABS1 %1, %2 |
|
70 |
+ paddusw %3, %1 |
|
71 |
+%endmacro |
|
72 |
+ |
|
73 |
+%macro ABS2_SUM 6 |
|
74 |
+ ABS2 %1, %2, %3, %4 |
|
75 |
+ paddusw %5, %1 |
|
76 |
+ paddusw %6, %2 |
|
77 |
+%endmacro |
|
78 |
+ |
|
79 |
+%macro ABS_SUM_8x8_64 1 |
|
80 |
+ ABS2 m0, m1, m8, m9 |
|
81 |
+ ABS2_SUM m2, m3, m8, m9, m0, m1 |
|
82 |
+ ABS2_SUM m4, m5, m8, m9, m0, m1 |
|
83 |
+ ABS2_SUM m6, m7, m8, m9, m0, m1 |
|
84 |
+ paddusw m0, m1 |
|
85 |
+%endmacro |
|
86 |
+ |
|
87 |
+%macro ABS_SUM_8x8_32 1 |
|
88 |
+ mova [%1], m7 |
|
89 |
+ ABS1 m0, m7 |
|
90 |
+ ABS1 m1, m7 |
|
91 |
+ ABS1_SUM m2, m7, m0 |
|
92 |
+ ABS1_SUM m3, m7, m1 |
|
93 |
+ ABS1_SUM m4, m7, m0 |
|
94 |
+ ABS1_SUM m5, m7, m1 |
|
95 |
+ ABS1_SUM m6, m7, m0 |
|
96 |
+ mova m2, [%1] |
|
97 |
+ ABS1_SUM m2, m7, m1 |
|
98 |
+ paddusw m0, m1 |
|
99 |
+%endmacro |
|
100 |
+ |
|
101 |
+; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to |
|
102 |
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
|
103 |
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost. |
|
104 |
+%macro HSUM 3 |
|
105 |
+%if cpuflag(sse2) |
|
106 |
+ movhlps %2, %1 |
|
107 |
+ paddusw %1, %2 |
|
108 |
+ pshuflw %2, %1, 0xE |
|
109 |
+ paddusw %1, %2 |
|
110 |
+ pshuflw %2, %1, 0x1 |
|
111 |
+ paddusw %1, %2 |
|
112 |
+ movd %3, %1 |
|
113 |
+%elif cpuflag(mmxext) |
|
114 |
+ pshufw %2, %1, 0xE |
|
115 |
+ paddusw %1, %2 |
|
116 |
+ pshufw %2, %1, 0x1 |
|
117 |
+ paddusw %1, %2 |
|
118 |
+ movd %3, %1 |
|
119 |
+%elif cpuflag(mmx) |
|
120 |
+ mova %2, %1 |
|
121 |
+ psrlq %1, 32 |
|
122 |
+ paddusw %1, %2 |
|
123 |
+ mova %2, %1 |
|
124 |
+ psrlq %1, 16 |
|
125 |
+ paddusw %1, %2 |
|
126 |
+ movd %3, %1 |
|
127 |
+%endif |
|
128 |
+%endmacro |
|
129 |
+ |
|
130 |
+%macro STORE4 5 |
|
131 |
+ mova [%1+mmsize*0], %2 |
|
132 |
+ mova [%1+mmsize*1], %3 |
|
133 |
+ mova [%1+mmsize*2], %4 |
|
134 |
+ mova [%1+mmsize*3], %5 |
|
135 |
+%endmacro |
|
136 |
+ |
|
137 |
+%macro LOAD4 5 |
|
138 |
+ mova %2, [%1+mmsize*0] |
|
139 |
+ mova %3, [%1+mmsize*1] |
|
140 |
+ mova %4, [%1+mmsize*2] |
|
141 |
+ mova %5, [%1+mmsize*3] |
|
142 |
+%endmacro |
|
143 |
+ |
|
144 |
+%macro hadamard8_16_wrapper 2 |
|
145 |
+cglobal hadamard8_diff, 4, 4, %1 |
|
146 |
+%ifndef m8 |
|
147 |
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
148 |
+ SUB rsp, pad |
|
149 |
+%endif |
|
150 |
+ call hadamard8x8_diff %+ SUFFIX |
|
151 |
+%ifndef m8 |
|
152 |
+ ADD rsp, pad |
|
153 |
+%endif |
|
154 |
+ RET |
|
155 |
+ |
|
156 |
+cglobal hadamard8_diff16, 5, 6, %1 |
|
157 |
+%ifndef m8 |
|
158 |
+ %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
|
159 |
+ SUB rsp, pad |
|
160 |
+%endif |
|
161 |
+ |
|
162 |
+ call hadamard8x8_diff %+ SUFFIX |
|
163 |
+ mov r5d, eax |
|
164 |
+ |
|
165 |
+ add r1, 8 |
|
166 |
+ add r2, 8 |
|
167 |
+ call hadamard8x8_diff %+ SUFFIX |
|
168 |
+ add r5d, eax |
|
169 |
+ |
|
170 |
+ cmp r4d, 16 |
|
171 |
+ jne .done |
|
172 |
+ |
|
173 |
+ lea r1, [r1+r3*8-8] |
|
174 |
+ lea r2, [r2+r3*8-8] |
|
175 |
+ call hadamard8x8_diff %+ SUFFIX |
|
176 |
+ add r5d, eax |
|
177 |
+ |
|
178 |
+ add r1, 8 |
|
179 |
+ add r2, 8 |
|
180 |
+ call hadamard8x8_diff %+ SUFFIX |
|
181 |
+ add r5d, eax |
|
182 |
+ |
|
183 |
+.done: |
|
184 |
+ mov eax, r5d |
|
185 |
+%ifndef m8 |
|
186 |
+ ADD rsp, pad |
|
187 |
+%endif |
|
188 |
+ RET |
|
189 |
+%endmacro |
|
190 |
+ |
|
191 |
+%macro HADAMARD8_DIFF 0-1 |
|
192 |
+%if cpuflag(sse2) |
|
193 |
+hadamard8x8_diff %+ SUFFIX: |
|
194 |
+ lea r0, [r3*3] |
|
195 |
+ DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize |
|
196 |
+ HADAMARD8 |
|
197 |
+%if ARCH_X86_64 |
|
198 |
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
|
199 |
+%else |
|
200 |
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] |
|
201 |
+%endif |
|
202 |
+ HADAMARD8 |
|
203 |
+ ABS_SUM_8x8 rsp+gprsize |
|
204 |
+ HSUM m0, m1, eax |
|
205 |
+ and eax, 0xFFFF |
|
206 |
+ ret |
|
207 |
+ |
|
208 |
+hadamard8_16_wrapper %1, 3 |
|
209 |
+%elif cpuflag(mmx) |
|
210 |
+ALIGN 16 |
|
211 |
+; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, |
|
212 |
+; uint8_t *src2, int stride, int h) |
|
213 |
+; r0 = void *s = unused, int h = unused (always 8) |
|
214 |
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16 |
|
215 |
+; can simply call this 2x2x (and that's why we access rsp+gprsize |
|
216 |
+; everywhere, which is rsp of calling func) |
|
217 |
+hadamard8x8_diff %+ SUFFIX: |
|
218 |
+ lea r0, [r3*3] |
|
219 |
+ |
|
220 |
+ ; first 4x8 pixels |
|
221 |
+ DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 |
|
222 |
+ HADAMARD8 |
|
223 |
+ mova [rsp+gprsize+0x60], m7 |
|
224 |
+ TRANSPOSE4x4W 0, 1, 2, 3, 7 |
|
225 |
+ STORE4 rsp+gprsize, m0, m1, m2, m3 |
|
226 |
+ mova m7, [rsp+gprsize+0x60] |
|
227 |
+ TRANSPOSE4x4W 4, 5, 6, 7, 0 |
|
228 |
+ STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 |
|
229 |
+ |
|
230 |
+ ; second 4x8 pixels |
|
231 |
+ DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 |
|
232 |
+ HADAMARD8 |
|
233 |
+ mova [rsp+gprsize+0x60], m7 |
|
234 |
+ TRANSPOSE4x4W 0, 1, 2, 3, 7 |
|
235 |
+ STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 |
|
236 |
+ mova m7, [rsp+gprsize+0x60] |
|
237 |
+ TRANSPOSE4x4W 4, 5, 6, 7, 0 |
|
238 |
+ |
|
239 |
+ LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 |
|
240 |
+ HADAMARD8 |
|
241 |
+ ABS_SUM_8x8_32 rsp+gprsize+0x60 |
|
242 |
+ mova [rsp+gprsize+0x60], m0 |
|
243 |
+ |
|
244 |
+ LOAD4 rsp+gprsize , m0, m1, m2, m3 |
|
245 |
+ LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 |
|
246 |
+ HADAMARD8 |
|
247 |
+ ABS_SUM_8x8_32 rsp+gprsize |
|
248 |
+ paddusw m0, [rsp+gprsize+0x60] |
|
249 |
+ |
|
250 |
+ HSUM m0, m1, eax |
|
251 |
+ and rax, 0xFFFF |
|
252 |
+ ret |
|
253 |
+ |
|
254 |
+hadamard8_16_wrapper 0, 14 |
|
255 |
+%endif |
|
256 |
+%endmacro |
|
257 |
+ |
|
258 |
+INIT_MMX mmx |
|
259 |
+HADAMARD8_DIFF |
|
260 |
+ |
|
261 |
+INIT_MMX mmxext |
|
262 |
+HADAMARD8_DIFF |
|
263 |
+ |
|
264 |
+INIT_XMM sse2 |
|
265 |
+%if ARCH_X86_64 |
|
266 |
+%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
|
267 |
+%else |
|
268 |
+%define ABS_SUM_8x8 ABS_SUM_8x8_32 |
|
269 |
+%endif |
|
270 |
+HADAMARD8_DIFF 10 |
|
271 |
+ |
|
272 |
+INIT_XMM ssse3 |
|
273 |
+%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
|
274 |
+HADAMARD8_DIFF 9 |
|
275 |
+ |
|
276 |
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
277 |
+; int line_size, int h) |
|
278 |
+ |
|
279 |
+%macro SUM_SQUARED_ERRORS 1 |
|
280 |
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h |
|
281 |
+%if %1 == mmsize |
|
282 |
+ shr hd, 1 |
|
283 |
+%endif |
|
284 |
+ pxor m0, m0 ; mm0 = 0 |
|
285 |
+ pxor m7, m7 ; mm7 holds the sum |
|
286 |
+ |
|
287 |
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned |
|
288 |
+ movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx |
|
289 |
+ movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx |
|
290 |
+%if %1 == mmsize |
|
291 |
+ movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx |
|
292 |
+ movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx |
|
293 |
+%else ; %1 / 2 == mmsize; mmx only |
|
294 |
+ mova m3, [pix1q+8] ; m3 = pix1[0][8-15] |
|
295 |
+ mova m4, [pix2q+8] ; m4 = pix2[0][8-15] |
|
296 |
+%endif |
|
297 |
+ |
|
298 |
+ ; todo: mm1-mm2, mm3-mm4 |
|
299 |
+ ; algo: subtract mm1 from mm2 with saturation and vice versa |
|
300 |
+ ; OR the result to get the absolute difference |
|
301 |
+ mova m5, m1 |
|
302 |
+ mova m6, m3 |
|
303 |
+ psubusb m1, m2 |
|
304 |
+ psubusb m3, m4 |
|
305 |
+ psubusb m2, m5 |
|
306 |
+ psubusb m4, m6 |
|
307 |
+ |
|
308 |
+ por m2, m1 |
|
309 |
+ por m4, m3 |
|
310 |
+ |
|
311 |
+ ; now convert to 16-bit vectors so we can square them |
|
312 |
+ mova m1, m2 |
|
313 |
+ mova m3, m4 |
|
314 |
+ |
|
315 |
+ punpckhbw m2, m0 |
|
316 |
+ punpckhbw m4, m0 |
|
317 |
+ punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2) |
|
318 |
+ punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4) |
|
319 |
+ |
|
320 |
+ pmaddwd m2, m2 |
|
321 |
+ pmaddwd m4, m4 |
|
322 |
+ pmaddwd m1, m1 |
|
323 |
+ pmaddwd m3, m3 |
|
324 |
+ |
|
325 |
+ paddd m1, m2 |
|
326 |
+ paddd m3, m4 |
|
327 |
+ paddd m7, m1 |
|
328 |
+ paddd m7, m3 |
|
329 |
+ |
|
330 |
+%if %1 == mmsize |
|
331 |
+ lea pix1q, [pix1q + 2*lsizeq] |
|
332 |
+ lea pix2q, [pix2q + 2*lsizeq] |
|
333 |
+%else |
|
334 |
+ add pix1q, lsizeq |
|
335 |
+ add pix2q, lsizeq |
|
336 |
+%endif |
|
337 |
+ dec hd |
|
338 |
+ jnz .next2lines |
|
339 |
+ |
|
340 |
+ HADDD m7, m1 |
|
341 |
+ movd eax, m7 ; return value |
|
342 |
+ RET |
|
343 |
+%endmacro |
|
344 |
+ |
|
345 |
+INIT_MMX mmx |
|
346 |
+SUM_SQUARED_ERRORS 8 |
|
347 |
+ |
|
348 |
+INIT_MMX mmx |
|
349 |
+SUM_SQUARED_ERRORS 16 |
|
350 |
+ |
|
351 |
+INIT_XMM sse2 |
|
352 |
+SUM_SQUARED_ERRORS 16 |
|
353 |
+ |
|
354 |
+;----------------------------------------------- |
|
355 |
+;int ff_sum_abs_dctelem(int16_t *block) |
|
356 |
+;----------------------------------------------- |
|
357 |
+; %1 = number of xmm registers used |
|
358 |
+; %2 = number of inline loops |
|
359 |
+ |
|
360 |
+%macro SUM_ABS_DCTELEM 2 |
|
361 |
+cglobal sum_abs_dctelem, 1, 1, %1, block |
|
362 |
+ pxor m0, m0 |
|
363 |
+ pxor m1, m1 |
|
364 |
+%assign %%i 0 |
|
365 |
+%rep %2 |
|
366 |
+ mova m2, [blockq+mmsize*(0+%%i)] |
|
367 |
+ mova m3, [blockq+mmsize*(1+%%i)] |
|
368 |
+ mova m4, [blockq+mmsize*(2+%%i)] |
|
369 |
+ mova m5, [blockq+mmsize*(3+%%i)] |
|
370 |
+ ABS1_SUM m2, m6, m0 |
|
371 |
+ ABS1_SUM m3, m6, m1 |
|
372 |
+ ABS1_SUM m4, m6, m0 |
|
373 |
+ ABS1_SUM m5, m6, m1 |
|
374 |
+%assign %%i %%i+4 |
|
375 |
+%endrep |
|
376 |
+ paddusw m0, m1 |
|
377 |
+ HSUM m0, m1, eax |
|
378 |
+ and eax, 0xFFFF |
|
379 |
+ RET |
|
380 |
+%endmacro |
|
381 |
+ |
|
382 |
+INIT_MMX mmx |
|
383 |
+SUM_ABS_DCTELEM 0, 4 |
|
384 |
+INIT_MMX mmxext |
|
385 |
+SUM_ABS_DCTELEM 0, 4 |
|
386 |
+INIT_XMM sse2 |
|
387 |
+SUM_ABS_DCTELEM 7, 2 |
|
388 |
+INIT_XMM ssse3 |
|
389 |
+SUM_ABS_DCTELEM 6, 2 |
|
390 |
+ |
|
391 |
+;------------------------------------------------------------------------------ |
|
392 |
+; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h) |
|
393 |
+;------------------------------------------------------------------------------ |
|
394 |
+; %1 = 8/16. %2-5=m# |
|
395 |
+%macro HF_NOISE_PART1 5 |
|
396 |
+ mova m%2, [pix1q] |
|
397 |
+%if %1 == 8 |
|
398 |
+ mova m%3, m%2 |
|
399 |
+ psllq m%2, 8 |
|
400 |
+ psrlq m%3, 8 |
|
401 |
+ psrlq m%2, 8 |
|
402 |
+%else |
|
403 |
+ mova m%3, [pix1q+1] |
|
404 |
+%endif |
|
405 |
+ mova m%4, m%2 |
|
406 |
+ mova m%5, m%3 |
|
407 |
+ punpcklbw m%2, m7 |
|
408 |
+ punpcklbw m%3, m7 |
|
409 |
+ punpckhbw m%4, m7 |
|
410 |
+ punpckhbw m%5, m7 |
|
411 |
+ psubw m%2, m%3 |
|
412 |
+ psubw m%4, m%5 |
|
413 |
+%endmacro |
|
414 |
+ |
|
415 |
+; %1-4 = m# |
|
416 |
+%macro HF_NOISE_PART2 4 |
|
417 |
+ psubw m%1, m%3 |
|
418 |
+ psubw m%2, m%4 |
|
419 |
+ pxor m3, m3 |
|
420 |
+ pxor m1, m1 |
|
421 |
+ pcmpgtw m3, m%1 |
|
422 |
+ pcmpgtw m1, m%2 |
|
423 |
+ pxor m%1, m3 |
|
424 |
+ pxor m%2, m1 |
|
425 |
+ psubw m%1, m3 |
|
426 |
+ psubw m%2, m1 |
|
427 |
+ paddw m%2, m%1 |
|
428 |
+ paddw m6, m%2 |
|
429 |
+%endmacro |
|
430 |
+ |
|
431 |
+; %1 = 8/16 |
|
432 |
+%macro HF_NOISE 1 |
|
433 |
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h |
|
434 |
+ movsxdifnidn lsizeq, lsized |
|
435 |
+ sub hd, 2 |
|
436 |
+ pxor m7, m7 |
|
437 |
+ pxor m6, m6 |
|
438 |
+ HF_NOISE_PART1 %1, 0, 1, 2, 3 |
|
439 |
+ add pix1q, lsizeq |
|
440 |
+ HF_NOISE_PART1 %1, 4, 1, 5, 3 |
|
441 |
+ HF_NOISE_PART2 0, 2, 4, 5 |
|
442 |
+ add pix1q, lsizeq |
|
443 |
+.loop: |
|
444 |
+ HF_NOISE_PART1 %1, 0, 1, 2, 3 |
|
445 |
+ HF_NOISE_PART2 4, 5, 0, 2 |
|
446 |
+ add pix1q, lsizeq |
|
447 |
+ HF_NOISE_PART1 %1, 4, 1, 5, 3 |
|
448 |
+ HF_NOISE_PART2 0, 2, 4, 5 |
|
449 |
+ add pix1q, lsizeq |
|
450 |
+ sub hd, 2 |
|
451 |
+ jne .loop |
|
452 |
+ |
|
453 |
+ mova m0, m6 |
|
454 |
+ punpcklwd m0, m7 |
|
455 |
+ punpckhwd m6, m7 |
|
456 |
+ paddd m6, m0 |
|
457 |
+ mova m0, m6 |
|
458 |
+ psrlq m6, 32 |
|
459 |
+ paddd m0, m6 |
|
460 |
+ movd eax, m0 ; eax = result of hf_noise8; |
|
461 |
+ REP_RET ; return eax; |
|
462 |
+%endmacro |
|
463 |
+ |
|
464 |
+INIT_MMX mmx |
|
465 |
+HF_NOISE 8 |
|
466 |
+HF_NOISE 16 |
0 | 467 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,845 @@ |
0 |
+/* |
|
1 |
+ * SIMD-optimized motion estimation |
|
2 |
+ * Copyright (c) 2000, 2001 Fabrice Bellard |
|
3 |
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
|
4 |
+ * |
|
5 |
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
6 |
+ * |
|
7 |
+ * This file is part of FFmpeg. |
|
8 |
+ * |
|
9 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
10 |
+ * modify it under the terms of the GNU Lesser General Public |
|
11 |
+ * License as published by the Free Software Foundation; either |
|
12 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
13 |
+ * |
|
14 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
15 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
16 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
17 |
+ * Lesser General Public License for more details. |
|
18 |
+ * |
|
19 |
+ * You should have received a copy of the GNU Lesser General Public |
|
20 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
21 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
22 |
+ */ |
|
23 |
+ |
|
24 |
+#include "libavutil/attributes.h" |
|
25 |
+#include "libavutil/cpu.h" |
|
26 |
+#include "libavutil/x86/asm.h" |
|
27 |
+#include "libavutil/x86/cpu.h" |
|
28 |
+#include "libavcodec/me_cmp.h" |
|
29 |
+#include "libavcodec/mpegvideo.h" |
|
30 |
+ |
|
31 |
+int ff_sum_abs_dctelem_mmx(int16_t *block); |
|
32 |
+int ff_sum_abs_dctelem_mmxext(int16_t *block); |
|
33 |
+int ff_sum_abs_dctelem_sse2(int16_t *block); |
|
34 |
+int ff_sum_abs_dctelem_ssse3(int16_t *block); |
|
35 |
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
36 |
+ int line_size, int h); |
|
37 |
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
38 |
+ int line_size, int h); |
|
39 |
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
40 |
+ int line_size, int h); |
|
41 |
+int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h); |
|
42 |
+int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h); |
|
43 |
+ |
|
44 |
+#define hadamard_func(cpu) \ |
|
45 |
+ int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ |
|
46 |
+ uint8_t *src2, int stride, int h); \ |
|
47 |
+ int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ |
|
48 |
+ uint8_t *src2, int stride, int h); |
|
49 |
+ |
|
50 |
+hadamard_func(mmx) |
|
51 |
+hadamard_func(mmxext) |
|
52 |
+hadamard_func(sse2) |
|
53 |
+hadamard_func(ssse3) |
|
54 |
+ |
|
55 |
+#if HAVE_YASM |
|
56 |
+static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, |
|
57 |
+ int line_size, int h) |
|
58 |
+{ |
|
59 |
+ int score1, score2; |
|
60 |
+ |
|
61 |
+ if (c) |
|
62 |
+ score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h); |
|
63 |
+ else |
|
64 |
+ score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); |
|
65 |
+ score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h) |
|
66 |
+ - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h); |
|
67 |
+ |
|
68 |
+ if (c) |
|
69 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
70 |
+ else |
|
71 |
+ return score1 + FFABS(score2) * 8; |
|
72 |
+} |
|
73 |
+ |
|
74 |
+static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, |
|
75 |
+ int line_size, int h) |
|
76 |
+{ |
|
77 |
+ int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); |
|
78 |
+ int score2 = ff_hf_noise8_mmx(pix1, line_size, h) - |
|
79 |
+ ff_hf_noise8_mmx(pix2, line_size, h); |
|
80 |
+ |
|
81 |
+ if (c) |
|
82 |
+ return score1 + FFABS(score2) * c->avctx->nsse_weight; |
|
83 |
+ else |
|
84 |
+ return score1 + FFABS(score2) * 8; |
|
85 |
+} |
|
86 |
+ |
|
87 |
+#endif /* HAVE_YASM */ |
|
88 |
+ |
|
89 |
+#if HAVE_INLINE_ASM |
|
90 |
+ |
|
91 |
+static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, |
|
92 |
+ int line_size, int h) |
|
93 |
+{ |
|
94 |
+ int tmp; |
|
95 |
+ |
|
96 |
+ av_assert2((((int) pix) & 7) == 0); |
|
97 |
+ av_assert2((line_size & 7) == 0); |
|
98 |
+ |
|
99 |
+#define SUM(in0, in1, out0, out1) \ |
|
100 |
+ "movq (%0), %%mm2\n" \ |
|
101 |
+ "movq 8(%0), %%mm3\n" \ |
|
102 |
+ "add %2,%0\n" \ |
|
103 |
+ "movq %%mm2, " #out0 "\n" \ |
|
104 |
+ "movq %%mm3, " #out1 "\n" \ |
|
105 |
+ "psubusb " #in0 ", %%mm2\n" \ |
|
106 |
+ "psubusb " #in1 ", %%mm3\n" \ |
|
107 |
+ "psubusb " #out0 ", " #in0 "\n" \ |
|
108 |
+ "psubusb " #out1 ", " #in1 "\n" \ |
|
109 |
+ "por %%mm2, " #in0 "\n" \ |
|
110 |
+ "por %%mm3, " #in1 "\n" \ |
|
111 |
+ "movq " #in0 ", %%mm2\n" \ |
|
112 |
+ "movq " #in1 ", %%mm3\n" \ |
|
113 |
+ "punpcklbw %%mm7, " #in0 "\n" \ |
|
114 |
+ "punpcklbw %%mm7, " #in1 "\n" \ |
|
115 |
+ "punpckhbw %%mm7, %%mm2\n" \ |
|
116 |
+ "punpckhbw %%mm7, %%mm3\n" \ |
|
117 |
+ "paddw " #in1 ", " #in0 "\n" \ |
|
118 |
+ "paddw %%mm3, %%mm2\n" \ |
|
119 |
+ "paddw %%mm2, " #in0 "\n" \ |
|
120 |
+ "paddw " #in0 ", %%mm6\n" |
|
121 |
+ |
|
122 |
+ |
|
123 |
+ __asm__ volatile ( |
|
124 |
+ "movl %3, %%ecx\n" |
|
125 |
+ "pxor %%mm6, %%mm6\n" |
|
126 |
+ "pxor %%mm7, %%mm7\n" |
|
127 |
+ "movq (%0), %%mm0\n" |
|
128 |
+ "movq 8(%0), %%mm1\n" |
|
129 |
+ "add %2, %0\n" |
|
130 |
+ "jmp 2f\n" |
|
131 |
+ "1:\n" |
|
132 |
+ |
|
133 |
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
134 |
+ "2:\n" |
|
135 |
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
136 |
+ |
|
137 |
+ "subl $2, %%ecx\n" |
|
138 |
+ "jnz 1b\n" |
|
139 |
+ |
|
140 |
+ "movq %%mm6, %%mm0\n" |
|
141 |
+ "psrlq $32, %%mm6\n" |
|
142 |
+ "paddw %%mm6, %%mm0\n" |
|
143 |
+ "movq %%mm0, %%mm6\n" |
|
144 |
+ "psrlq $16, %%mm0\n" |
|
145 |
+ "paddw %%mm6, %%mm0\n" |
|
146 |
+ "movd %%mm0, %1\n" |
|
147 |
+ : "+r" (pix), "=r" (tmp) |
|
148 |
+ : "r" ((x86_reg) line_size), "m" (h) |
|
149 |
+ : "%ecx"); |
|
150 |
+ |
|
151 |
+ return tmp & 0xFFFF; |
|
152 |
+} |
|
153 |
+#undef SUM |
|
154 |
+ |
|
155 |
+static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, |
|
156 |
+ int line_size, int h) |
|
157 |
+{ |
|
158 |
+ int tmp; |
|
159 |
+ |
|
160 |
+ av_assert2((((int) pix) & 7) == 0); |
|
161 |
+ av_assert2((line_size & 7) == 0); |
|
162 |
+ |
|
163 |
+#define SUM(in0, in1, out0, out1) \ |
|
164 |
+ "movq (%0), " #out0 "\n" \ |
|
165 |
+ "movq 8(%0), " #out1 "\n" \ |
|
166 |
+ "add %2, %0\n" \ |
|
167 |
+ "psadbw " #out0 ", " #in0 "\n" \ |
|
168 |
+ "psadbw " #out1 ", " #in1 "\n" \ |
|
169 |
+ "paddw " #in1 ", " #in0 "\n" \ |
|
170 |
+ "paddw " #in0 ", %%mm6\n" |
|
171 |
+ |
|
172 |
+ __asm__ volatile ( |
|
173 |
+ "movl %3, %%ecx\n" |
|
174 |
+ "pxor %%mm6, %%mm6\n" |
|
175 |
+ "pxor %%mm7, %%mm7\n" |
|
176 |
+ "movq (%0), %%mm0\n" |
|
177 |
+ "movq 8(%0), %%mm1\n" |
|
178 |
+ "add %2, %0\n" |
|
179 |
+ "jmp 2f\n" |
|
180 |
+ "1:\n" |
|
181 |
+ |
|
182 |
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
183 |
+ "2:\n" |
|
184 |
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
185 |
+ |
|
186 |
+ "subl $2, %%ecx\n" |
|
187 |
+ "jnz 1b\n" |
|
188 |
+ |
|
189 |
+ "movd %%mm6, %1\n" |
|
190 |
+ : "+r" (pix), "=r" (tmp) |
|
191 |
+ : "r" ((x86_reg) line_size), "m" (h) |
|
192 |
+ : "%ecx"); |
|
193 |
+ |
|
194 |
+ return tmp; |
|
195 |
+} |
|
196 |
+#undef SUM |
|
197 |
+ |
|
198 |
+static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
199 |
+ int line_size, int h) |
|
200 |
+{ |
|
201 |
+ int tmp; |
|
202 |
+ |
|
203 |
+ av_assert2((((int) pix1) & 7) == 0); |
|
204 |
+ av_assert2((((int) pix2) & 7) == 0); |
|
205 |
+ av_assert2((line_size & 7) == 0); |
|
206 |
+ |
|
207 |
+#define SUM(in0, in1, out0, out1) \ |
|
208 |
+ "movq (%0), %%mm2\n" \ |
|
209 |
+ "movq (%1), " #out0 "\n" \ |
|
210 |
+ "movq 8(%0), %%mm3\n" \ |
|
211 |
+ "movq 8(%1), " #out1 "\n" \ |
|
212 |
+ "add %3, %0\n" \ |
|
213 |
+ "add %3, %1\n" \ |
|
214 |
+ "psubb " #out0 ", %%mm2\n" \ |
|
215 |
+ "psubb " #out1 ", %%mm3\n" \ |
|
216 |
+ "pxor %%mm7, %%mm2\n" \ |
|
217 |
+ "pxor %%mm7, %%mm3\n" \ |
|
218 |
+ "movq %%mm2, " #out0 "\n" \ |
|
219 |
+ "movq %%mm3, " #out1 "\n" \ |
|
220 |
+ "psubusb " #in0 ", %%mm2\n" \ |
|
221 |
+ "psubusb " #in1 ", %%mm3\n" \ |
|
222 |
+ "psubusb " #out0 ", " #in0 "\n" \ |
|
223 |
+ "psubusb " #out1 ", " #in1 "\n" \ |
|
224 |
+ "por %%mm2, " #in0 "\n" \ |
|
225 |
+ "por %%mm3, " #in1 "\n" \ |
|
226 |
+ "movq " #in0 ", %%mm2\n" \ |
|
227 |
+ "movq " #in1 ", %%mm3\n" \ |
|
228 |
+ "punpcklbw %%mm7, " #in0 "\n" \ |
|
229 |
+ "punpcklbw %%mm7, " #in1 "\n" \ |
|
230 |
+ "punpckhbw %%mm7, %%mm2\n" \ |
|
231 |
+ "punpckhbw %%mm7, %%mm3\n" \ |
|
232 |
+ "paddw " #in1 ", " #in0 "\n" \ |
|
233 |
+ "paddw %%mm3, %%mm2\n" \ |
|
234 |
+ "paddw %%mm2, " #in0 "\n" \ |
|
235 |
+ "paddw " #in0 ", %%mm6\n" |
|
236 |
+ |
|
237 |
+ |
|
238 |
+ __asm__ volatile ( |
|
239 |
+ "movl %4, %%ecx\n" |
|
240 |
+ "pxor %%mm6, %%mm6\n" |
|
241 |
+ "pcmpeqw %%mm7, %%mm7\n" |
|
242 |
+ "psllw $15, %%mm7\n" |
|
243 |
+ "packsswb %%mm7, %%mm7\n" |
|
244 |
+ "movq (%0), %%mm0\n" |
|
245 |
+ "movq (%1), %%mm2\n" |
|
246 |
+ "movq 8(%0), %%mm1\n" |
|
247 |
+ "movq 8(%1), %%mm3\n" |
|
248 |
+ "add %3, %0\n" |
|
249 |
+ "add %3, %1\n" |
|
250 |
+ "psubb %%mm2, %%mm0\n" |
|
251 |
+ "psubb %%mm3, %%mm1\n" |
|
252 |
+ "pxor %%mm7, %%mm0\n" |
|
253 |
+ "pxor %%mm7, %%mm1\n" |
|
254 |
+ "jmp 2f\n" |
|
255 |
+ "1:\n" |
|
256 |
+ |
|
257 |
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
258 |
+ "2:\n" |
|
259 |
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
260 |
+ |
|
261 |
+ "subl $2, %%ecx\n" |
|
262 |
+ "jnz 1b\n" |
|
263 |
+ |
|
264 |
+ "movq %%mm6, %%mm0\n" |
|
265 |
+ "psrlq $32, %%mm6\n" |
|
266 |
+ "paddw %%mm6, %%mm0\n" |
|
267 |
+ "movq %%mm0, %%mm6\n" |
|
268 |
+ "psrlq $16, %%mm0\n" |
|
269 |
+ "paddw %%mm6, %%mm0\n" |
|
270 |
+ "movd %%mm0, %2\n" |
|
271 |
+ : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
272 |
+ : "r" ((x86_reg) line_size), "m" (h) |
|
273 |
+ : "%ecx"); |
|
274 |
+ |
|
275 |
+ return tmp & 0x7FFF; |
|
276 |
+} |
|
277 |
+#undef SUM |
|
278 |
+ |
|
279 |
+static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
|
280 |
+ int line_size, int h) |
|
281 |
+{ |
|
282 |
+ int tmp; |
|
283 |
+ |
|
284 |
+ av_assert2((((int) pix1) & 7) == 0); |
|
285 |
+ av_assert2((((int) pix2) & 7) == 0); |
|
286 |
+ av_assert2((line_size & 7) == 0); |
|
287 |
+ |
|
288 |
+#define SUM(in0, in1, out0, out1) \ |
|
289 |
+ "movq (%0), " #out0 "\n" \ |
|
290 |
+ "movq (%1), %%mm2\n" \ |
|
291 |
+ "movq 8(%0), " #out1 "\n" \ |
|
292 |
+ "movq 8(%1), %%mm3\n" \ |
|
293 |
+ "add %3, %0\n" \ |
|
294 |
+ "add %3, %1\n" \ |
|
295 |
+ "psubb %%mm2, " #out0 "\n" \ |
|
296 |
+ "psubb %%mm3, " #out1 "\n" \ |
|
297 |
+ "pxor %%mm7, " #out0 "\n" \ |
|
298 |
+ "pxor %%mm7, " #out1 "\n" \ |
|
299 |
+ "psadbw " #out0 ", " #in0 "\n" \ |
|
300 |
+ "psadbw " #out1 ", " #in1 "\n" \ |
|
301 |
+ "paddw " #in1 ", " #in0 "\n" \ |
|
302 |
+ "paddw " #in0 ", %%mm6\n " |
|
303 |
+ |
|
304 |
+ __asm__ volatile ( |
|
305 |
+ "movl %4, %%ecx\n" |
|
306 |
+ "pxor %%mm6, %%mm6\n" |
|
307 |
+ "pcmpeqw %%mm7, %%mm7\n" |
|
308 |
+ "psllw $15, %%mm7\n" |
|
309 |
+ "packsswb %%mm7, %%mm7\n" |
|
310 |
+ "movq (%0), %%mm0\n" |
|
311 |
+ "movq (%1), %%mm2\n" |
|
312 |
+ "movq 8(%0), %%mm1\n" |
|
313 |
+ "movq 8(%1), %%mm3\n" |
|
314 |
+ "add %3, %0\n" |
|
315 |
+ "add %3, %1\n" |
|
316 |
+ "psubb %%mm2, %%mm0\n" |
|
317 |
+ "psubb %%mm3, %%mm1\n" |
|
318 |
+ "pxor %%mm7, %%mm0\n" |
|
319 |
+ "pxor %%mm7, %%mm1\n" |
|
320 |
+ "jmp 2f\n" |
|
321 |
+ "1:\n" |
|
322 |
+ |
|
323 |
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
|
324 |
+ "2:\n" |
|
325 |
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
|
326 |
+ |
|
327 |
+ "subl $2, %%ecx\n" |
|
328 |
+ "jnz 1b\n" |
|
329 |
+ |
|
330 |
+ "movd %%mm6, %2\n" |
|
331 |
+ : "+r" (pix1), "+r" (pix2), "=r" (tmp) |
|
332 |
+ : "r" ((x86_reg) line_size), "m" (h) |
|
333 |
+ : "%ecx"); |
|
334 |
+ |
|
335 |
+ return tmp; |
|
336 |
+} |
|
337 |
+#undef SUM |
|
338 |
+ |
|
339 |
+ |
|
340 |
+ |
|
341 |
+DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { |
|
342 |
+ 0x0000000000000000ULL, |
|
343 |
+ 0x0001000100010001ULL, |
|
344 |
+ 0x0002000200020002ULL, |
|
345 |
+}; |
|
346 |
+ |
|
347 |
+DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; |
|
348 |
+ |
|
349 |
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
350 |
+{ |
|
351 |
+ x86_reg len = -(x86_reg)stride * h; |
|
352 |
+ __asm__ volatile ( |
|
353 |
+ ".p2align 4 \n\t" |
|
354 |
+ "1: \n\t" |
|
355 |
+ "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
356 |
+ "movq (%2, %%"REG_a"), %%mm2 \n\t" |
|
357 |
+ "movq (%2, %%"REG_a"), %%mm4 \n\t" |
|
358 |
+ "add %3, %%"REG_a" \n\t" |
|
359 |
+ "psubusb %%mm0, %%mm2 \n\t" |
|
360 |
+ "psubusb %%mm4, %%mm0 \n\t" |
|
361 |
+ "movq (%1, %%"REG_a"), %%mm1 \n\t" |
|
362 |
+ "movq (%2, %%"REG_a"), %%mm3 \n\t" |
|
363 |
+ "movq (%2, %%"REG_a"), %%mm5 \n\t" |
|
364 |
+ "psubusb %%mm1, %%mm3 \n\t" |
|
365 |
+ "psubusb %%mm5, %%mm1 \n\t" |
|
366 |
+ "por %%mm2, %%mm0 \n\t" |
|
367 |
+ "por %%mm1, %%mm3 \n\t" |
|
368 |
+ "movq %%mm0, %%mm1 \n\t" |
|
369 |
+ "movq %%mm3, %%mm2 \n\t" |
|
370 |
+ "punpcklbw %%mm7, %%mm0 \n\t" |
|
371 |
+ "punpckhbw %%mm7, %%mm1 \n\t" |
|
372 |
+ "punpcklbw %%mm7, %%mm3 \n\t" |
|
373 |
+ "punpckhbw %%mm7, %%mm2 \n\t" |
|
374 |
+ "paddw %%mm1, %%mm0 \n\t" |
|
375 |
+ "paddw %%mm3, %%mm2 \n\t" |
|
376 |
+ "paddw %%mm2, %%mm0 \n\t" |
|
377 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
378 |
+ "add %3, %%"REG_a" \n\t" |
|
379 |
+ " js 1b \n\t" |
|
380 |
+ : "+a" (len) |
|
381 |
+ : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride)); |
|
382 |
+} |
|
383 |
+ |
|
384 |
+static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
385 |
+ int stride, int h) |
|
386 |
+{ |
|
387 |
+ __asm__ volatile ( |
|
388 |
+ ".p2align 4 \n\t" |
|
389 |
+ "1: \n\t" |
|
390 |
+ "movq (%1), %%mm0 \n\t" |
|
391 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
392 |
+ "psadbw (%2), %%mm0 \n\t" |
|
393 |
+ "psadbw (%2, %3), %%mm1 \n\t" |
|
394 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
395 |
+ "paddw %%mm1, %%mm6 \n\t" |
|
396 |
+ "lea (%1,%3,2), %1 \n\t" |
|
397 |
+ "lea (%2,%3,2), %2 \n\t" |
|
398 |
+ "sub $2, %0 \n\t" |
|
399 |
+ " jg 1b \n\t" |
|
400 |
+ : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
401 |
+ : "r" ((x86_reg) stride)); |
|
402 |
+} |
|
403 |
+ |
|
404 |
+static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, |
|
405 |
+ int stride, int h) |
|
406 |
+{ |
|
407 |
+ int ret; |
|
408 |
+ __asm__ volatile ( |
|
409 |
+ "pxor %%xmm2, %%xmm2 \n\t" |
|
410 |
+ ".p2align 4 \n\t" |
|
411 |
+ "1: \n\t" |
|
412 |
+ "movdqu (%1), %%xmm0 \n\t" |
|
413 |
+ "movdqu (%1, %4), %%xmm1 \n\t" |
|
414 |
+ "psadbw (%2), %%xmm0 \n\t" |
|
415 |
+ "psadbw (%2, %4), %%xmm1 \n\t" |
|
416 |
+ "paddw %%xmm0, %%xmm2 \n\t" |
|
417 |
+ "paddw %%xmm1, %%xmm2 \n\t" |
|
418 |
+ "lea (%1,%4,2), %1 \n\t" |
|
419 |
+ "lea (%2,%4,2), %2 \n\t" |
|
420 |
+ "sub $2, %0 \n\t" |
|
421 |
+ " jg 1b \n\t" |
|
422 |
+ "movhlps %%xmm2, %%xmm0 \n\t" |
|
423 |
+ "paddw %%xmm0, %%xmm2 \n\t" |
|
424 |
+ "movd %%xmm2, %3 \n\t" |
|
425 |
+ : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) |
|
426 |
+ : "r" ((x86_reg) stride)); |
|
427 |
+ return ret; |
|
428 |
+} |
|
429 |
+ |
|
430 |
+static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
431 |
+ int stride, int h) |
|
432 |
+{ |
|
433 |
+ __asm__ volatile ( |
|
434 |
+ ".p2align 4 \n\t" |
|
435 |
+ "1: \n\t" |
|
436 |
+ "movq (%1), %%mm0 \n\t" |
|
437 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
438 |
+ "pavgb 1(%1), %%mm0 \n\t" |
|
439 |
+ "pavgb 1(%1, %3), %%mm1 \n\t" |
|
440 |
+ "psadbw (%2), %%mm0 \n\t" |
|
441 |
+ "psadbw (%2, %3), %%mm1 \n\t" |
|
442 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
443 |
+ "paddw %%mm1, %%mm6 \n\t" |
|
444 |
+ "lea (%1,%3,2), %1 \n\t" |
|
445 |
+ "lea (%2,%3,2), %2 \n\t" |
|
446 |
+ "sub $2, %0 \n\t" |
|
447 |
+ " jg 1b \n\t" |
|
448 |
+ : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
449 |
+ : "r" ((x86_reg) stride)); |
|
450 |
+} |
|
451 |
+ |
|
452 |
+static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
453 |
+ int stride, int h) |
|
454 |
+{ |
|
455 |
+ __asm__ volatile ( |
|
456 |
+ "movq (%1), %%mm0 \n\t" |
|
457 |
+ "add %3, %1 \n\t" |
|
458 |
+ ".p2align 4 \n\t" |
|
459 |
+ "1: \n\t" |
|
460 |
+ "movq (%1), %%mm1 \n\t" |
|
461 |
+ "movq (%1, %3), %%mm2 \n\t" |
|
462 |
+ "pavgb %%mm1, %%mm0 \n\t" |
|
463 |
+ "pavgb %%mm2, %%mm1 \n\t" |
|
464 |
+ "psadbw (%2), %%mm0 \n\t" |
|
465 |
+ "psadbw (%2, %3), %%mm1 \n\t" |
|
466 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
467 |
+ "paddw %%mm1, %%mm6 \n\t" |
|
468 |
+ "movq %%mm2, %%mm0 \n\t" |
|
469 |
+ "lea (%1,%3,2), %1 \n\t" |
|
470 |
+ "lea (%2,%3,2), %2 \n\t" |
|
471 |
+ "sub $2, %0 \n\t" |
|
472 |
+ " jg 1b \n\t" |
|
473 |
+ : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
474 |
+ : "r" ((x86_reg) stride)); |
|
475 |
+} |
|
476 |
+ |
|
477 |
+static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, |
|
478 |
+ int stride, int h) |
|
479 |
+{ |
|
480 |
+ __asm__ volatile ( |
|
481 |
+ "movq "MANGLE(bone)", %%mm5 \n\t" |
|
482 |
+ "movq (%1), %%mm0 \n\t" |
|
483 |
+ "pavgb 1(%1), %%mm0 \n\t" |
|
484 |
+ "add %3, %1 \n\t" |
|
485 |
+ ".p2align 4 \n\t" |
|
486 |
+ "1: \n\t" |
|
487 |
+ "movq (%1), %%mm1 \n\t" |
|
488 |
+ "movq (%1,%3), %%mm2 \n\t" |
|
489 |
+ "pavgb 1(%1), %%mm1 \n\t" |
|
490 |
+ "pavgb 1(%1,%3), %%mm2 \n\t" |
|
491 |
+ "psubusb %%mm5, %%mm1 \n\t" |
|
492 |
+ "pavgb %%mm1, %%mm0 \n\t" |
|
493 |
+ "pavgb %%mm2, %%mm1 \n\t" |
|
494 |
+ "psadbw (%2), %%mm0 \n\t" |
|
495 |
+ "psadbw (%2,%3), %%mm1 \n\t" |
|
496 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
497 |
+ "paddw %%mm1, %%mm6 \n\t" |
|
498 |
+ "movq %%mm2, %%mm0 \n\t" |
|
499 |
+ "lea (%1,%3,2), %1 \n\t" |
|
500 |
+ "lea (%2,%3,2), %2 \n\t" |
|
501 |
+ "sub $2, %0 \n\t" |
|
502 |
+ " jg 1b \n\t" |
|
503 |
+ : "+r" (h), "+r" (blk1), "+r" (blk2) |
|
504 |
+ : "r" ((x86_reg) stride) |
|
505 |
+ NAMED_CONSTRAINTS_ADD(bone)); |
|
506 |
+} |
|
507 |
+ |
|
508 |
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, |
|
509 |
+ int stride, int h) |
|
510 |
+{ |
|
511 |
+ x86_reg len = -(x86_reg)stride * h; |
|
512 |
+ __asm__ volatile ( |
|
513 |
+ ".p2align 4 \n\t" |
|
514 |
+ "1: \n\t" |
|
515 |
+ "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
516 |
+ "movq (%2, %%"REG_a"), %%mm1 \n\t" |
|
517 |
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" |
|
518 |
+ "movq (%2, %%"REG_a"), %%mm3 \n\t" |
|
519 |
+ "punpcklbw %%mm7, %%mm0 \n\t" |
|
520 |
+ "punpcklbw %%mm7, %%mm1 \n\t" |
|
521 |
+ "punpckhbw %%mm7, %%mm2 \n\t" |
|
522 |
+ "punpckhbw %%mm7, %%mm3 \n\t" |
|
523 |
+ "paddw %%mm0, %%mm1 \n\t" |
|
524 |
+ "paddw %%mm2, %%mm3 \n\t" |
|
525 |
+ "movq (%3, %%"REG_a"), %%mm4 \n\t" |
|
526 |
+ "movq (%3, %%"REG_a"), %%mm2 \n\t" |
|
527 |
+ "paddw %%mm5, %%mm1 \n\t" |
|
528 |
+ "paddw %%mm5, %%mm3 \n\t" |
|
529 |
+ "psrlw $1, %%mm1 \n\t" |
|
530 |
+ "psrlw $1, %%mm3 \n\t" |
|
531 |
+ "packuswb %%mm3, %%mm1 \n\t" |
|
532 |
+ "psubusb %%mm1, %%mm4 \n\t" |
|
533 |
+ "psubusb %%mm2, %%mm1 \n\t" |
|
534 |
+ "por %%mm4, %%mm1 \n\t" |
|
535 |
+ "movq %%mm1, %%mm0 \n\t" |
|
536 |
+ "punpcklbw %%mm7, %%mm0 \n\t" |
|
537 |
+ "punpckhbw %%mm7, %%mm1 \n\t" |
|
538 |
+ "paddw %%mm1, %%mm0 \n\t" |
|
539 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
540 |
+ "add %4, %%"REG_a" \n\t" |
|
541 |
+ " js 1b \n\t" |
|
542 |
+ : "+a" (len) |
|
543 |
+ : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), |
|
544 |
+ "r" ((x86_reg) stride)); |
|
545 |
+} |
|
546 |
+ |
|
547 |
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
548 |
+{ |
|
549 |
+ x86_reg len = -(x86_reg)stride * h; |
|
550 |
+ __asm__ volatile ( |
|
551 |
+ "movq (%1, %%"REG_a"), %%mm0 \n\t" |
|
552 |
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
|
553 |
+ "movq %%mm0, %%mm1 \n\t" |
|
554 |
+ "movq %%mm2, %%mm3 \n\t" |
|
555 |
+ "punpcklbw %%mm7, %%mm0 \n\t" |
|
556 |
+ "punpckhbw %%mm7, %%mm1 \n\t" |
|
557 |
+ "punpcklbw %%mm7, %%mm2 \n\t" |
|
558 |
+ "punpckhbw %%mm7, %%mm3 \n\t" |
|
559 |
+ "paddw %%mm2, %%mm0 \n\t" |
|
560 |
+ "paddw %%mm3, %%mm1 \n\t" |
|
561 |
+ ".p2align 4 \n\t" |
|
562 |
+ "1: \n\t" |
|
563 |
+ "movq (%2, %%"REG_a"), %%mm2 \n\t" |
|
564 |
+ "movq 1(%2, %%"REG_a"), %%mm4 \n\t" |
|
565 |
+ "movq %%mm2, %%mm3 \n\t" |
|
566 |
+ "movq %%mm4, %%mm5 \n\t" |
|
567 |
+ "punpcklbw %%mm7, %%mm2 \n\t" |
|
568 |
+ "punpckhbw %%mm7, %%mm3 \n\t" |
|
569 |
+ "punpcklbw %%mm7, %%mm4 \n\t" |
|
570 |
+ "punpckhbw %%mm7, %%mm5 \n\t" |
|
571 |
+ "paddw %%mm4, %%mm2 \n\t" |
|
572 |
+ "paddw %%mm5, %%mm3 \n\t" |
|
573 |
+ "movq %5, %%mm5 \n\t" |
|
574 |
+ "paddw %%mm2, %%mm0 \n\t" |
|
575 |
+ "paddw %%mm3, %%mm1 \n\t" |
|
576 |
+ "paddw %%mm5, %%mm0 \n\t" |
|
577 |
+ "paddw %%mm5, %%mm1 \n\t" |
|
578 |
+ "movq (%3, %%"REG_a"), %%mm4 \n\t" |
|
579 |
+ "movq (%3, %%"REG_a"), %%mm5 \n\t" |
|
580 |
+ "psrlw $2, %%mm0 \n\t" |
|
581 |
+ "psrlw $2, %%mm1 \n\t" |
|
582 |
+ "packuswb %%mm1, %%mm0 \n\t" |
|
583 |
+ "psubusb %%mm0, %%mm4 \n\t" |
|
584 |
+ "psubusb %%mm5, %%mm0 \n\t" |
|
585 |
+ "por %%mm4, %%mm0 \n\t" |
|
586 |
+ "movq %%mm0, %%mm4 \n\t" |
|
587 |
+ "punpcklbw %%mm7, %%mm0 \n\t" |
|
588 |
+ "punpckhbw %%mm7, %%mm4 \n\t" |
|
589 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
590 |
+ "paddw %%mm4, %%mm6 \n\t" |
|
591 |
+ "movq %%mm2, %%mm0 \n\t" |
|
592 |
+ "movq %%mm3, %%mm1 \n\t" |
|
593 |
+ "add %4, %%"REG_a" \n\t" |
|
594 |
+ " js 1b \n\t" |
|
595 |
+ : "+a" (len) |
|
596 |
+ : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), |
|
597 |
+ "r" ((x86_reg) stride), "m" (round_tab[2])); |
|
598 |
+} |
|
599 |
+ |
|
600 |
+static inline int sum_mmx(void) |
|
601 |
+{ |
|
602 |
+ int ret; |
|
603 |
+ __asm__ volatile ( |
|
604 |
+ "movq %%mm6, %%mm0 \n\t" |
|
605 |
+ "psrlq $32, %%mm6 \n\t" |
|
606 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
607 |
+ "movq %%mm6, %%mm0 \n\t" |
|
608 |
+ "psrlq $16, %%mm6 \n\t" |
|
609 |
+ "paddw %%mm0, %%mm6 \n\t" |
|
610 |
+ "movd %%mm6, %0 \n\t" |
|
611 |
+ : "=r" (ret)); |
|
612 |
+ return ret & 0xFFFF; |
|
613 |
+} |
|
614 |
+ |
|
615 |
+static inline int sum_mmxext(void) |
|
616 |
+{ |
|
617 |
+ int ret; |
|
618 |
+ __asm__ volatile ( |
|
619 |
+ "movd %%mm6, %0 \n\t" |
|
620 |
+ : "=r" (ret)); |
|
621 |
+ return ret; |
|
622 |
+} |
|
623 |
+ |
|
624 |
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
625 |
+{ |
|
626 |
+ sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h); |
|
627 |
+} |
|
628 |
+ |
|
629 |
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|
630 |
+{ |
|
631 |
+ sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h); |
|
632 |
+} |
|
633 |
+ |
|
634 |
+#define PIX_SAD(suf) \ |
|
635 |
+static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
636 |
+ uint8_t *blk1, int stride, int h) \ |
|
637 |
+{ \ |
|
638 |
+ av_assert2(h == 8); \ |
|
639 |
+ __asm__ volatile ( \ |
|
640 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
641 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
642 |
+ :); \ |
|
643 |
+ \ |
|
644 |
+ sad8_1_ ## suf(blk1, blk2, stride, 8); \ |
|
645 |
+ \ |
|
646 |
+ return sum_ ## suf(); \ |
|
647 |
+} \ |
|
648 |
+ \ |
|
649 |
+static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
650 |
+ uint8_t *blk1, int stride, int h) \ |
|
651 |
+{ \ |
|
652 |
+ av_assert2(h == 8); \ |
|
653 |
+ __asm__ volatile ( \ |
|
654 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
655 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
656 |
+ "movq %0, %%mm5 \n\t" \ |
|
657 |
+ :: "m" (round_tab[1])); \ |
|
658 |
+ \ |
|
659 |
+ sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ |
|
660 |
+ \ |
|
661 |
+ return sum_ ## suf(); \ |
|
662 |
+} \ |
|
663 |
+ \ |
|
664 |
+static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
665 |
+ uint8_t *blk1, int stride, int h) \ |
|
666 |
+{ \ |
|
667 |
+ av_assert2(h == 8); \ |
|
668 |
+ __asm__ volatile ( \ |
|
669 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
670 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
671 |
+ "movq %0, %%mm5 \n\t" \ |
|
672 |
+ :: "m" (round_tab[1])); \ |
|
673 |
+ \ |
|
674 |
+ sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ |
|
675 |
+ \ |
|
676 |
+ return sum_ ## suf(); \ |
|
677 |
+} \ |
|
678 |
+ \ |
|
679 |
+static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
680 |
+ uint8_t *blk1, int stride, int h) \ |
|
681 |
+{ \ |
|
682 |
+ av_assert2(h == 8); \ |
|
683 |
+ __asm__ volatile ( \ |
|
684 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
685 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
686 |
+ ::); \ |
|
687 |
+ \ |
|
688 |
+ sad8_4_ ## suf(blk1, blk2, stride, 8); \ |
|
689 |
+ \ |
|
690 |
+ return sum_ ## suf(); \ |
|
691 |
+} \ |
|
692 |
+ \ |
|
693 |
+static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
694 |
+ uint8_t *blk1, int stride, int h) \ |
|
695 |
+{ \ |
|
696 |
+ __asm__ volatile ( \ |
|
697 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
698 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
699 |
+ :); \ |
|
700 |
+ \ |
|
701 |
+ sad8_1_ ## suf(blk1, blk2, stride, h); \ |
|
702 |
+ sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
703 |
+ \ |
|
704 |
+ return sum_ ## suf(); \ |
|
705 |
+} \ |
|
706 |
+ \ |
|
707 |
+static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
708 |
+ uint8_t *blk1, int stride, int h) \ |
|
709 |
+{ \ |
|
710 |
+ __asm__ volatile ( \ |
|
711 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
712 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
713 |
+ "movq %0, %%mm5 \n\t" \ |
|
714 |
+ :: "m" (round_tab[1])); \ |
|
715 |
+ \ |
|
716 |
+ sad8_x2a_ ## suf(blk1, blk2, stride, h); \ |
|
717 |
+ sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
718 |
+ \ |
|
719 |
+ return sum_ ## suf(); \ |
|
720 |
+} \ |
|
721 |
+ \ |
|
722 |
+static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
723 |
+ uint8_t *blk1, int stride, int h) \ |
|
724 |
+{ \ |
|
725 |
+ __asm__ volatile ( \ |
|
726 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
727 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
728 |
+ "movq %0, %%mm5 \n\t" \ |
|
729 |
+ :: "m" (round_tab[1])); \ |
|
730 |
+ \ |
|
731 |
+ sad8_y2a_ ## suf(blk1, blk2, stride, h); \ |
|
732 |
+ sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
733 |
+ \ |
|
734 |
+ return sum_ ## suf(); \ |
|
735 |
+} \ |
|
736 |
+ \ |
|
737 |
+static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ |
|
738 |
+ uint8_t *blk1, int stride, int h) \ |
|
739 |
+{ \ |
|
740 |
+ __asm__ volatile ( \ |
|
741 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
742 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
743 |
+ ::); \ |
|
744 |
+ \ |
|
745 |
+ sad8_4_ ## suf(blk1, blk2, stride, h); \ |
|
746 |
+ sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ |
|
747 |
+ \ |
|
748 |
+ return sum_ ## suf(); \ |
|
749 |
+} \ |
|
750 |
+ |
|
751 |
+PIX_SAD(mmx) |
|
752 |
+PIX_SAD(mmxext) |
|
753 |
+ |
|
754 |
+#endif /* HAVE_INLINE_ASM */ |
|
755 |
+ |
|
756 |
+av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) |
|
757 |
+{ |
|
758 |
+ int cpu_flags = av_get_cpu_flags(); |
|
759 |
+ |
|
760 |
+#if HAVE_INLINE_ASM |
|
761 |
+ if (INLINE_MMX(cpu_flags)) { |
|
762 |
+ c->pix_abs[0][0] = sad16_mmx; |
|
763 |
+ c->pix_abs[0][1] = sad16_x2_mmx; |
|
764 |
+ c->pix_abs[0][2] = sad16_y2_mmx; |
|
765 |
+ c->pix_abs[0][3] = sad16_xy2_mmx; |
|
766 |
+ c->pix_abs[1][0] = sad8_mmx; |
|
767 |
+ c->pix_abs[1][1] = sad8_x2_mmx; |
|
768 |
+ c->pix_abs[1][2] = sad8_y2_mmx; |
|
769 |
+ c->pix_abs[1][3] = sad8_xy2_mmx; |
|
770 |
+ |
|
771 |
+ c->sad[0] = sad16_mmx; |
|
772 |
+ c->sad[1] = sad8_mmx; |
|
773 |
+ |
|
774 |
+ c->vsad[4] = vsad_intra16_mmx; |
|
775 |
+ |
|
776 |
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
777 |
+ c->vsad[0] = vsad16_mmx; |
|
778 |
+ } |
|
779 |
+ } |
|
780 |
+ |
|
781 |
+ if (INLINE_MMXEXT(cpu_flags)) { |
|
782 |
+ c->vsad[4] = vsad_intra16_mmxext; |
|
783 |
+ |
|
784 |
+ c->pix_abs[0][0] = sad16_mmxext; |
|
785 |
+ c->pix_abs[1][0] = sad8_mmxext; |
|
786 |
+ |
|
787 |
+ c->sad[0] = sad16_mmxext; |
|
788 |
+ c->sad[1] = sad8_mmxext; |
|
789 |
+ |
|
790 |
+ c->pix_abs[0][1] = sad16_x2_mmxext; |
|
791 |
+ c->pix_abs[0][2] = sad16_y2_mmxext; |
|
792 |
+ c->pix_abs[1][1] = sad8_x2_mmxext; |
|
793 |
+ c->pix_abs[1][2] = sad8_y2_mmxext; |
|
794 |
+ |
|
795 |
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
|
796 |
+ c->pix_abs[0][3] = sad16_xy2_mmxext; |
|
797 |
+ c->pix_abs[1][3] = sad8_xy2_mmxext; |
|
798 |
+ |
|
799 |
+ c->vsad[0] = vsad16_mmxext; |
|
800 |
+ } |
|
801 |
+ } |
|
802 |
+ |
|
803 |
+ if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { |
|
804 |
+ c->sad[0] = sad16_sse2; |
|
805 |
+ } |
|
806 |
+ |
|
807 |
+#endif /* HAVE_INLINE_ASM */ |
|
808 |
+ |
|
809 |
+ if (EXTERNAL_MMX(cpu_flags)) { |
|
810 |
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; |
|
811 |
+ c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; |
|
812 |
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; |
|
813 |
+ c->sse[0] = ff_sse16_mmx; |
|
814 |
+ c->sse[1] = ff_sse8_mmx; |
|
815 |
+#if HAVE_YASM |
|
816 |
+ c->nsse[0] = nsse16_mmx; |
|
817 |
+ c->nsse[1] = nsse8_mmx; |
|
818 |
+#endif |
|
819 |
+ } |
|
820 |
+ |
|
821 |
+ if (EXTERNAL_MMXEXT(cpu_flags)) { |
|
822 |
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; |
|
823 |
+ c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; |
|
824 |
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext; |
|
825 |
+ } |
|
826 |
+ |
|
827 |
+ if (EXTERNAL_SSE2(cpu_flags)) { |
|
828 |
+ c->sse[0] = ff_sse16_sse2; |
|
829 |
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; |
|
830 |
+ |
|
831 |
+#if HAVE_ALIGNED_STACK |
|
832 |
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; |
|
833 |
+ c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; |
|
834 |
+#endif |
|
835 |
+ } |
|
836 |
+ |
|
837 |
+ if (EXTERNAL_SSSE3(cpu_flags)) { |
|
838 |
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; |
|
839 |
+#if HAVE_ALIGNED_STACK |
|
840 |
+ c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; |
|
841 |
+ c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; |
|
842 |
+#endif |
|
843 |
+ } |
|
844 |
+} |
... | ... |
@@ -24,7 +24,7 @@ |
24 | 24 |
|
25 | 25 |
#include "config.h" |
26 | 26 |
#include "avfilter.h" |
27 |
-#include "libavcodec/dsputil.h" |
|
27 |
+#include "libavcodec/me_cmp.h" |
|
28 | 28 |
#include "transform.h" |
29 | 29 |
#if CONFIG_OPENCL |
30 | 30 |
#include "libavutil/opencl.h" |
... | ... |
@@ -81,7 +81,7 @@ typedef struct { |
81 | 81 |
int contrast; ///< Contrast threshold |
82 | 82 |
int search; ///< Motion search method |
83 | 83 |
AVCodecContext *avctx; |
84 |
- DSPContext c; ///< Context providing optimized SAD methods |
|
84 |
+ MECmpContext c; ///< Context providing optimized SAD methods |
|
85 | 85 |
Transform last; ///< Transform from last frame |
86 | 86 |
int refcount; ///< Number of reference frames (defines averaging window) |
87 | 87 |
FILE *fp; |
... | ... |
@@ -35,7 +35,7 @@ |
35 | 35 |
#include "video.h" |
36 | 36 |
|
37 | 37 |
#if CONFIG_AVCODEC |
38 |
-#include "libavcodec/dsputil.h" |
|
38 |
+#include "libavcodec/me_cmp.h" |
|
39 | 39 |
#endif |
40 | 40 |
|
41 | 41 |
static const char *const var_names[] = { |
... | ... |
@@ -146,7 +146,7 @@ typedef struct SelectContext { |
146 | 146 |
int do_scene_detect; ///< 1 if the expression requires scene detection variables, 0 otherwise |
147 | 147 |
#if CONFIG_AVCODEC |
148 | 148 |
AVCodecContext *avctx; ///< codec context required for the DSPContext (scene detect only) |
149 |
- DSPContext c; ///< context providing optimized SAD methods (scene detect only) |
|
149 |
+ MECmpContext c; ///< context providing optimized SAD methods (scene detect only) |
|
150 | 150 |
double prev_mafd; ///< previous MAFD (scene detect only) |
151 | 151 |
#endif |
152 | 152 |
AVFrame *prev_picref; ///< previous frame (scene detect only) |
... | ... |
@@ -245,7 +245,7 @@ static int config_input(AVFilterLink *inlink) |
245 | 245 |
select->avctx = avcodec_alloc_context3(NULL); |
246 | 246 |
if (!select->avctx) |
247 | 247 |
return AVERROR(ENOMEM); |
248 |
- avpriv_dsputil_init(&select->c, select->avctx); |
|
248 |
+ ff_me_cmp_init(&select->c, select->avctx); |
|
249 | 249 |
} |
250 | 250 |
#endif |
251 | 251 |
return 0; |
... | ... |
@@ -57,7 +57,7 @@ |
57 | 57 |
#include "libavutil/mem.h" |
58 | 58 |
#include "libavutil/opt.h" |
59 | 59 |
#include "libavutil/pixdesc.h" |
60 |
-#include "libavcodec/dsputil.h" |
|
60 |
+#include "libavcodec/me_cmp.h" |
|
61 | 61 |
|
62 | 62 |
#include "deshake.h" |
63 | 63 |
#include "deshake_opencl.h" |
... | ... |
@@ -414,7 +414,7 @@ static int config_props(AVFilterLink *link) |
414 | 414 |
deshake->last.zoom = 0; |
415 | 415 |
|
416 | 416 |
deshake->avctx = avcodec_alloc_context3(NULL); |
417 |
- avpriv_dsputil_init(&deshake->c, deshake->avctx); |
|
417 |
+ ff_me_cmp_init(&deshake->c, deshake->avctx); |
|
418 | 418 |
|
419 | 419 |
return 0; |
420 | 420 |
} |
... | ... |
@@ -27,7 +27,7 @@ |
27 | 27 |
#include "libavutil/opt.h" |
28 | 28 |
#include "libavutil/pixdesc.h" |
29 | 29 |
#include "libavutil/timestamp.h" |
30 |
-#include "libavcodec/dsputil.h" |
|
30 |
+#include "libavcodec/me_cmp.h" |
|
31 | 31 |
#include "libavcodec/pixblockdsp.h" |
32 | 32 |
#include "avfilter.h" |
33 | 33 |
#include "internal.h" |
... | ... |
@@ -49,7 +49,7 @@ typedef struct { |
49 | 49 |
|
50 | 50 |
int hsub, vsub; ///< chroma subsampling values |
51 | 51 |
AVFrame *ref; ///< reference picture |
52 |
- DSPContext dspctx; ///< context providing optimized diff routines |
|
52 |
+ MECmpContext mecc; ///< context providing optimized diff routines |
|
53 | 53 |
PixblockDSPContext pdsp; |
54 | 54 |
AVCodecContext *avctx; ///< codec context required for the DSPContext |
55 | 55 |
} DecimateContext; |
... | ... |
@@ -76,7 +76,7 @@ static int diff_planes(AVFilterContext *ctx, |
76 | 76 |
int w, int h) |
77 | 77 |
{ |
78 | 78 |
DecimateContext *decimate = ctx->priv; |
79 |
- DSPContext *dspctx = &decimate->dspctx; |
|
79 |
+ MECmpContext *mecc = &decimate->mecc; |
|
80 | 80 |
PixblockDSPContext *pdsp = &decimate->pdsp; |
81 | 81 |
|
82 | 82 |
int x, y; |
... | ... |
@@ -90,7 +90,7 @@ static int diff_planes(AVFilterContext *ctx, |
90 | 90 |
pdsp->diff_pixels(block, |
91 | 91 |
cur+x+y*linesize, |
92 | 92 |
ref+x+y*linesize, linesize); |
93 |
- d = dspctx->sum_abs_dctelem(block); |
|
93 |
+ d = mecc->sum_abs_dctelem(block); |
|
94 | 94 |
if (d > decimate->hi) |
95 | 95 |
return 1; |
96 | 96 |
if (d > decimate->lo) { |
... | ... |
@@ -143,7 +143,7 @@ static av_cold int init(AVFilterContext *ctx) |
143 | 143 |
decimate->avctx = avcodec_alloc_context3(NULL); |
144 | 144 |
if (!decimate->avctx) |
145 | 145 |
return AVERROR(ENOMEM); |
146 |
- avpriv_dsputil_init(&decimate->dspctx, decimate->avctx); |
|
146 |
+ ff_me_cmp_init(&decimate->mecc, decimate->avctx); |
|
147 | 147 |
ff_pixblockdsp_init(&decimate->pdsp, decimate->avctx); |
148 | 148 |
|
149 | 149 |
return 0; |