Merge commit '2d60444331fca1910510038dd3817bea885c2367'

* commit '2d60444331fca1910510038dd3817bea885c2367':
dsputil: Split motion estimation compare bits off into their own context

Conflicts:
configure
libavcodec/Makefile
libavcodec/arm/Makefile
libavcodec/dvenc.c
libavcodec/error_resilience.c
libavcodec/h264.h
libavcodec/h264_slice.c
libavcodec/me_cmp.c
libavcodec/me_cmp.h
libavcodec/motion_est.c
libavcodec/motion_est_template.c
libavcodec/mpeg4videoenc.c
libavcodec/mpegvideo.c
libavcodec/mpegvideo_enc.c
libavcodec/x86/Makefile
libavcodec/x86/me_cmp_init.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2014/07/18 06:27:40
Showing 58 changed files
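Summary of the change reflected in the hunks below: the motion-estimation compare function pointers (sad, sse, hadamard8_diff and friends) move out of DSPContext into a new MECmpContext declared in me_cmp.h and initialised with ff_me_cmp_init(). For callers the migration is mechanical. A minimal sketch modelled on the ac3enc.c/ac3enc.h hunks in this diff (the struct and function names ExampleEncContext / example_sad are hypothetical, shown only to illustrate the pattern):

#include "avcodec.h"
#include "me_cmp.h"          /* replaces the old "dsputil.h" include */

typedef struct ExampleEncContext {
    MECmpContext mecc;       /* was: DSPContext dsp; */
} ExampleEncContext;

static int example_sad(ExampleEncContext *s, AVCodecContext *avctx,
                       uint8_t *cur, uint8_t *prev, int stride)
{
    /* was: ff_dsputil_init(&s->dsp, avctx); */
    ff_me_cmp_init(&s->mecc, avctx);

    /* the compare tables keep their old names and signatures;
     * sad[0] is the 16-pixel-wide SAD used below in compute_exp_strategy() */
    return s->mecc.sad[0](NULL, cur, prev, stride, 16);
}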
... ...
@@ -1804,7 +1804,6 @@ CONFIG_EXTRA="
     blockdsp
     bswapdsp
     cabac
-    dsputil
     dvprofile
     exif
     fdctdsp
... ...
@@ -1827,6 +1826,7 @@ CONFIG_EXTRA="
     llauddsp
     llviddsp
     lpc
+    me_cmp
     mpeg_er
     mpegaudio
     mpegaudiodsp
... ...
@@ -2002,17 +2002,17 @@ threads_if_any="$THREADS_LIST"
 
 # subsystems
 dct_select="rdft"
-dsputil_select="fdctdsp idctdsp pixblockdsp"
-error_resilience_select="dsputil"
+error_resilience_select="me_cmp"
 frame_thread_encoder_deps="encoders threads"
 intrax8_select="error_resilience"
 mdct_select="fft"
 rdft_select="fft"
+me_cmp_select="fdctdsp idctdsp pixblockdsp"
 mpeg_er_select="error_resilience"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
-mpegvideo_select="blockdsp dsputil h264chroma hpeldsp idctdsp videodsp"
-mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp"
+mpegvideo_select="blockdsp h264chroma hpeldsp idctdsp me_cmp videodsp"
+mpegvideoenc_select="me_cmp mpegvideo pixblockdsp qpeldsp"
 
 # decoders / encoders
 aac_decoder_select="mdct sinewin"
... ...
@@ -2020,8 +2020,8 @@ aac_encoder_select="audio_frame_queue mdct sinewin"
 aac_latm_decoder_select="aac_decoder aac_latm_parser"
 ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct"
 ac3_fixed_decoder_select="ac3_parser ac3dsp bswapdsp mdct"
-ac3_encoder_select="ac3dsp audiodsp dsputil mdct"
-ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct"
+ac3_encoder_select="ac3dsp audiodsp mdct me_cmp"
+ac3_fixed_encoder_select="ac3dsp audiodsp mdct me_cmp"
 aic_decoder_select="golomb idctdsp"
 alac_encoder_select="lpc"
 als_decoder_select="bswapdsp"
... ...
@@ -2048,11 +2048,11 @@ cook_decoder_select="audiodsp mdct sinewin"
 cscd_decoder_select="lzo"
 cscd_decoder_suggest="zlib"
 dca_decoder_select="mdct"
-dirac_decoder_select="dsputil dwt golomb videodsp mpegvideoenc"
+dirac_decoder_select="dwt golomb videodsp mpegvideoenc"
 dnxhd_decoder_select="blockdsp idctdsp"
 dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp"
 dvvideo_decoder_select="dvprofile idctdsp"
-dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp"
+dvvideo_encoder_select="dvprofile fdctdsp me_cmp pixblockdsp"
 dxa_decoder_select="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
... ...
@@ -2147,8 +2147,8 @@ qdm2_decoder_select="mdct rdft mpegaudiodsp"
 ra_144_encoder_select="audio_frame_queue lpc audiodsp"
 ra_144_decoder_select="audiodsp"
 ralf_decoder_select="golomb"
-rawvideo_decoder_select="dsputil bswapdsp"
-rtjpeg_decoder_select="dsputil"
+rawvideo_decoder_select="bswapdsp"
+rtjpeg_decoder_select="me_cmp"
 rv10_decoder_select="error_resilience h263_decoder h263dsp mpeg_er"
 rv10_encoder_select="h263_encoder"
 rv20_decoder_select="error_resilience h263_decoder h263dsp mpeg_er"
... ...
@@ -2157,14 +2157,14 @@ rv30_decoder_select="error_resilience golomb h264chroma h264pred h264qpel mpeg_e
 rv40_decoder_select="error_resilience golomb h264chroma h264pred h264qpel mpeg_er mpegvideo videodsp"
 shorten_decoder_select="golomb"
 sipr_decoder_select="lsp"
-snow_decoder_select="dsputil dwt h264qpel hpeldsp rangecoder"
-snow_encoder_select="aandcttables dsputil dwt h264qpel hpeldsp mpegvideoenc rangecoder"
+snow_decoder_select="dwt h264qpel hpeldsp rangecoder"
+snow_encoder_select="aandcttables dwt h264qpel hpeldsp me_cmp mpegvideoenc rangecoder"
 sonic_decoder_select="golomb rangecoder"
 sonic_encoder_select="golomb rangecoder"
 sonic_ls_encoder_select="golomb rangecoder"
 sp5x_decoder_select="mjpeg_decoder"
 svq1_decoder_select="hpeldsp"
-svq1_encoder_select="aandcttables dsputil hpeldsp mpegvideoenc"
+svq1_encoder_select="aandcttables hpeldsp me_cmp mpegvideoenc"
 svq3_decoder_select="h264_decoder hpeldsp tpeldsp"
 svq3_decoder_suggest="zlib"
 tak_decoder_select="audiodsp"
... ...
@@ -2517,7 +2517,7 @@ dctdnoiz_filter_deps="avcodec"
 dctdnoiz_filter_select="dct"
 delogo_filter_deps="gpl"
 deshake_filter_deps="avcodec"
-deshake_filter_select="dsputil"
+deshake_filter_select="me_cmp"
 drawtext_filter_deps="libfreetype"
 ebur128_filter_deps="gpl"
 flite_filter_deps="libflite"
... ...
@@ -2536,7 +2536,7 @@ mcdeint_filter_deps="avcodec gpl"
 movie_filter_deps="avcodec avformat"
 mp_filter_deps="gpl avcodec swscale inline_asm"
 mpdecimate_filter_deps="gpl avcodec"
-mpdecimate_filter_select="dsputil pixblockdsp"
+mpdecimate_filter_select="me_cmp pixblockdsp"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
 perspective_filter_deps="gpl"
... ...
@@ -2554,7 +2554,7 @@ smartblur_filter_deps="gpl swscale"
 showspectrum_filter_deps="avcodec"
 showspectrum_filter_select="rdft"
 spp_filter_deps="gpl avcodec"
-spp_filter_select="dsputil fft idctdsp fdctdsp pixblockdsp"
+spp_filter_select="fft idctdsp fdctdsp me_cmp pixblockdsp"
 stereo3d_filter_deps="gpl"
 subtitles_filter_deps="avformat avcodec libass"
 super2xsai_filter_deps="gpl"
... ...
@@ -41,7 +41,6 @@ OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
-OBJS-$(CONFIG_DSPUTIL)                 += dsputil.o
 OBJS-$(CONFIG_DXVA2)                   += dxva2.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
 OBJS-$(CONFIG_EXIF)                    += exif.o tiff_common.o
... ...
@@ -70,6 +69,7 @@ OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o mdct_fixed_32.o
+OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o mpegaudiodata.o   \
                                           mpegaudiodecheader.o
... ...
@@ -36,6 +36,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
... ...
@@ -379,7 +380,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
                 exp_strategy[blk] = EXP_NEW;
                 continue;
             }
-            exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
+            exp_diff = s->mecc.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
             exp_strategy[blk] = EXP_REUSE;
             if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS))
                 exp_strategy[blk] = EXP_NEW;
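In the hunk above, sad[0] is the 16-wide sum-of-absolute-differences compare applied to consecutive exponent arrays; the encoder reuses exponents (EXP_REUSE) when the difference stays below a threshold. Its C reference is pix_abs16_c in the removed dsputil.c (carried over to me_cmp.c); the following condensed sketch rewrites that unrolled loop as a plain loop, with sad16_c as a hypothetical name:

#include <stdint.h>
#include <stdlib.h>
#include "mpegvideo.h"   /* MpegEncContext; the first argument is unused by the plain SAD */

static int sad16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            s += abs(pix1[j] - pix2[j]);   /* accumulate |a - b| over a 16 x h block */
        pix1 += line_size;                 /* step both blocks down one row */
        pix2 += line_size;
    }
    return s;
}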
... ...
@@ -2480,7 +2481,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
         goto init_fail;
 
     ff_audiodsp_init(&s->adsp);
-    ff_dsputil_init(&s->dsp, avctx);
+    ff_me_cmp_init(&s->mecc, avctx);
     ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     dprint_options(s);
... ...
@@ -35,9 +35,9 @@
 #include "ac3.h"
 #include "ac3dsp.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "fft.h"
 #include "mathops.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 
... ...
@@ -162,9 +162,9 @@ typedef struct AC3EncodeContext {
     AC3EncOptions options;                  ///< encoding options
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
-    DSPContext dsp;
     AudioDSPContext adsp;
     AVFloatDSPContext fdsp;
+    MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
     const SampleType *mdct_window;          ///< MDCT window function array
... ...
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                           arm/ac3dsp_arm.o
 OBJS-$(CONFIG_AUDIODSP)                += arm/audiodsp_init_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
-OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o
 OBJS-$(CONFIG_FFT)                     += arm/fft_init_arm.o            \
                                           arm/fft_fixed_init_arm.o
 OBJS-$(CONFIG_H264CHROMA)              += arm/h264chroma_init_arm.o
... ...
@@ -20,6 +19,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/jrevdct_arm.o             \
                                           arm/simple_idct_arm.o
 OBJS-$(CONFIG_LLAUDDSP)                += arm/lossless_audiodsp_init_arm.o
+OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
... ...
@@ -54,13 +54,13 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP)        += arm/videodsp_init_armv5te.o   \
 ARMV5TE-OBJS-$(CONFIG_MLP_DECODER)     += arm/mlpdsp_armv5te.o
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_armv6.o
 ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/startcode_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                           arm/hpeldsp_armv6.o
 ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                           arm/idctdsp_armv6.o           \
                                           arm/simple_idct_armv6.o
+ARMV6-OBJS-$(CONFIG_ME_CMP)            += arm/me_cmp_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)       += arm/pixblockdsp_armv6.o
deleted file mode 100644
... ...
@@ -1,244 +0,0 @@
1
-/*
2
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3
- *
4
- * This file is part of FFmpeg.
5
- *
6
- * FFmpeg is free software; you can redistribute it and/or
7
- * modify it under the terms of the GNU Lesser General Public
8
- * License as published by the Free Software Foundation; either
9
- * version 2.1 of the License, or (at your option) any later version.
10
- *
11
- * FFmpeg is distributed in the hope that it will be useful,
12
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
- * Lesser General Public License for more details.
15
- *
16
- * You should have received a copy of the GNU Lesser General Public
17
- * License along with FFmpeg; if not, write to the Free Software
18
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
- */
20
-
21
-#include "libavutil/arm/asm.S"
22
-
23
-function ff_pix_abs16_armv6, export=1
24
-        ldr             r0,  [sp]
25
-        push            {r4-r9, lr}
26
-        mov             r12, #0
27
-        mov             lr,  #0
28
-        ldm             r1,  {r4-r7}
29
-        ldr             r8,  [r2]
30
-1:
31
-        ldr             r9,  [r2, #4]
32
-        pld             [r1, r3]
33
-        usada8          r12, r4,  r8,  r12
34
-        ldr             r8,  [r2, #8]
35
-        pld             [r2, r3]
36
-        usada8          lr,  r5,  r9,  lr
37
-        ldr             r9,  [r2, #12]
38
-        usada8          r12, r6,  r8,  r12
39
-        subs            r0,  r0,  #1
40
-        usada8          lr,  r7,  r9,  lr
41
-        beq             2f
42
-        add             r1,  r1,  r3
43
-        ldm             r1,  {r4-r7}
44
-        add             r2,  r2,  r3
45
-        ldr             r8,  [r2]
46
-        b               1b
47
-2:
48
-        add             r0,  r12, lr
49
-        pop             {r4-r9, pc}
50
-endfunc
51
-
52
-function ff_pix_abs16_x2_armv6, export=1
53
-        ldr             r12, [sp]
54
-        push            {r4-r11, lr}
55
-        mov             r0,  #0
56
-        mov             lr,  #1
57
-        orr             lr,  lr,  lr,  lsl #8
58
-        orr             lr,  lr,  lr,  lsl #16
59
-1:
60
-        ldr             r8,  [r2]
61
-        ldr             r9,  [r2, #4]
62
-        lsr             r10, r8,  #8
63
-        ldr             r4,  [r1]
64
-        lsr             r6,  r9,  #8
65
-        orr             r10, r10, r9,  lsl #24
66
-        ldr             r5,  [r2, #8]
67
-        eor             r11, r8,  r10
68
-        uhadd8          r7,  r8,  r10
69
-        orr             r6,  r6,  r5,  lsl #24
70
-        and             r11, r11, lr
71
-        uadd8           r7,  r7,  r11
72
-        ldr             r8,  [r1, #4]
73
-        usada8          r0,  r4,  r7,  r0
74
-        eor             r7,  r9,  r6
75
-        lsr             r10, r5,  #8
76
-        and             r7,  r7,  lr
77
-        uhadd8          r4,  r9,  r6
78
-        ldr             r6,  [r2, #12]
79
-        uadd8           r4,  r4,  r7
80
-        pld             [r1, r3]
81
-        orr             r10, r10, r6,  lsl #24
82
-        usada8          r0,  r8,  r4,  r0
83
-        ldr             r4,  [r1, #8]
84
-        eor             r11, r5,  r10
85
-        ldrb            r7,  [r2, #16]
86
-        and             r11, r11, lr
87
-        uhadd8          r8,  r5,  r10
88
-        ldr             r5,  [r1, #12]
89
-        uadd8           r8,  r8,  r11
90
-        pld             [r2, r3]
91
-        lsr             r10, r6,  #8
92
-        usada8          r0,  r4,  r8,  r0
93
-        orr             r10, r10, r7,  lsl #24
94
-        subs            r12,  r12,  #1
95
-        eor             r11, r6,  r10
96
-        add             r1,  r1,  r3
97
-        uhadd8          r9,  r6,  r10
98
-        and             r11, r11, lr
99
-        uadd8           r9,  r9,  r11
100
-        add             r2,  r2,  r3
101
-        usada8          r0,  r5,  r9,  r0
102
-        bgt             1b
103
-
104
-        pop             {r4-r11, pc}
105
-endfunc
106
-
107
-.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
108
-        ldr             \n0, [r2]
109
-        eor             \n1, \p0, \n0
110
-        uhadd8          \p0, \p0, \n0
111
-        and             \n1, \n1, lr
112
-        ldr             \n2, [r1]
113
-        uadd8           \p0, \p0, \n1
114
-        ldr             \n1, [r2, #4]
115
-        usada8          r0,  \p0, \n2, r0
116
-        pld             [r1,  r3]
117
-        eor             \n3, \p1, \n1
118
-        uhadd8          \p1, \p1, \n1
119
-        and             \n3, \n3, lr
120
-        ldr             \p0, [r1, #4]
121
-        uadd8           \p1, \p1, \n3
122
-        ldr             \n2, [r2, #8]
123
-        usada8          r0,  \p1, \p0, r0
124
-        pld             [r2,  r3]
125
-        eor             \p0, \p2, \n2
126
-        uhadd8          \p2, \p2, \n2
127
-        and             \p0, \p0, lr
128
-        ldr             \p1, [r1, #8]
129
-        uadd8           \p2, \p2, \p0
130
-        ldr             \n3, [r2, #12]
131
-        usada8          r0,  \p2, \p1, r0
132
-        eor             \p1, \p3, \n3
133
-        uhadd8          \p3, \p3, \n3
134
-        and             \p1, \p1, lr
135
-        ldr             \p0,  [r1, #12]
136
-        uadd8           \p3, \p3, \p1
137
-        add             r1,  r1,  r3
138
-        usada8          r0,  \p3, \p0,  r0
139
-        add             r2,  r2,  r3
140
-.endm
141
-
142
-function ff_pix_abs16_y2_armv6, export=1
143
-        pld             [r1]
144
-        pld             [r2]
145
-        ldr             r12, [sp]
146
-        push            {r4-r11, lr}
147
-        mov             r0,  #0
148
-        mov             lr,  #1
149
-        orr             lr,  lr,  lr,  lsl #8
150
-        orr             lr,  lr,  lr,  lsl #16
151
-        ldr             r4,  [r2]
152
-        ldr             r5,  [r2, #4]
153
-        ldr             r6,  [r2, #8]
154
-        ldr             r7,  [r2, #12]
155
-        add             r2,  r2,  r3
156
-1:
157
-        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
158
-        subs            r12, r12, #2
159
-        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
160
-        bgt             1b
161
-
162
-        pop             {r4-r11, pc}
163
-endfunc
164
-
165
-function ff_pix_abs8_armv6, export=1
166
-        pld             [r2, r3]
167
-        ldr             r12, [sp]
168
-        push            {r4-r9, lr}
169
-        mov             r0,  #0
170
-        mov             lr,  #0
171
-        ldrd_post       r4,  r5,  r1,  r3
172
-1:
173
-        subs            r12, r12, #2
174
-        ldr             r7,  [r2, #4]
175
-        ldr_post        r6,  r2,  r3
176
-        ldrd_post       r8,  r9,  r1,  r3
177
-        usada8          r0,  r4,  r6,  r0
178
-        pld             [r2, r3]
179
-        usada8          lr,  r5,  r7,  lr
180
-        ldr             r7,  [r2, #4]
181
-        ldr_post        r6,  r2,  r3
182
-        beq             2f
183
-        ldrd_post       r4,  r5,  r1,  r3
184
-        usada8          r0,  r8,  r6,  r0
185
-        pld             [r2, r3]
186
-        usada8          lr,  r9,  r7,  lr
187
-        b               1b
188
-2:
189
-        usada8          r0,  r8,  r6,  r0
190
-        usada8          lr,  r9,  r7,  lr
191
-        add             r0,  r0,  lr
192
-        pop             {r4-r9, pc}
193
-endfunc
194
-
195
-function ff_sse16_armv6, export=1
196
-        ldr             r12, [sp]
197
-        push            {r4-r9, lr}
198
-        mov             r0,  #0
199
-1:
200
-        ldrd            r4,  r5,  [r1]
201
-        ldr             r8,  [r2]
202
-        uxtb16          lr,  r4
203
-        uxtb16          r4,  r4,  ror #8
204
-        uxtb16          r9,  r8
205
-        uxtb16          r8,  r8,  ror #8
206
-        ldr             r7,  [r2, #4]
207
-        usub16          lr,  lr,  r9
208
-        usub16          r4,  r4,  r8
209
-        smlad           r0,  lr,  lr,  r0
210
-        uxtb16          r6,  r5
211
-        uxtb16          lr,  r5,  ror #8
212
-        uxtb16          r8,  r7
213
-        uxtb16          r9,  r7,  ror #8
214
-        smlad           r0,  r4,  r4,  r0
215
-        ldrd            r4,  r5,  [r1, #8]
216
-        usub16          r6,  r6,  r8
217
-        usub16          r8,  lr,  r9
218
-        ldr             r7,  [r2, #8]
219
-        smlad           r0,  r6,  r6,  r0
220
-        uxtb16          lr,  r4
221
-        uxtb16          r4,  r4,  ror #8
222
-        uxtb16          r9,  r7
223
-        uxtb16          r7,  r7, ror #8
224
-        smlad           r0,  r8,  r8,  r0
225
-        ldr             r8,  [r2, #12]
226
-        usub16          lr,  lr,  r9
227
-        usub16          r4,  r4,  r7
228
-        smlad           r0,  lr,  lr,  r0
229
-        uxtb16          r6,  r5
230
-        uxtb16          r5,  r5,  ror #8
231
-        uxtb16          r9,  r8
232
-        uxtb16          r8,  r8,  ror #8
233
-        smlad           r0,  r4,  r4,  r0
234
-        usub16          r6,  r6,  r9
235
-        usub16          r5,  r5,  r8
236
-        smlad           r0,  r6,  r6,  r0
237
-        add             r1,  r1,  r3
238
-        add             r2,  r2,  r3
239
-        subs            r12, r12, #1
240
-        smlad           r0,  r5,  r5,  r0
241
-        bgt             1b
242
-
243
-        pop             {r4-r9, pc}
244
-endfunc
245 1
deleted file mode 100644
... ...
@@ -1,58 +0,0 @@
1
-/*
2
- * This file is part of FFmpeg.
3
- *
4
- * FFmpeg is free software; you can redistribute it and/or
5
- * modify it under the terms of the GNU Lesser General Public
6
- * License as published by the Free Software Foundation; either
7
- * version 2.1 of the License, or (at your option) any later version.
8
- *
9
- * FFmpeg is distributed in the hope that it will be useful,
10
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
- * Lesser General Public License for more details.
13
- *
14
- * You should have received a copy of the GNU Lesser General Public
15
- * License along with FFmpeg; if not, write to the Free Software
16
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
- */
18
-
19
-#include <stdint.h>
20
-
21
-#include "libavutil/attributes.h"
22
-#include "libavutil/cpu.h"
23
-#include "libavutil/arm/cpu.h"
24
-#include "libavcodec/avcodec.h"
25
-#include "libavcodec/dsputil.h"
26
-#include "libavcodec/mpegvideo.h"
27
-
28
-int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
29
-                       int line_size, int h);
30
-int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
31
-                          int line_size, int h);
32
-int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
33
-                          int line_size, int h);
34
-
35
-int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
36
-                      int line_size, int h);
37
-
38
-int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
39
-                   int line_size, int h);
40
-
41
-
42
-av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
43
-{
44
-    int cpu_flags = av_get_cpu_flags();
45
-
46
-    if (have_armv6(cpu_flags)) {
47
-        c->pix_abs[0][0] = ff_pix_abs16_armv6;
48
-        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
49
-        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
50
-
51
-        c->pix_abs[1][0] = ff_pix_abs8_armv6;
52
-
53
-        c->sad[0] = ff_pix_abs16_armv6;
54
-        c->sad[1] = ff_pix_abs8_armv6;
55
-
56
-        c->sse[0] = ff_sse16_armv6;
57
-    }
58
-}
59 1
new file mode 100644
... ...
@@ -0,0 +1,244 @@
0
+/*
1
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavutil/arm/asm.S"
21
+
22
+function ff_pix_abs16_armv6, export=1
23
+        ldr             r0,  [sp]
24
+        push            {r4-r9, lr}
25
+        mov             r12, #0
26
+        mov             lr,  #0
27
+        ldm             r1,  {r4-r7}
28
+        ldr             r8,  [r2]
29
+1:
30
+        ldr             r9,  [r2, #4]
31
+        pld             [r1, r3]
32
+        usada8          r12, r4,  r8,  r12
33
+        ldr             r8,  [r2, #8]
34
+        pld             [r2, r3]
35
+        usada8          lr,  r5,  r9,  lr
36
+        ldr             r9,  [r2, #12]
37
+        usada8          r12, r6,  r8,  r12
38
+        subs            r0,  r0,  #1
39
+        usada8          lr,  r7,  r9,  lr
40
+        beq             2f
41
+        add             r1,  r1,  r3
42
+        ldm             r1,  {r4-r7}
43
+        add             r2,  r2,  r3
44
+        ldr             r8,  [r2]
45
+        b               1b
46
+2:
47
+        add             r0,  r12, lr
48
+        pop             {r4-r9, pc}
49
+endfunc
50
+
51
+function ff_pix_abs16_x2_armv6, export=1
52
+        ldr             r12, [sp]
53
+        push            {r4-r11, lr}
54
+        mov             r0,  #0
55
+        mov             lr,  #1
56
+        orr             lr,  lr,  lr,  lsl #8
57
+        orr             lr,  lr,  lr,  lsl #16
58
+1:
59
+        ldr             r8,  [r2]
60
+        ldr             r9,  [r2, #4]
61
+        lsr             r10, r8,  #8
62
+        ldr             r4,  [r1]
63
+        lsr             r6,  r9,  #8
64
+        orr             r10, r10, r9,  lsl #24
65
+        ldr             r5,  [r2, #8]
66
+        eor             r11, r8,  r10
67
+        uhadd8          r7,  r8,  r10
68
+        orr             r6,  r6,  r5,  lsl #24
69
+        and             r11, r11, lr
70
+        uadd8           r7,  r7,  r11
71
+        ldr             r8,  [r1, #4]
72
+        usada8          r0,  r4,  r7,  r0
73
+        eor             r7,  r9,  r6
74
+        lsr             r10, r5,  #8
75
+        and             r7,  r7,  lr
76
+        uhadd8          r4,  r9,  r6
77
+        ldr             r6,  [r2, #12]
78
+        uadd8           r4,  r4,  r7
79
+        pld             [r1, r3]
80
+        orr             r10, r10, r6,  lsl #24
81
+        usada8          r0,  r8,  r4,  r0
82
+        ldr             r4,  [r1, #8]
83
+        eor             r11, r5,  r10
84
+        ldrb            r7,  [r2, #16]
85
+        and             r11, r11, lr
86
+        uhadd8          r8,  r5,  r10
87
+        ldr             r5,  [r1, #12]
88
+        uadd8           r8,  r8,  r11
89
+        pld             [r2, r3]
90
+        lsr             r10, r6,  #8
91
+        usada8          r0,  r4,  r8,  r0
92
+        orr             r10, r10, r7,  lsl #24
93
+        subs            r12,  r12,  #1
94
+        eor             r11, r6,  r10
95
+        add             r1,  r1,  r3
96
+        uhadd8          r9,  r6,  r10
97
+        and             r11, r11, lr
98
+        uadd8           r9,  r9,  r11
99
+        add             r2,  r2,  r3
100
+        usada8          r0,  r5,  r9,  r0
101
+        bgt             1b
102
+
103
+        pop             {r4-r11, pc}
104
+endfunc
105
+
106
+.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
107
+        ldr             \n0, [r2]
108
+        eor             \n1, \p0, \n0
109
+        uhadd8          \p0, \p0, \n0
110
+        and             \n1, \n1, lr
111
+        ldr             \n2, [r1]
112
+        uadd8           \p0, \p0, \n1
113
+        ldr             \n1, [r2, #4]
114
+        usada8          r0,  \p0, \n2, r0
115
+        pld             [r1,  r3]
116
+        eor             \n3, \p1, \n1
117
+        uhadd8          \p1, \p1, \n1
118
+        and             \n3, \n3, lr
119
+        ldr             \p0, [r1, #4]
120
+        uadd8           \p1, \p1, \n3
121
+        ldr             \n2, [r2, #8]
122
+        usada8          r0,  \p1, \p0, r0
123
+        pld             [r2,  r3]
124
+        eor             \p0, \p2, \n2
125
+        uhadd8          \p2, \p2, \n2
126
+        and             \p0, \p0, lr
127
+        ldr             \p1, [r1, #8]
128
+        uadd8           \p2, \p2, \p0
129
+        ldr             \n3, [r2, #12]
130
+        usada8          r0,  \p2, \p1, r0
131
+        eor             \p1, \p3, \n3
132
+        uhadd8          \p3, \p3, \n3
133
+        and             \p1, \p1, lr
134
+        ldr             \p0,  [r1, #12]
135
+        uadd8           \p3, \p3, \p1
136
+        add             r1,  r1,  r3
137
+        usada8          r0,  \p3, \p0,  r0
138
+        add             r2,  r2,  r3
139
+.endm
140
+
141
+function ff_pix_abs16_y2_armv6, export=1
142
+        pld             [r1]
143
+        pld             [r2]
144
+        ldr             r12, [sp]
145
+        push            {r4-r11, lr}
146
+        mov             r0,  #0
147
+        mov             lr,  #1
148
+        orr             lr,  lr,  lr,  lsl #8
149
+        orr             lr,  lr,  lr,  lsl #16
150
+        ldr             r4,  [r2]
151
+        ldr             r5,  [r2, #4]
152
+        ldr             r6,  [r2, #8]
153
+        ldr             r7,  [r2, #12]
154
+        add             r2,  r2,  r3
155
+1:
156
+        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
157
+        subs            r12, r12, #2
158
+        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
159
+        bgt             1b
160
+
161
+        pop             {r4-r11, pc}
162
+endfunc
163
+
164
+function ff_pix_abs8_armv6, export=1
165
+        pld             [r2, r3]
166
+        ldr             r12, [sp]
167
+        push            {r4-r9, lr}
168
+        mov             r0,  #0
169
+        mov             lr,  #0
170
+        ldrd_post       r4,  r5,  r1,  r3
171
+1:
172
+        subs            r12, r12, #2
173
+        ldr             r7,  [r2, #4]
174
+        ldr_post        r6,  r2,  r3
175
+        ldrd_post       r8,  r9,  r1,  r3
176
+        usada8          r0,  r4,  r6,  r0
177
+        pld             [r2, r3]
178
+        usada8          lr,  r5,  r7,  lr
179
+        ldr             r7,  [r2, #4]
180
+        ldr_post        r6,  r2,  r3
181
+        beq             2f
182
+        ldrd_post       r4,  r5,  r1,  r3
183
+        usada8          r0,  r8,  r6,  r0
184
+        pld             [r2, r3]
185
+        usada8          lr,  r9,  r7,  lr
186
+        b               1b
187
+2:
188
+        usada8          r0,  r8,  r6,  r0
189
+        usada8          lr,  r9,  r7,  lr
190
+        add             r0,  r0,  lr
191
+        pop             {r4-r9, pc}
192
+endfunc
193
+
194
+function ff_sse16_armv6, export=1
195
+        ldr             r12, [sp]
196
+        push            {r4-r9, lr}
197
+        mov             r0,  #0
198
+1:
199
+        ldrd            r4,  r5,  [r1]
200
+        ldr             r8,  [r2]
201
+        uxtb16          lr,  r4
202
+        uxtb16          r4,  r4,  ror #8
203
+        uxtb16          r9,  r8
204
+        uxtb16          r8,  r8,  ror #8
205
+        ldr             r7,  [r2, #4]
206
+        usub16          lr,  lr,  r9
207
+        usub16          r4,  r4,  r8
208
+        smlad           r0,  lr,  lr,  r0
209
+        uxtb16          r6,  r5
210
+        uxtb16          lr,  r5,  ror #8
211
+        uxtb16          r8,  r7
212
+        uxtb16          r9,  r7,  ror #8
213
+        smlad           r0,  r4,  r4,  r0
214
+        ldrd            r4,  r5,  [r1, #8]
215
+        usub16          r6,  r6,  r8
216
+        usub16          r8,  lr,  r9
217
+        ldr             r7,  [r2, #8]
218
+        smlad           r0,  r6,  r6,  r0
219
+        uxtb16          lr,  r4
220
+        uxtb16          r4,  r4,  ror #8
221
+        uxtb16          r9,  r7
222
+        uxtb16          r7,  r7, ror #8
223
+        smlad           r0,  r8,  r8,  r0
224
+        ldr             r8,  [r2, #12]
225
+        usub16          lr,  lr,  r9
226
+        usub16          r4,  r4,  r7
227
+        smlad           r0,  lr,  lr,  r0
228
+        uxtb16          r6,  r5
229
+        uxtb16          r5,  r5,  ror #8
230
+        uxtb16          r9,  r8
231
+        uxtb16          r8,  r8,  ror #8
232
+        smlad           r0,  r4,  r4,  r0
233
+        usub16          r6,  r6,  r9
234
+        usub16          r5,  r5,  r8
235
+        smlad           r0,  r6,  r6,  r0
236
+        add             r1,  r1,  r3
237
+        add             r2,  r2,  r3
238
+        subs            r12, r12, #1
239
+        smlad           r0,  r5,  r5,  r0
240
+        bgt             1b
241
+
242
+        pop             {r4-r9, pc}
243
+endfunc
0 244
new file mode 100644
... ...
@@ -0,0 +1,57 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include <stdint.h>
19
+
20
+#include "libavutil/attributes.h"
21
+#include "libavutil/cpu.h"
22
+#include "libavutil/arm/cpu.h"
23
+#include "libavcodec/avcodec.h"
24
+#include "libavcodec/me_cmp.h"
25
+#include "libavcodec/mpegvideo.h"
26
+
27
+int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
28
+                       int line_size, int h);
29
+int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
30
+                          int line_size, int h);
31
+int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
32
+                          int line_size, int h);
33
+
34
+int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
35
+                      int line_size, int h);
36
+
37
+int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
38
+                   int line_size, int h);
39
+
40
+av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx)
41
+{
42
+    int cpu_flags = av_get_cpu_flags();
43
+
44
+    if (have_armv6(cpu_flags)) {
45
+        c->pix_abs[0][0] = ff_pix_abs16_armv6;
46
+        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
47
+        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
48
+
49
+        c->pix_abs[1][0] = ff_pix_abs8_armv6;
50
+
51
+        c->sad[0] = ff_pix_abs16_armv6;
52
+        c->sad[1] = ff_pix_abs8_armv6;
53
+
54
+        c->sse[0] = ff_sse16_armv6;
55
+    }
56
+}
... ...
@@ -22,7 +22,6 @@
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/common.h"
-#include "dsputil.h"
 #include "dirac_dwt.h"
 #include "libavcodec/x86/dirac_dwt.h"
 
... ...
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
+#include "avcodec.h"
 #include "diracdsp.h"
 #include "libavcodec/x86/diracdsp_mmx.h"
 
deleted file mode 100644
... ...
@@ -1,1008 +0,0 @@
1
-/*
2
- * DSP utils
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * This file is part of FFmpeg.
7
- *
8
- * FFmpeg is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * FFmpeg is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with FFmpeg; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-/**
24
- * @file
25
- * DSP utils
26
- */
27
-
28
-#include "libavutil/attributes.h"
29
-#include "libavutil/internal.h"
30
-#include "avcodec.h"
31
-#include "copy_block.h"
32
-#include "dsputil.h"
33
-#include "simple_idct.h"
34
-#include "mpegvideo.h"
35
-#include "config.h"
36
-
37
-uint32_t ff_square_tab[512] = { 0, };
38
-
39
-static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
40
-                  int line_size, int h)
41
-{
42
-    int s = 0, i;
43
-    uint32_t *sq = ff_square_tab + 256;
44
-
45
-    for (i = 0; i < h; i++) {
46
-        s    += sq[pix1[0] - pix2[0]];
47
-        s    += sq[pix1[1] - pix2[1]];
48
-        s    += sq[pix1[2] - pix2[2]];
49
-        s    += sq[pix1[3] - pix2[3]];
50
-        pix1 += line_size;
51
-        pix2 += line_size;
52
-    }
53
-    return s;
54
-}
55
-
56
-static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
57
-                  int line_size, int h)
58
-{
59
-    int s = 0, i;
60
-    uint32_t *sq = ff_square_tab + 256;
61
-
62
-    for (i = 0; i < h; i++) {
63
-        s    += sq[pix1[0] - pix2[0]];
64
-        s    += sq[pix1[1] - pix2[1]];
65
-        s    += sq[pix1[2] - pix2[2]];
66
-        s    += sq[pix1[3] - pix2[3]];
67
-        s    += sq[pix1[4] - pix2[4]];
68
-        s    += sq[pix1[5] - pix2[5]];
69
-        s    += sq[pix1[6] - pix2[6]];
70
-        s    += sq[pix1[7] - pix2[7]];
71
-        pix1 += line_size;
72
-        pix2 += line_size;
73
-    }
74
-    return s;
75
-}
76
-
77
-static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
78
-                   int line_size, int h)
79
-{
80
-    int s = 0, i;
81
-    uint32_t *sq = ff_square_tab + 256;
82
-
83
-    for (i = 0; i < h; i++) {
84
-        s += sq[pix1[0]  - pix2[0]];
85
-        s += sq[pix1[1]  - pix2[1]];
86
-        s += sq[pix1[2]  - pix2[2]];
87
-        s += sq[pix1[3]  - pix2[3]];
88
-        s += sq[pix1[4]  - pix2[4]];
89
-        s += sq[pix1[5]  - pix2[5]];
90
-        s += sq[pix1[6]  - pix2[6]];
91
-        s += sq[pix1[7]  - pix2[7]];
92
-        s += sq[pix1[8]  - pix2[8]];
93
-        s += sq[pix1[9]  - pix2[9]];
94
-        s += sq[pix1[10] - pix2[10]];
95
-        s += sq[pix1[11] - pix2[11]];
96
-        s += sq[pix1[12] - pix2[12]];
97
-        s += sq[pix1[13] - pix2[13]];
98
-        s += sq[pix1[14] - pix2[14]];
99
-        s += sq[pix1[15] - pix2[15]];
100
-
101
-        pix1 += line_size;
102
-        pix2 += line_size;
103
-    }
104
-    return s;
105
-}
106
-
107
-static int sum_abs_dctelem_c(int16_t *block)
108
-{
109
-    int sum = 0, i;
110
-
111
-    for (i = 0; i < 64; i++)
112
-        sum += FFABS(block[i]);
113
-    return sum;
114
-}
115
-
116
-#define avg2(a, b) ((a + b + 1) >> 1)
117
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
118
-
119
-static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
120
-                              int line_size, int h)
121
-{
122
-    int s = 0, i;
123
-
124
-    for (i = 0; i < h; i++) {
125
-        s    += abs(pix1[0]  - pix2[0]);
126
-        s    += abs(pix1[1]  - pix2[1]);
127
-        s    += abs(pix1[2]  - pix2[2]);
128
-        s    += abs(pix1[3]  - pix2[3]);
129
-        s    += abs(pix1[4]  - pix2[4]);
130
-        s    += abs(pix1[5]  - pix2[5]);
131
-        s    += abs(pix1[6]  - pix2[6]);
132
-        s    += abs(pix1[7]  - pix2[7]);
133
-        s    += abs(pix1[8]  - pix2[8]);
134
-        s    += abs(pix1[9]  - pix2[9]);
135
-        s    += abs(pix1[10] - pix2[10]);
136
-        s    += abs(pix1[11] - pix2[11]);
137
-        s    += abs(pix1[12] - pix2[12]);
138
-        s    += abs(pix1[13] - pix2[13]);
139
-        s    += abs(pix1[14] - pix2[14]);
140
-        s    += abs(pix1[15] - pix2[15]);
141
-        pix1 += line_size;
142
-        pix2 += line_size;
143
-    }
144
-    return s;
145
-}
146
-
147
-static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
148
-                          int line_size, int h)
149
-{
150
-    int s = 0, i;
151
-
152
-    for (i = 0; i < h; i++) {
153
-        s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
154
-        s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
155
-        s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
156
-        s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
157
-        s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
158
-        s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
159
-        s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
160
-        s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
161
-        s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
162
-        s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
163
-        s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
164
-        s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
165
-        s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
166
-        s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
167
-        s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
168
-        s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
169
-        pix1 += line_size;
170
-        pix2 += line_size;
171
-    }
172
-    return s;
173
-}
174
-
175
-static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
176
-                          int line_size, int h)
177
-{
178
-    int s = 0, i;
179
-    uint8_t *pix3 = pix2 + line_size;
180
-
181
-    for (i = 0; i < h; i++) {
182
-        s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
183
-        s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
184
-        s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
185
-        s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
186
-        s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
187
-        s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
188
-        s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
189
-        s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
190
-        s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
191
-        s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
192
-        s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
193
-        s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
194
-        s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
195
-        s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
196
-        s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
197
-        s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
198
-        pix1 += line_size;
199
-        pix2 += line_size;
200
-        pix3 += line_size;
201
-    }
202
-    return s;
203
-}
204
-
205
-static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
206
-                           int line_size, int h)
207
-{
208
-    int s = 0, i;
209
-    uint8_t *pix3 = pix2 + line_size;
210
-
211
-    for (i = 0; i < h; i++) {
212
-        s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
213
-        s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
214
-        s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
215
-        s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
216
-        s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
217
-        s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
218
-        s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
219
-        s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
220
-        s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
221
-        s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
222
-        s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
223
-        s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
224
-        s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
225
-        s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
226
-        s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
227
-        s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
228
-        pix1 += line_size;
229
-        pix2 += line_size;
230
-        pix3 += line_size;
231
-    }
232
-    return s;
233
-}
234
-
235
-static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
236
-                             int line_size, int h)
237
-{
238
-    int s = 0, i;
239
-
240
-    for (i = 0; i < h; i++) {
241
-        s    += abs(pix1[0] - pix2[0]);
242
-        s    += abs(pix1[1] - pix2[1]);
243
-        s    += abs(pix1[2] - pix2[2]);
244
-        s    += abs(pix1[3] - pix2[3]);
245
-        s    += abs(pix1[4] - pix2[4]);
246
-        s    += abs(pix1[5] - pix2[5]);
247
-        s    += abs(pix1[6] - pix2[6]);
248
-        s    += abs(pix1[7] - pix2[7]);
249
-        pix1 += line_size;
250
-        pix2 += line_size;
251
-    }
252
-    return s;
253
-}
254
-
255
-static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
256
-                         int line_size, int h)
257
-{
258
-    int s = 0, i;
259
-
260
-    for (i = 0; i < h; i++) {
261
-        s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
262
-        s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
263
-        s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
264
-        s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
265
-        s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
266
-        s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
267
-        s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
268
-        s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
269
-        pix1 += line_size;
270
-        pix2 += line_size;
271
-    }
272
-    return s;
273
-}
274
-
275
-static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276
-                         int line_size, int h)
277
-{
278
-    int s = 0, i;
279
-    uint8_t *pix3 = pix2 + line_size;
280
-
281
-    for (i = 0; i < h; i++) {
282
-        s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
283
-        s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
284
-        s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
285
-        s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
286
-        s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
287
-        s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
288
-        s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
289
-        s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
290
-        pix1 += line_size;
291
-        pix2 += line_size;
292
-        pix3 += line_size;
293
-    }
294
-    return s;
295
-}
296
-
297
-static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
298
-                          int line_size, int h)
299
-{
300
-    int s = 0, i;
301
-    uint8_t *pix3 = pix2 + line_size;
302
-
303
-    for (i = 0; i < h; i++) {
304
-        s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
305
-        s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
306
-        s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
307
-        s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
308
-        s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
309
-        s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
310
-        s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
311
-        s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
312
-        pix1 += line_size;
313
-        pix2 += line_size;
314
-        pix3 += line_size;
315
-    }
316
-    return s;
317
-}
318
-
319
-static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
320
-{
321
-    int score1 = 0, score2 = 0, x, y;
322
-
323
-    for (y = 0; y < h; y++) {
324
-        for (x = 0; x < 16; x++)
325
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
326
-        if (y + 1 < h) {
327
-            for (x = 0; x < 15; x++)
328
-                score2 += FFABS(s1[x]     - s1[x + stride] -
329
-                                s1[x + 1] + s1[x + stride + 1]) -
330
-                          FFABS(s2[x]     - s2[x + stride] -
331
-                                s2[x + 1] + s2[x + stride + 1]);
332
-        }
333
-        s1 += stride;
334
-        s2 += stride;
335
-    }
336
-
337
-    if (c)
338
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
339
-    else
340
-        return score1 + FFABS(score2) * 8;
341
-}
342
-
343
-static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
344
-{
345
-    int score1 = 0, score2 = 0, x, y;
346
-
347
-    for (y = 0; y < h; y++) {
348
-        for (x = 0; x < 8; x++)
349
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
350
-        if (y + 1 < h) {
351
-            for (x = 0; x < 7; x++)
352
-                score2 += FFABS(s1[x]     - s1[x + stride] -
353
-                                s1[x + 1] + s1[x + stride + 1]) -
354
-                          FFABS(s2[x]     - s2[x + stride] -
355
-                                s2[x + 1] + s2[x + stride + 1]);
356
-        }
357
-        s1 += stride;
358
-        s2 += stride;
359
-    }
360
-
361
-    if (c)
362
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
363
-    else
364
-        return score1 + FFABS(score2) * 8;
365
-}
366
-
367
-static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
368
-                    int stride, int h)
369
-{
370
-    return 0;
371
-}
372
-
373
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
374
-{
375
-    int i;
376
-
377
-    memset(cmp, 0, sizeof(void *) * 6);
378
-
379
-    for (i = 0; i < 6; i++) {
380
-        switch (type & 0xFF) {
381
-        case FF_CMP_SAD:
382
-            cmp[i] = c->sad[i];
383
-            break;
384
-        case FF_CMP_SATD:
385
-            cmp[i] = c->hadamard8_diff[i];
386
-            break;
387
-        case FF_CMP_SSE:
388
-            cmp[i] = c->sse[i];
389
-            break;
390
-        case FF_CMP_DCT:
391
-            cmp[i] = c->dct_sad[i];
392
-            break;
393
-        case FF_CMP_DCT264:
394
-            cmp[i] = c->dct264_sad[i];
395
-            break;
396
-        case FF_CMP_DCTMAX:
397
-            cmp[i] = c->dct_max[i];
398
-            break;
399
-        case FF_CMP_PSNR:
400
-            cmp[i] = c->quant_psnr[i];
401
-            break;
402
-        case FF_CMP_BIT:
403
-            cmp[i] = c->bit[i];
404
-            break;
405
-        case FF_CMP_RD:
406
-            cmp[i] = c->rd[i];
407
-            break;
408
-        case FF_CMP_VSAD:
409
-            cmp[i] = c->vsad[i];
410
-            break;
411
-        case FF_CMP_VSSE:
412
-            cmp[i] = c->vsse[i];
413
-            break;
414
-        case FF_CMP_ZERO:
415
-            cmp[i] = zero_cmp;
416
-            break;
417
-        case FF_CMP_NSSE:
418
-            cmp[i] = c->nsse[i];
419
-            break;
420
-#if CONFIG_DWT
421
-        case FF_CMP_W53:
422
-            cmp[i]= c->w53[i];
423
-            break;
424
-        case FF_CMP_W97:
425
-            cmp[i]= c->w97[i];
426
-            break;
427
-#endif
428
-        default:
429
-            av_log(NULL, AV_LOG_ERROR,
430
-                   "internal error in cmp function selection\n");
431
-        }
432
-    }
433
-}
434
-
435
-#define BUTTERFLY2(o1, o2, i1, i2)              \
436
-    o1 = (i1) + (i2);                           \
437
-    o2 = (i1) - (i2);
438
-
439
-#define BUTTERFLY1(x, y)                        \
440
-    {                                           \
441
-        int a, b;                               \
442
-        a = x;                                  \
443
-        b = y;                                  \
444
-        x = a + b;                              \
445
-        y = a - b;                              \
446
-    }
447
-
448
-#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
449
-
450
-static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
451
-                               uint8_t *src, int stride, int h)
452
-{
453
-    int i, temp[64], sum = 0;
454
-
455
-    av_assert2(h == 8);
456
-
457
-    for (i = 0; i < 8; i++) {
458
-        // FIXME: try pointer walks
459
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
460
-                   src[stride * i + 0] - dst[stride * i + 0],
461
-                   src[stride * i + 1] - dst[stride * i + 1]);
462
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
463
-                   src[stride * i + 2] - dst[stride * i + 2],
464
-                   src[stride * i + 3] - dst[stride * i + 3]);
465
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
466
-                   src[stride * i + 4] - dst[stride * i + 4],
467
-                   src[stride * i + 5] - dst[stride * i + 5]);
468
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
469
-                   src[stride * i + 6] - dst[stride * i + 6],
470
-                   src[stride * i + 7] - dst[stride * i + 7]);
471
-
472
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
473
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
474
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
475
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
476
-
477
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
478
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
479
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
480
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
481
-    }
482
-
483
-    for (i = 0; i < 8; i++) {
484
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
485
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
486
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
487
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
488
-
489
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
490
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
491
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
492
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
493
-
494
-        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
495
-               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
496
-               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
497
-               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
498
-    }
499
-    return sum;
500
-}
501
-
502
-static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
503
-                                uint8_t *dummy, int stride, int h)
504
-{
505
-    int i, temp[64], sum = 0;
506
-
507
-    av_assert2(h == 8);
508
-
509
-    for (i = 0; i < 8; i++) {
510
-        // FIXME: try pointer walks
511
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
512
-                   src[stride * i + 0], src[stride * i + 1]);
513
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
514
-                   src[stride * i + 2], src[stride * i + 3]);
515
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
516
-                   src[stride * i + 4], src[stride * i + 5]);
517
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
518
-                   src[stride * i + 6], src[stride * i + 7]);
519
-
520
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
521
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
522
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
523
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
524
-
525
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
526
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
527
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
528
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
529
-    }
530
-
531
-    for (i = 0; i < 8; i++) {
532
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
533
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
534
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
535
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
536
-
537
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
538
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
539
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
540
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
541
-
542
-        sum +=
543
-            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
544
-            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
545
-            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
546
-            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
547
-    }
548
-
549
-    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
550
-
551
-    return sum;
552
-}
553
-
554
-static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
555
-                        uint8_t *src2, int stride, int h)
556
-{
557
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
558
-
559
-    av_assert2(h == 8);
560
-
561
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
562
-    s->fdsp.fdct(temp);
563
-    return s->dsp.sum_abs_dctelem(temp);
564
-}
565
-
566
-#if CONFIG_GPL
567
-#define DCT8_1D                                         \
568
-    {                                                   \
569
-        const int s07 = SRC(0) + SRC(7);                \
570
-        const int s16 = SRC(1) + SRC(6);                \
571
-        const int s25 = SRC(2) + SRC(5);                \
572
-        const int s34 = SRC(3) + SRC(4);                \
573
-        const int a0  = s07 + s34;                      \
574
-        const int a1  = s16 + s25;                      \
575
-        const int a2  = s07 - s34;                      \
576
-        const int a3  = s16 - s25;                      \
577
-        const int d07 = SRC(0) - SRC(7);                \
578
-        const int d16 = SRC(1) - SRC(6);                \
579
-        const int d25 = SRC(2) - SRC(5);                \
580
-        const int d34 = SRC(3) - SRC(4);                \
581
-        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
582
-        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
583
-        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
584
-        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
585
-        DST(0, a0 + a1);                                \
586
-        DST(1, a4 + (a7 >> 2));                         \
587
-        DST(2, a2 + (a3 >> 1));                         \
588
-        DST(3, a5 + (a6 >> 2));                         \
589
-        DST(4, a0 - a1);                                \
590
-        DST(5, a6 - (a5 >> 2));                         \
591
-        DST(6, (a2 >> 1) - a3);                         \
592
-        DST(7, (a4 >> 2) - a7);                         \
593
-    }
594
-
595
-static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
596
-                           uint8_t *src2, int stride, int h)
597
-{
598
-    int16_t dct[8][8];
599
-    int i, sum = 0;
600
-
601
-    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
602
-
603
-#define SRC(x) dct[i][x]
604
-#define DST(x, v) dct[i][x] = v
605
-    for (i = 0; i < 8; i++)
606
-        DCT8_1D
607
-#undef SRC
608
-#undef DST
609
-
610
-#define SRC(x) dct[x][i]
611
-#define DST(x, v) sum += FFABS(v)
612
-        for (i = 0; i < 8; i++)
613
-            DCT8_1D
614
-#undef SRC
615
-#undef DST
616
-            return sum;
617
-}
618
-#endif
619
-
620
-static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
621
-                        uint8_t *src2, int stride, int h)
622
-{
623
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
624
-    int sum = 0, i;
625
-
626
-    av_assert2(h == 8);
627
-
628
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
629
-    s->fdsp.fdct(temp);
630
-
631
-    for (i = 0; i < 64; i++)
632
-        sum = FFMAX(sum, FFABS(temp[i]));
633
-
634
-    return sum;
635
-}
636
-
637
-static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
638
-                           uint8_t *src2, int stride, int h)
639
-{
640
-    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
641
-    int16_t *const bak = temp + 64;
642
-    int sum = 0, i;
643
-
644
-    av_assert2(h == 8);
645
-    s->mb_intra = 0;
646
-
647
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
648
-
649
-    memcpy(bak, temp, 64 * sizeof(int16_t));
650
-
651
-    s->block_last_index[0 /* FIXME */] =
652
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
653
-    s->dct_unquantize_inter(s, temp, 0, s->qscale);
654
-    ff_simple_idct_8(temp); // FIXME
655
-
656
-    for (i = 0; i < 64; i++)
657
-        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
658
-
659
-    return sum;
660
-}
661
-
662
-static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
663
-                   int stride, int h)
664
-{
665
-    const uint8_t *scantable = s->intra_scantable.permutated;
666
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
667
-    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
668
-    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
669
-    int i, last, run, bits, level, distortion, start_i;
670
-    const int esc_length = s->ac_esc_length;
671
-    uint8_t *length, *last_length;
672
-
673
-    av_assert2(h == 8);
674
-
675
-    copy_block8(lsrc1, src1, 8, stride, 8);
676
-    copy_block8(lsrc2, src2, 8, stride, 8);
677
-
678
-    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
679
-
680
-    s->block_last_index[0 /* FIXME */] =
681
-    last                               =
682
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
683
-
684
-    bits = 0;
685
-
686
-    if (s->mb_intra) {
687
-        start_i     = 1;
688
-        length      = s->intra_ac_vlc_length;
689
-        last_length = s->intra_ac_vlc_last_length;
690
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
691
-    } else {
692
-        start_i     = 0;
693
-        length      = s->inter_ac_vlc_length;
694
-        last_length = s->inter_ac_vlc_last_length;
695
-    }
696
-
697
-    if (last >= start_i) {
698
-        run = 0;
699
-        for (i = start_i; i < last; i++) {
700
-            int j = scantable[i];
701
-            level = temp[j];
702
-
703
-            if (level) {
704
-                level += 64;
705
-                if ((level & (~127)) == 0)
706
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
707
-                else
708
-                    bits += esc_length;
709
-                run = 0;
710
-            } else
711
-                run++;
712
-        }
713
-        i = scantable[last];
714
-
715
-        level = temp[i] + 64;
716
-
717
-        av_assert2(level - 64);
718
-
719
-        if ((level & (~127)) == 0) {
720
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
721
-        } else
722
-            bits += esc_length;
723
-    }
724
-
725
-    if (last >= 0) {
726
-        if (s->mb_intra)
727
-            s->dct_unquantize_intra(s, temp, 0, s->qscale);
728
-        else
729
-            s->dct_unquantize_inter(s, temp, 0, s->qscale);
730
-    }
731
-
732
-    s->idsp.idct_add(lsrc2, 8, temp);
733
-
734
-    distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
735
-
736
-    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
737
-}
738
-
739
-static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
740
-                    int stride, int h)
741
-{
742
-    const uint8_t *scantable = s->intra_scantable.permutated;
743
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
744
-    int i, last, run, bits, level, start_i;
745
-    const int esc_length = s->ac_esc_length;
746
-    uint8_t *length, *last_length;
747
-
748
-    av_assert2(h == 8);
749
-
750
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
751
-
752
-    s->block_last_index[0 /* FIXME */] =
753
-    last                               =
754
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
755
-
756
-    bits = 0;
757
-
758
-    if (s->mb_intra) {
759
-        start_i     = 1;
760
-        length      = s->intra_ac_vlc_length;
761
-        last_length = s->intra_ac_vlc_last_length;
762
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
763
-    } else {
764
-        start_i     = 0;
765
-        length      = s->inter_ac_vlc_length;
766
-        last_length = s->inter_ac_vlc_last_length;
767
-    }
768
-
769
-    if (last >= start_i) {
770
-        run = 0;
771
-        for (i = start_i; i < last; i++) {
772
-            int j = scantable[i];
773
-            level = temp[j];
774
-
775
-            if (level) {
776
-                level += 64;
777
-                if ((level & (~127)) == 0)
778
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
779
-                else
780
-                    bits += esc_length;
781
-                run = 0;
782
-            } else
783
-                run++;
784
-        }
785
-        i = scantable[last];
786
-
787
-        level = temp[i] + 64;
788
-
789
-        av_assert2(level - 64);
790
-
791
-        if ((level & (~127)) == 0)
792
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
793
-        else
794
-            bits += esc_length;
795
-    }
796
-
797
-    return bits;
798
-}
799
-
800
-#define VSAD_INTRA(size)                                                \
801
-static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
802
-                                    uint8_t *s, uint8_t *dummy,         \
803
-                                    int stride, int h)                  \
804
-{                                                                       \
805
-    int score = 0, x, y;                                                \
806
-                                                                        \
807
-    for (y = 1; y < h; y++) {                                           \
808
-        for (x = 0; x < size; x += 4) {                                 \
809
-            score += FFABS(s[x]     - s[x + stride])     +              \
810
-                     FFABS(s[x + 1] - s[x + stride + 1]) +              \
811
-                     FFABS(s[x + 2] - s[x + 2 + stride]) +              \
812
-                     FFABS(s[x + 3] - s[x + 3 + stride]);               \
813
-        }                                                               \
814
-        s += stride;                                                    \
815
-    }                                                                   \
816
-                                                                        \
817
-    return score;                                                       \
818
-}
819
-VSAD_INTRA(8)
820
-VSAD_INTRA(16)
821
-
822
-#define VSAD(size)                                                             \
823
-static int vsad ## size ## _c(MpegEncContext *c,                               \
824
-                              uint8_t *s1, uint8_t *s2,                        \
825
-                              int stride, int h)                               \
826
-{                                                                              \
827
-    int score = 0, x, y;                                                       \
828
-                                                                               \
829
-    for (y = 1; y < h; y++) {                                                  \
830
-        for (x = 0; x < size; x++)                                             \
831
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
832
-        s1 += stride;                                                          \
833
-        s2 += stride;                                                          \
834
-    }                                                                          \
835
-                                                                               \
836
-    return score;                                                              \
837
-}
838
-VSAD(8)
839
-VSAD(16)
840
-
841
-#define SQ(a) ((a) * (a))
842
-#define VSSE_INTRA(size)                                                \
843
-static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
844
-                                    uint8_t *s, uint8_t *dummy,         \
845
-                                    int stride, int h)                  \
846
-{                                                                       \
847
-    int score = 0, x, y;                                                \
848
-                                                                        \
849
-    for (y = 1; y < h; y++) {                                           \
850
-        for (x = 0; x < size; x += 4) {                                 \
851
-            score += SQ(s[x]     - s[x + stride]) +                     \
852
-                     SQ(s[x + 1] - s[x + stride + 1]) +                 \
853
-                     SQ(s[x + 2] - s[x + stride + 2]) +                 \
854
-                     SQ(s[x + 3] - s[x + stride + 3]);                  \
855
-        }                                                               \
856
-        s += stride;                                                    \
857
-    }                                                                   \
858
-                                                                        \
859
-    return score;                                                       \
860
-}
861
-VSSE_INTRA(8)
862
-VSSE_INTRA(16)
863
-
864
-#define VSSE(size)                                                             \
865
-static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
866
-                    int stride, int h)                                         \
867
-{                                                                              \
868
-    int score = 0, x, y;                                                       \
869
-                                                                               \
870
-    for (y = 1; y < h; y++) {                                                  \
871
-        for (x = 0; x < size; x++)                                             \
872
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
873
-        s1 += stride;                                                          \
874
-        s2 += stride;                                                          \
875
-    }                                                                          \
876
-                                                                               \
877
-    return score;                                                              \
878
-}
879
-VSSE(8)
880
-VSSE(16)
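Note: the VSAD/VSSE macros above generate purely vertical activity metrics; the plain variants accumulate the row-to-row change of the residual, the *_intra variants the vertical gradient of a single block. These back FF_CMP_VSAD / FF_CMP_VSSE and are what avctx->ildct_cmp is typically set to for interlace decisions (see the dvenc.c hunk further down). As a direct transcription of the loops, for block width W:

$$ \mathrm{VSAD}(s_1,s_2)=\sum_{y=0}^{h-2}\sum_{x=0}^{W-1}\Bigl|\bigl(s_1[y,x]-s_2[y,x]\bigr)-\bigl(s_1[y+1,x]-s_2[y+1,x]\bigr)\Bigr| $$

$$ \mathrm{VSAD}_{\mathrm{intra}}(s)=\sum_{y=0}^{h-2}\sum_{x=0}^{W-1}\bigl|s[y,x]-s[y+1,x]\bigr| $$

VSSE and VSSE_intra are identical with the absolute value replaced by a square.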
881
-
882
-#define WRAPPER8_16_SQ(name8, name16)                                   \
883
-static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
884
-                  int stride, int h)                                    \
885
-{                                                                       \
886
-    int score = 0;                                                      \
887
-                                                                        \
888
-    score += name8(s, dst, src, stride, 8);                             \
889
-    score += name8(s, dst + 8, src + 8, stride, 8);                     \
890
-    if (h == 16) {                                                      \
891
-        dst   += 8 * stride;                                            \
892
-        src   += 8 * stride;                                            \
893
-        score += name8(s, dst, src, stride, 8);                         \
894
-        score += name8(s, dst + 8, src + 8, stride, 8);                 \
895
-    }                                                                   \
896
-    return score;                                                       \
897
-}
898
-
899
-WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
900
-WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
901
-WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
902
-#if CONFIG_GPL
903
-WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
904
-#endif
905
-WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
906
-WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
907
-WRAPPER8_16_SQ(rd8x8_c, rd16_c)
908
-WRAPPER8_16_SQ(bit8x8_c, bit16_c)
909
-
910
-/* init static data */
911
-av_cold void ff_dsputil_static_init(void)
912
-{
913
-    int i;
914
-
915
-    for (i = 0; i < 512; i++)
916
-        ff_square_tab[i] = (i - 256) * (i - 256);
917
-}
918
-
919
-int ff_check_alignment(void)
920
-{
921
-    static int did_fail = 0;
922
-    LOCAL_ALIGNED_16(int, aligned, [4]);
923
-
924
-    if ((intptr_t)aligned & 15) {
925
-        if (!did_fail) {
926
-#if HAVE_MMX || HAVE_ALTIVEC
927
-            av_log(NULL, AV_LOG_ERROR,
928
-                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
929
-                "and may be very slow or crash. This is not a bug in libavcodec,\n"
930
-                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
931
-                "Do not report crashes to FFmpeg developers.\n");
932
-#endif
933
-            did_fail=1;
934
-        }
935
-        return -1;
936
-    }
937
-    return 0;
938
-}
939
-
940
-av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
941
-{
942
-    ff_check_alignment();
943
-
944
-    c->sum_abs_dctelem = sum_abs_dctelem_c;
945
-
946
-    /* TODO [0] 16  [1] 8 */
947
-    c->pix_abs[0][0] = pix_abs16_c;
948
-    c->pix_abs[0][1] = pix_abs16_x2_c;
949
-    c->pix_abs[0][2] = pix_abs16_y2_c;
950
-    c->pix_abs[0][3] = pix_abs16_xy2_c;
951
-    c->pix_abs[1][0] = pix_abs8_c;
952
-    c->pix_abs[1][1] = pix_abs8_x2_c;
953
-    c->pix_abs[1][2] = pix_abs8_y2_c;
954
-    c->pix_abs[1][3] = pix_abs8_xy2_c;
955
-
956
-#define SET_CMP_FUNC(name)                      \
957
-    c->name[0] = name ## 16_c;                  \
958
-    c->name[1] = name ## 8x8_c;
959
-
960
-    SET_CMP_FUNC(hadamard8_diff)
961
-    c->hadamard8_diff[4] = hadamard8_intra16_c;
962
-    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
963
-    SET_CMP_FUNC(dct_sad)
964
-    SET_CMP_FUNC(dct_max)
965
-#if CONFIG_GPL
966
-    SET_CMP_FUNC(dct264_sad)
967
-#endif
968
-    c->sad[0] = pix_abs16_c;
969
-    c->sad[1] = pix_abs8_c;
970
-    c->sse[0] = sse16_c;
971
-    c->sse[1] = sse8_c;
972
-    c->sse[2] = sse4_c;
973
-    SET_CMP_FUNC(quant_psnr)
974
-    SET_CMP_FUNC(rd)
975
-    SET_CMP_FUNC(bit)
976
-    c->vsad[0] = vsad16_c;
977
-    c->vsad[1] = vsad8_c;
978
-    c->vsad[4] = vsad_intra16_c;
979
-    c->vsad[5] = vsad_intra8_c;
980
-    c->vsse[0] = vsse16_c;
981
-    c->vsse[1] = vsse8_c;
982
-    c->vsse[4] = vsse_intra16_c;
983
-    c->vsse[5] = vsse_intra8_c;
984
-    c->nsse[0] = nsse16_c;
985
-    c->nsse[1] = nsse8_c;
986
-#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
987
-    ff_dsputil_init_dwt(c);
988
-#endif
989
-
990
-    if (ARCH_ALPHA)
991
-        ff_dsputil_init_alpha(c, avctx);
992
-    if (ARCH_ARM)
993
-        ff_dsputil_init_arm(c, avctx);
994
-    if (ARCH_PPC)
995
-        ff_dsputil_init_ppc(c, avctx);
996
-    if (ARCH_X86)
997
-        ff_dsputil_init_x86(c, avctx);
998
-}
999
-
1000
-av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
1001
-{
1002
-    ff_dsputil_init(c, avctx);
1003
-}
1004
-
1005
-av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
1006
-{
1007
-    ff_dsputil_init(c, avctx);
1008
-}
1009 1
deleted file mode 100644
... ...
@@ -1,108 +0,0 @@
1
-/*
2
- * DSP utils
3
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * This file is part of FFmpeg.
7
- *
8
- * FFmpeg is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * FFmpeg is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with FFmpeg; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-/**
24
- * @file
25
- * DSP utils.
26
- * Note: many functions in here may use MMX, which trashes the FPU state; it is
27
- * absolutely necessary to call emms_c() between DSP and float/double code.
28
- */
29
-
30
-#ifndef AVCODEC_DSPUTIL_H
31
-#define AVCODEC_DSPUTIL_H
32
-
33
-#include "avcodec.h"
34
-
35
-extern uint32_t ff_square_tab[512];
36
-
37
-
38
-/* minimum alignment rules ;)
39
- * If you notice errors in the alignment requirements, need more alignment for
40
- * some ASM code on some CPU, or need a function that accepts less aligned data,
41
- * then send a mail to the ffmpeg-devel mailing list, ...
42
- *
43
- * !warning These alignments might not match reality (attribute((align))
44
- * annotations may be missing somewhere).
45
- * I (Michael) did not check them; these are just the alignments which I think
46
- * could be reached easily ...
47
- *
48
- * !future video codecs might need functions with less strict alignment
49
- */
50
-
51
-struct MpegEncContext;
52
-/* Motion estimation:
53
- * h is limited to { width / 2, width, 2 * width },
54
- * but never larger than 16 and never smaller than 2.
55
- * Although currently h < 4 is not used as functions with
56
- * width < 8 are neither used nor implemented. */
57
-typedef int (*me_cmp_func)(struct MpegEncContext *c,
58
-                           uint8_t *blk1 /* align width (8 or 16) */,
59
-                           uint8_t *blk2 /* align 1 */, int line_size, int h);
60
-
61
-/**
62
- * DSPContext.
63
- */
64
-typedef struct DSPContext {
65
-    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
66
-
67
-    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
68
-    me_cmp_func sse[6];
69
-    me_cmp_func hadamard8_diff[6];
70
-    me_cmp_func dct_sad[6];
71
-    me_cmp_func quant_psnr[6];
72
-    me_cmp_func bit[6];
73
-    me_cmp_func rd[6];
74
-    me_cmp_func vsad[6];
75
-    me_cmp_func vsse[6];
76
-    me_cmp_func nsse[6];
77
-    me_cmp_func w53[6];
78
-    me_cmp_func w97[6];
79
-    me_cmp_func dct_max[6];
80
-    me_cmp_func dct264_sad[6];
81
-
82
-    me_cmp_func me_pre_cmp[6];
83
-    me_cmp_func me_cmp[6];
84
-    me_cmp_func me_sub_cmp[6];
85
-    me_cmp_func mb_cmp[6];
86
-    me_cmp_func ildct_cmp[6]; // only width 16 used
87
-    me_cmp_func frame_skip_cmp[6]; // only width 8 used
88
-
89
-    me_cmp_func pix_abs[2][4];
90
-} DSPContext;
91
-
92
-void ff_dsputil_static_init(void);
93
-void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx);
94
-void avpriv_dsputil_init(DSPContext* p, AVCodecContext *avctx);
95
-attribute_deprecated void dsputil_init(DSPContext* c, AVCodecContext *avctx);
96
-
97
-int ff_check_alignment(void);
98
-
99
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type);
100
-
101
-void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
102
-void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx);
103
-void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx);
104
-void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx);
105
-
106
-void ff_dsputil_init_dwt(DSPContext *c);
107
-
108
-#endif /* AVCODEC_DSPUTIL_H */
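Note: the header above documents the me_cmp_func contract that survives this commit unchanged (blk1 aligned to the block width, blk2 only byte-aligned, 2 <= h <= 16). For orientation, a minimal comparator honouring that contract looks like the hedged sketch below; it is essentially pix_abs8_c from the new me_cmp.c further down, written against the struct MpegEncContext forward declaration only, and is not part of the patch.

    #include <stdint.h>
    #include <stdlib.h>

    struct MpegEncContext;            /* opaque here, as in the header above */

    /* Minimal SAD me_cmp_func: blk1/blk2 are 8-pixel-wide blocks sharing
     * the stride line_size, h is the number of rows (2..16). */
    static int sad8_example(struct MpegEncContext *c, uint8_t *blk1,
                            uint8_t *blk2, int line_size, int h)
    {
        int sum = 0;

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                sum += abs(blk1[x] - blk2[x]);
            blk1 += line_size;
            blk2 += line_size;
        }
        return sum;
    }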
... ...
@@ -28,7 +28,7 @@
28 28
 #define AVCODEC_DV_H
29 29
 
30 30
 #include "avcodec.h"
31
-#include "dsputil.h"
31
+#include "me_cmp.h"
32 32
 #include "get_bits.h"
33 33
 #include "dv_profile.h"
34 34
 
... ...
@@ -19,7 +19,6 @@
19 19
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 20
  */
21 21
 #include "avcodec.h"
22
-#include "dsputil.h"
23 22
 #include "get_bits.h"
24 23
 #include "parser.h"
25 24
 
... ...
@@ -28,9 +28,9 @@
28 28
 #include "libavutil/pixdesc.h"
29 29
 #include "config.h"
30 30
 #include "avcodec.h"
31
-#include "dsputil.h"
32 31
 #include "fdctdsp.h"
33 32
 #include "internal.h"
33
+#include "me_cmp.h"
34 34
 #include "pixblockdsp.h"
35 35
 #include "put_bits.h"
36 36
 #include "dv.h"
... ...
@@ -40,8 +40,8 @@
40 40
 static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
41 41
 {
42 42
     DVVideoContext *s = avctx->priv_data;
43
-    DSPContext dsp;
44 43
     FDCTDSPContext fdsp;
44
+    MECmpContext mecc;
45 45
     PixblockDSPContext pdsp;
46 46
     int ret;
47 47
 
... ...
@@ -69,14 +69,14 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
69 69
 
70 70
     dv_vlc_map_tableinit();
71 71
 
72
-    memset(&dsp,0, sizeof(dsp));
73
-    ff_dsputil_init(&dsp, avctx);
72
+    memset(&mecc,0, sizeof(mecc));
74 73
     ff_fdctdsp_init(&fdsp, avctx);
74
+    ff_me_cmp_init(&mecc, avctx);
75 75
     ff_pixblockdsp_init(&pdsp, avctx);
76
-    ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp);
76
+    ff_set_cmp(&mecc, mecc.ildct_cmp, avctx->ildct_cmp);
77 77
 
78 78
     s->get_pixels = pdsp.get_pixels;
79
-    s->ildct_cmp  = dsp.ildct_cmp[5];
79
+    s->ildct_cmp  = mecc.ildct_cmp[5];
80 80
 
81 81
     s->fdct[0]    = fdsp.fdct;
82 82
     s->fdct[1]    = fdsp.fdct248;
... ...
@@ -739,12 +739,12 @@ static int is_intra_more_likely(ERContext *s)
739 739
                 } else {
740 740
                     ff_thread_await_progress(s->last_pic.tf, mb_y, 0);
741 741
                 }
742
-                is_intra_likely += s->dsp->sad[0](NULL, last_mb_ptr, mb_ptr,
743
-                                                 linesize[0], 16);
742
+                is_intra_likely += s->mecc->sad[0](NULL, last_mb_ptr, mb_ptr,
743
+                                                   linesize[0], 16);
744 744
                 // FIXME need await_progress() here
745
-                is_intra_likely -= s->dsp->sad[0](NULL, last_mb_ptr,
746
-                                                 last_mb_ptr + linesize[0] * 16,
747
-                                                 linesize[0], 16);
745
+                is_intra_likely -= s->mecc->sad[0](NULL, last_mb_ptr,
746
+                                                   last_mb_ptr + linesize[0] * 16,
747
+                                                   linesize[0], 16);
748 748
             } else {
749 749
                 if (IS_INTRA(s->cur_pic.mb_type[mb_xy]))
750 750
                    is_intra_likely++;
... ...
@@ -23,7 +23,7 @@
23 23
 #include <stdint.h>
24 24
 
25 25
 #include "avcodec.h"
26
-#include "dsputil.h"
26
+#include "me_cmp.h"
27 27
 #include "thread.h"
28 28
 
29 29
 ///< current MB is the first after a resync marker
... ...
@@ -52,7 +52,7 @@ typedef struct ERPicture {
52 52
 
53 53
 typedef struct ERContext {
54 54
     AVCodecContext *avctx;
55
-    DSPContext *dsp;
55
+    MECmpContext *mecc;
56 56
 
57 57
     int *mb_index2xy;
58 58
     int mb_num;
... ...
@@ -36,7 +36,6 @@
36 36
 #include "internal.h"
37 37
 #include "cabac.h"
38 38
 #include "cabac_functions.h"
39
-#include "dsputil.h"
40 39
 #include "error_resilience.h"
41 40
 #include "avcodec.h"
42 41
 #include "h264.h"
... ...
@@ -45,6 +44,7 @@
45 45
 #include "h264_mvpred.h"
46 46
 #include "golomb.h"
47 47
 #include "mathops.h"
48
+#include "me_cmp.h"
48 49
 #include "mpegutils.h"
49 50
 #include "rectangle.h"
50 51
 #include "svq3.h"
... ...
@@ -515,7 +515,7 @@ int ff_h264_context_init(H264Context *h)
515 515
     if (CONFIG_ERROR_RESILIENCE) {
516 516
         /* init ER */
517 517
         er->avctx          = h->avctx;
518
-        er->dsp            = &h->dsp;
518
+        er->mecc           = &h->mecc;
519 519
         er->decode_mb      = h264_er_decode_mb;
520 520
         er->opaque         = h;
521 521
         er->quarter_sample = 1;
... ...
@@ -653,7 +653,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
653 653
 
654 654
     /* needed so that IDCT permutation is known early */
655 655
     if (CONFIG_ERROR_RESILIENCE)
656
-        ff_dsputil_init(&h->dsp, h->avctx);
656
+        ff_me_cmp_init(&h->mecc, h->avctx);
657 657
     ff_videodsp_init(&h->vdsp, 8);
658 658
 
659 659
     memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t));
... ...
@@ -1266,7 +1266,7 @@ int ff_h264_set_parameter_from_sps(H264Context *h)
1266 1266
                               h->sps.chroma_format_idc);
1267 1267
 
1268 1268
             if (CONFIG_ERROR_RESILIENCE)
1269
-                ff_dsputil_init(&h->dsp, h->avctx);
1269
+                ff_me_cmp_init(&h->mecc, h->avctx);
1270 1270
             ff_videodsp_init(&h->vdsp, h->sps.bit_depth_luma);
1271 1271
         } else {
1272 1272
             av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n",
... ...
@@ -30,13 +30,13 @@
30 30
 
31 31
 #include "libavutil/intreadwrite.h"
32 32
 #include "cabac.h"
33
-#include "dsputil.h"
34 33
 #include "error_resilience.h"
35 34
 #include "get_bits.h"
36 35
 #include "h264chroma.h"
37 36
 #include "h264dsp.h"
38 37
 #include "h264pred.h"
39 38
 #include "h264qpel.h"
39
+#include "me_cmp.h"
40 40
 #include "mpegutils.h"
41 41
 #include "parser.h"
42 42
 #include "qpeldsp.h"
... ...
@@ -338,13 +338,13 @@ typedef struct H264Picture {
338 338
  */
339 339
 typedef struct H264Context {
340 340
     AVCodecContext *avctx;
341
+    MECmpContext mecc;
341 342
     VideoDSPContext vdsp;
342 343
     H264DSPContext h264dsp;
343 344
     H264ChromaContext h264chroma;
344 345
     H264QpelContext h264qpel;
345 346
     ParseContext parse_context;
346 347
     GetBitContext gb;
347
-    DSPContext       dsp;
348 348
     ERContext er;
349 349
 
350 350
     H264Picture *DPB;
... ...
@@ -31,7 +31,6 @@
31 31
 #include "internal.h"
32 32
 #include "cabac.h"
33 33
 #include "cabac_functions.h"
34
-#include "dsputil.h"
35 34
 #include "error_resilience.h"
36 35
 #include "avcodec.h"
37 36
 #include "h264.h"
... ...
@@ -1203,7 +1202,7 @@ static int h264_slice_header_init(H264Context *h, int reinit)
1203 1203
                 return AVERROR(ENOMEM);
1204 1204
             c->avctx             = h->avctx;
1205 1205
             if (CONFIG_ERROR_RESILIENCE) {
1206
-                c->dsp               = h->dsp;
1206
+                c->mecc              = h->mecc;
1207 1207
             }
1208 1208
             c->vdsp              = h->vdsp;
1209 1209
             c->h264dsp           = h->h264dsp;
... ...
@@ -3,8 +3,6 @@ LIBAVCODEC_$MAJOR {
3 3
                 #deprecated, remove after next bump
4 4
                 audio_resample;
5 5
                 audio_resample_close;
6
-                dsputil_init;
7
-                ff_dsputil_init;
8 6
                 ff_find_pix_fmt;
9 7
                 ff_framenum_to_drop_timecode;
10 8
                 ff_framenum_to_smtpe_timecode;
... ...
@@ -30,5 +28,6 @@ LIBAVCODEC_$MAJOR {
30 30
                 ff_idctdsp_init;
31 31
                 ff_fdctdsp_init;
32 32
                 ff_pixblockdsp_init;
33
+                ff_me_cmp_init;
33 34
         local:  *;
34 35
 };
35 36
new file mode 100644
... ...
@@ -0,0 +1,988 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include "libavutil/attributes.h"
19
+#include "libavutil/internal.h"
20
+#include "avcodec.h"
21
+#include "copy_block.h"
22
+#include "simple_idct.h"
23
+#include "me_cmp.h"
24
+#include "mpegvideo.h"
25
+#include "config.h"
26
+
27
+uint32_t ff_square_tab[512] = { 0, };
28
+
29
+static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
30
+                  int line_size, int h)
31
+{
32
+    int s = 0, i;
33
+    uint32_t *sq = ff_square_tab + 256;
34
+
35
+    for (i = 0; i < h; i++) {
36
+        s    += sq[pix1[0] - pix2[0]];
37
+        s    += sq[pix1[1] - pix2[1]];
38
+        s    += sq[pix1[2] - pix2[2]];
39
+        s    += sq[pix1[3] - pix2[3]];
40
+        pix1 += line_size;
41
+        pix2 += line_size;
42
+    }
43
+    return s;
44
+}
45
+
46
+static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
47
+                  int line_size, int h)
48
+{
49
+    int s = 0, i;
50
+    uint32_t *sq = ff_square_tab + 256;
51
+
52
+    for (i = 0; i < h; i++) {
53
+        s    += sq[pix1[0] - pix2[0]];
54
+        s    += sq[pix1[1] - pix2[1]];
55
+        s    += sq[pix1[2] - pix2[2]];
56
+        s    += sq[pix1[3] - pix2[3]];
57
+        s    += sq[pix1[4] - pix2[4]];
58
+        s    += sq[pix1[5] - pix2[5]];
59
+        s    += sq[pix1[6] - pix2[6]];
60
+        s    += sq[pix1[7] - pix2[7]];
61
+        pix1 += line_size;
62
+        pix2 += line_size;
63
+    }
64
+    return s;
65
+}
66
+
67
+static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
68
+                   int line_size, int h)
69
+{
70
+    int s = 0, i;
71
+    uint32_t *sq = ff_square_tab + 256;
72
+
73
+    for (i = 0; i < h; i++) {
74
+        s += sq[pix1[0]  - pix2[0]];
75
+        s += sq[pix1[1]  - pix2[1]];
76
+        s += sq[pix1[2]  - pix2[2]];
77
+        s += sq[pix1[3]  - pix2[3]];
78
+        s += sq[pix1[4]  - pix2[4]];
79
+        s += sq[pix1[5]  - pix2[5]];
80
+        s += sq[pix1[6]  - pix2[6]];
81
+        s += sq[pix1[7]  - pix2[7]];
82
+        s += sq[pix1[8]  - pix2[8]];
83
+        s += sq[pix1[9]  - pix2[9]];
84
+        s += sq[pix1[10] - pix2[10]];
85
+        s += sq[pix1[11] - pix2[11]];
86
+        s += sq[pix1[12] - pix2[12]];
87
+        s += sq[pix1[13] - pix2[13]];
88
+        s += sq[pix1[14] - pix2[14]];
89
+        s += sq[pix1[15] - pix2[15]];
90
+
91
+        pix1 += line_size;
92
+        pix2 += line_size;
93
+    }
94
+    return s;
95
+}
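Note: sse4_c/sse8_c/sse16_c above avoid a per-pixel multiply by indexing ff_square_tab with the signed pixel difference; the pointer is offset by 256 so indices -255..255 are valid, and entry i holds (i - 256)^2 (as the removed ff_dsputil_static_init() filled it; the new file presumably does the same in its init code). A standalone illustration of the trick, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t square_tab[512];

    int main(void)
    {
        /* entry i holds (i - 256)^2, so (square_tab + 256)[d] == d*d
         * for any difference d in [-255, 255] */
        for (int i = 0; i < 512; i++)
            square_tab[i] = (i - 256) * (i - 256);

        const uint32_t *sq = square_tab + 256;
        uint8_t a = 200, b = 13;

        printf("%u\n", sq[a - b]);  /* 187*187 = 34969 */
        printf("%u\n", sq[b - a]);  /* negative index works too: 34969 */
        return 0;
    }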
96
+
97
+static int sum_abs_dctelem_c(int16_t *block)
98
+{
99
+    int sum = 0, i;
100
+
101
+    for (i = 0; i < 64; i++)
102
+        sum += FFABS(block[i]);
103
+    return sum;
104
+}
105
+
106
+#define avg2(a, b) ((a + b + 1) >> 1)
107
+#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
108
+
109
+static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
110
+                              int line_size, int h)
111
+{
112
+    int s = 0, i;
113
+
114
+    for (i = 0; i < h; i++) {
115
+        s    += abs(pix1[0]  - pix2[0]);
116
+        s    += abs(pix1[1]  - pix2[1]);
117
+        s    += abs(pix1[2]  - pix2[2]);
118
+        s    += abs(pix1[3]  - pix2[3]);
119
+        s    += abs(pix1[4]  - pix2[4]);
120
+        s    += abs(pix1[5]  - pix2[5]);
121
+        s    += abs(pix1[6]  - pix2[6]);
122
+        s    += abs(pix1[7]  - pix2[7]);
123
+        s    += abs(pix1[8]  - pix2[8]);
124
+        s    += abs(pix1[9]  - pix2[9]);
125
+        s    += abs(pix1[10] - pix2[10]);
126
+        s    += abs(pix1[11] - pix2[11]);
127
+        s    += abs(pix1[12] - pix2[12]);
128
+        s    += abs(pix1[13] - pix2[13]);
129
+        s    += abs(pix1[14] - pix2[14]);
130
+        s    += abs(pix1[15] - pix2[15]);
131
+        pix1 += line_size;
132
+        pix2 += line_size;
133
+    }
134
+    return s;
135
+}
136
+
137
+static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
138
+                          int line_size, int h)
139
+{
140
+    int s = 0, i;
141
+
142
+    for (i = 0; i < h; i++) {
143
+        s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
144
+        s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
145
+        s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
146
+        s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
147
+        s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
148
+        s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
149
+        s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
150
+        s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
151
+        s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
152
+        s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
153
+        s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
154
+        s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
155
+        s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
156
+        s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
157
+        s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
158
+        s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
159
+        pix1 += line_size;
160
+        pix2 += line_size;
161
+    }
162
+    return s;
163
+}
164
+
165
+static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
166
+                          int line_size, int h)
167
+{
168
+    int s = 0, i;
169
+    uint8_t *pix3 = pix2 + line_size;
170
+
171
+    for (i = 0; i < h; i++) {
172
+        s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
173
+        s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
174
+        s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
175
+        s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
176
+        s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
177
+        s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
178
+        s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
179
+        s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
180
+        s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
181
+        s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
182
+        s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
183
+        s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
184
+        s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
185
+        s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
186
+        s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
187
+        s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
188
+        pix1 += line_size;
189
+        pix2 += line_size;
190
+        pix3 += line_size;
191
+    }
192
+    return s;
193
+}
194
+
195
+static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
196
+                           int line_size, int h)
197
+{
198
+    int s = 0, i;
199
+    uint8_t *pix3 = pix2 + line_size;
200
+
201
+    for (i = 0; i < h; i++) {
202
+        s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
203
+        s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
204
+        s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
205
+        s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
206
+        s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
207
+        s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
208
+        s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
209
+        s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
210
+        s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
211
+        s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
212
+        s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
213
+        s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
214
+        s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
215
+        s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
216
+        s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
217
+        s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
218
+        pix1 += line_size;
219
+        pix2 += line_size;
220
+        pix3 += line_size;
221
+    }
222
+    return s;
223
+}
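Note: the pix_abs16_x2/_y2/_xy2 variants above compute SAD against a reference shifted by half a pixel horizontally, vertically or both; avg2() is the rounded mean of two neighbouring samples and avg4() the rounded mean of a 2x2 neighbourhood, i.e. the usual rounded half-pel interpolation. A tiny standalone check of the rounding (illustrative only, not part of the patch):

    #include <stdio.h>

    #define avg2(a, b)       (((a) + (b) + 1) >> 1)
    #define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)

    int main(void)
    {
        printf("avg2(3,4)     = %d\n", avg2(3, 4));       /* (3+4+1)>>1 == 4, rounds up */
        printf("avg4(1,2,3,4) = %d\n", avg4(1, 2, 3, 4)); /* (10+2)>>2  == 3            */
        return 0;
    }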
224
+
225
+static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
226
+                             int line_size, int h)
227
+{
228
+    int s = 0, i;
229
+
230
+    for (i = 0; i < h; i++) {
231
+        s    += abs(pix1[0] - pix2[0]);
232
+        s    += abs(pix1[1] - pix2[1]);
233
+        s    += abs(pix1[2] - pix2[2]);
234
+        s    += abs(pix1[3] - pix2[3]);
235
+        s    += abs(pix1[4] - pix2[4]);
236
+        s    += abs(pix1[5] - pix2[5]);
237
+        s    += abs(pix1[6] - pix2[6]);
238
+        s    += abs(pix1[7] - pix2[7]);
239
+        pix1 += line_size;
240
+        pix2 += line_size;
241
+    }
242
+    return s;
243
+}
244
+
245
+static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
246
+                         int line_size, int h)
247
+{
248
+    int s = 0, i;
249
+
250
+    for (i = 0; i < h; i++) {
251
+        s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
252
+        s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
253
+        s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
254
+        s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
255
+        s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
256
+        s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
257
+        s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
258
+        s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
259
+        pix1 += line_size;
260
+        pix2 += line_size;
261
+    }
262
+    return s;
263
+}
264
+
265
+static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
266
+                         int line_size, int h)
267
+{
268
+    int s = 0, i;
269
+    uint8_t *pix3 = pix2 + line_size;
270
+
271
+    for (i = 0; i < h; i++) {
272
+        s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
273
+        s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
274
+        s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
275
+        s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
276
+        s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
277
+        s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
278
+        s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
279
+        s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
280
+        pix1 += line_size;
281
+        pix2 += line_size;
282
+        pix3 += line_size;
283
+    }
284
+    return s;
285
+}
286
+
287
+static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
288
+                          int line_size, int h)
289
+{
290
+    int s = 0, i;
291
+    uint8_t *pix3 = pix2 + line_size;
292
+
293
+    for (i = 0; i < h; i++) {
294
+        s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
295
+        s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
296
+        s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
297
+        s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
298
+        s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
299
+        s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
300
+        s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
301
+        s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
302
+        pix1 += line_size;
303
+        pix2 += line_size;
304
+        pix3 += line_size;
305
+    }
306
+    return s;
307
+}
308
+
309
+static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
310
+{
311
+    int score1 = 0, score2 = 0, x, y;
312
+
313
+    for (y = 0; y < h; y++) {
314
+        for (x = 0; x < 16; x++)
315
+            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
316
+        if (y + 1 < h) {
317
+            for (x = 0; x < 15; x++)
318
+                score2 += FFABS(s1[x]     - s1[x + stride] -
319
+                                s1[x + 1] + s1[x + stride + 1]) -
320
+                          FFABS(s2[x]     - s2[x + stride] -
321
+                                s2[x + 1] + s2[x + stride + 1]);
322
+        }
323
+        s1 += stride;
324
+        s2 += stride;
325
+    }
326
+
327
+    if (c)
328
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
329
+    else
330
+        return score1 + FFABS(score2) * 8;
331
+}
332
+
333
+static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
334
+{
335
+    int score1 = 0, score2 = 0, x, y;
336
+
337
+    for (y = 0; y < h; y++) {
338
+        for (x = 0; x < 8; x++)
339
+            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
340
+        if (y + 1 < h) {
341
+            for (x = 0; x < 7; x++)
342
+                score2 += FFABS(s1[x]     - s1[x + stride] -
343
+                                s1[x + 1] + s1[x + stride + 1]) -
344
+                          FFABS(s2[x]     - s2[x + stride] -
345
+                                s2[x + 1] + s2[x + stride + 1]);
346
+        }
347
+        s1 += stride;
348
+        s2 += stride;
349
+    }
350
+
351
+    if (c)
352
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
353
+    else
354
+        return score1 + FFABS(score2) * 8;
355
+}
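Note: nsse16_c/nsse8_c above implement the FF_CMP_NSSE ("noise preserving SSE") metric: plain SSE plus a penalty for changing the amount of local 2x2 texture, weighted by avctx->nsse_weight (8 when no context is available). A direct transcription of the loops, for block width W:

$$ \mathrm{score}_1=\sum_{y=0}^{h-1}\sum_{x=0}^{W-1}\bigl(s_1[y,x]-s_2[y,x]\bigr)^2 $$

$$ g_k[y,x]=s_k[y,x]-s_k[y+1,x]-s_k[y,x+1]+s_k[y+1,x+1] $$

$$ \mathrm{score}_2=\sum_{y=0}^{h-2}\sum_{x=0}^{W-2}\bigl(\lvert g_1[y,x]\rvert-\lvert g_2[y,x]\rvert\bigr),\qquad \mathrm{NSSE}=\mathrm{score}_1+\mathrm{nsse\_weight}\cdot\lvert\mathrm{score}_2\rvert $$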
356
+
357
+static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
358
+                    int stride, int h)
359
+{
360
+    return 0;
361
+}
362
+
363
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
364
+{
365
+    int i;
366
+
367
+    memset(cmp, 0, sizeof(void *) * 6);
368
+
369
+    for (i = 0; i < 6; i++) {
370
+        switch (type & 0xFF) {
371
+        case FF_CMP_SAD:
372
+            cmp[i] = c->sad[i];
373
+            break;
374
+        case FF_CMP_SATD:
375
+            cmp[i] = c->hadamard8_diff[i];
376
+            break;
377
+        case FF_CMP_SSE:
378
+            cmp[i] = c->sse[i];
379
+            break;
380
+        case FF_CMP_DCT:
381
+            cmp[i] = c->dct_sad[i];
382
+            break;
383
+        case FF_CMP_DCT264:
384
+            cmp[i] = c->dct264_sad[i];
385
+            break;
386
+        case FF_CMP_DCTMAX:
387
+            cmp[i] = c->dct_max[i];
388
+            break;
389
+        case FF_CMP_PSNR:
390
+            cmp[i] = c->quant_psnr[i];
391
+            break;
392
+        case FF_CMP_BIT:
393
+            cmp[i] = c->bit[i];
394
+            break;
395
+        case FF_CMP_RD:
396
+            cmp[i] = c->rd[i];
397
+            break;
398
+        case FF_CMP_VSAD:
399
+            cmp[i] = c->vsad[i];
400
+            break;
401
+        case FF_CMP_VSSE:
402
+            cmp[i] = c->vsse[i];
403
+            break;
404
+        case FF_CMP_ZERO:
405
+            cmp[i] = zero_cmp;
406
+            break;
407
+        case FF_CMP_NSSE:
408
+            cmp[i] = c->nsse[i];
409
+            break;
410
+#if CONFIG_DWT
411
+        case FF_CMP_W53:
412
+            cmp[i]= c->w53[i];
413
+            break;
414
+        case FF_CMP_W97:
415
+            cmp[i]= c->w97[i];
416
+            break;
417
+#endif
418
+        default:
419
+            av_log(NULL, AV_LOG_ERROR,
420
+                   "internal error in cmp function selection\n");
421
+        }
422
+    }
423
+}
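Note: ff_set_cmp() above resolves one FF_CMP_* value into an array of comparators; by the conventions of ff_me_cmp_init() below, index 0 is the 16x16 variant, index 1 the 8x8 one, and indices 4/5 the intra variants where they exist (vsad/vsse/hadamard). The dvenc.c hunk earlier in this commit shows the typical caller; below is a hedged sketch of the same pattern (compiles only inside libavcodec, assumes me_cmp.h exports the me_cmp_func typedef from the removed dsputil.h, and is not part of the patch):

    #include "avcodec.h"
    #include "me_cmp.h"

    /* Cost of one 16x16 block under the user-selected macroblock metric
     * (avctx->mb_cmp), e.g. FF_CMP_SAD, FF_CMP_SATD or FF_CMP_SSE. */
    static int block_cost(AVCodecContext *avctx, struct MpegEncContext *s,
                          uint8_t *cur, uint8_t *ref, int stride)
    {
        MECmpContext mecc;
        me_cmp_func  cmp[6];

        ff_me_cmp_init(&mecc, avctx);           /* C versions + SIMD overrides */
        ff_set_cmp(&mecc, cmp, avctx->mb_cmp);  /* FF_CMP_* -> function table  */

        return cmp[0](s, cur, ref, stride, 16); /* [0] == 16x16 variant */
    }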
424
+
425
+#define BUTTERFLY2(o1, o2, i1, i2)              \
426
+    o1 = (i1) + (i2);                           \
427
+    o2 = (i1) - (i2);
428
+
429
+#define BUTTERFLY1(x, y)                        \
430
+    {                                           \
431
+        int a, b;                               \
432
+        a = x;                                  \
433
+        b = y;                                  \
434
+        x = a + b;                              \
435
+        y = a - b;                              \
436
+    }
437
+
438
+#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
439
+
440
+static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
441
+                               uint8_t *src, int stride, int h)
442
+{
443
+    int i, temp[64], sum = 0;
444
+
445
+    av_assert2(h == 8);
446
+
447
+    for (i = 0; i < 8; i++) {
448
+        // FIXME: try pointer walks
449
+        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
450
+                   src[stride * i + 0] - dst[stride * i + 0],
451
+                   src[stride * i + 1] - dst[stride * i + 1]);
452
+        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
453
+                   src[stride * i + 2] - dst[stride * i + 2],
454
+                   src[stride * i + 3] - dst[stride * i + 3]);
455
+        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
456
+                   src[stride * i + 4] - dst[stride * i + 4],
457
+                   src[stride * i + 5] - dst[stride * i + 5]);
458
+        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
459
+                   src[stride * i + 6] - dst[stride * i + 6],
460
+                   src[stride * i + 7] - dst[stride * i + 7]);
461
+
462
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
463
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
464
+        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
465
+        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
466
+
467
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
468
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
469
+        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
470
+        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
471
+    }
472
+
473
+    for (i = 0; i < 8; i++) {
474
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
475
+        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
476
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
477
+        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
478
+
479
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
480
+        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
481
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
482
+        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
483
+
484
+        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
485
+               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
486
+               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
487
+               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
488
+    }
489
+    return sum;
490
+}
491
+
492
+static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
493
+                                uint8_t *dummy, int stride, int h)
494
+{
495
+    int i, temp[64], sum = 0;
496
+
497
+    av_assert2(h == 8);
498
+
499
+    for (i = 0; i < 8; i++) {
500
+        // FIXME: try pointer walks
501
+        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
502
+                   src[stride * i + 0], src[stride * i + 1]);
503
+        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
504
+                   src[stride * i + 2], src[stride * i + 3]);
505
+        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
506
+                   src[stride * i + 4], src[stride * i + 5]);
507
+        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
508
+                   src[stride * i + 6], src[stride * i + 7]);
509
+
510
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
511
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
512
+        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
513
+        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
514
+
515
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
516
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
517
+        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
518
+        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
519
+    }
520
+
521
+    for (i = 0; i < 8; i++) {
522
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
523
+        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
524
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
525
+        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
526
+
527
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
528
+        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
529
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
530
+        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
531
+
532
+        sum +=
533
+            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
534
+            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
535
+            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
536
+            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
537
+    }
538
+
539
+    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
540
+
541
+    return sum;
542
+}
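Note: the two functions above are the FF_CMP_SATD metric: BUTTERFLY2/BUTTERFLY1 apply an 8x8 Walsh-Hadamard transform (rows, then columns) to the difference block (or, for the intra variant, to the source block itself, with the DC term subtracted afterwards per the "-mean" comment), and the sum of absolute transformed coefficients is returned. A minimal standalone 1-D illustration of the same butterfly pattern, not part of the patch:

    #include <stdio.h>
    #include <stdlib.h>

    /* Sum of absolute 8-point Walsh-Hadamard coefficients of one row,
     * using the same three butterfly stages as hadamard8_diff8x8_c. */
    static int satd_row8(const int d[8])
    {
        int t[8], sum = 0;

        for (int i = 0; i < 8; i++)
            t[i] = d[i];

        for (int step = 1; step < 8; step *= 2)      /* stages: 1, 2, 4 */
            for (int i = 0; i < 8; i += 2 * step)
                for (int j = i; j < i + step; j++) {
                    int a = t[j], b = t[j + step];
                    t[j]        = a + b;             /* BUTTERFLY1 */
                    t[j + step] = a - b;
                }

        for (int i = 0; i < 8; i++)
            sum += abs(t[i]);
        return sum;
    }

    int main(void)
    {
        const int flat[8] = { 3, 3, 3, 3, 3, 3, 3, 3 };
        const int edge[8] = { 0, 0, 0, 0, 6, 6, 6, 6 };

        /* both rows have the same SAD (24), but the transform separates
         * the flat residual (24, DC only) from the edge (48) */
        printf("flat: %d  edge: %d\n", satd_row8(flat), satd_row8(edge));
        return 0;
    }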
543
+
544
+static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
545
+                        uint8_t *src2, int stride, int h)
546
+{
547
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
548
+
549
+    av_assert2(h == 8);
550
+
551
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
552
+    s->fdsp.fdct(temp);
553
+    return s->mecc.sum_abs_dctelem(temp);
554
+}
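Note: dct_sad8x8_c above is the FF_CMP_DCT metric: the residual is forward-transformed with the encoder's own fdct and the L1 norm of the coefficients is returned through the context's sum_abs_dctelem(),

$$ \mathrm{DCTSAD}(s_1,s_2)=\sum_{u=0}^{7}\sum_{v=0}^{7}\bigl|\,\mathrm{DCT}_{8\times8}(s_1-s_2)[u,v]\,\bigr| $$

which is intended to track the bit cost of coding the residual more closely than plain SAD, at the price of one extra DCT per candidate.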
555
+
556
+#if CONFIG_GPL
557
+#define DCT8_1D                                         \
558
+    {                                                   \
559
+        const int s07 = SRC(0) + SRC(7);                \
560
+        const int s16 = SRC(1) + SRC(6);                \
561
+        const int s25 = SRC(2) + SRC(5);                \
562
+        const int s34 = SRC(3) + SRC(4);                \
563
+        const int a0  = s07 + s34;                      \
564
+        const int a1  = s16 + s25;                      \
565
+        const int a2  = s07 - s34;                      \
566
+        const int a3  = s16 - s25;                      \
567
+        const int d07 = SRC(0) - SRC(7);                \
568
+        const int d16 = SRC(1) - SRC(6);                \
569
+        const int d25 = SRC(2) - SRC(5);                \
570
+        const int d34 = SRC(3) - SRC(4);                \
571
+        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
572
+        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
573
+        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
574
+        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
575
+        DST(0, a0 + a1);                                \
576
+        DST(1, a4 + (a7 >> 2));                         \
577
+        DST(2, a2 + (a3 >> 1));                         \
578
+        DST(3, a5 + (a6 >> 2));                         \
579
+        DST(4, a0 - a1);                                \
580
+        DST(5, a6 - (a5 >> 2));                         \
581
+        DST(6, (a2 >> 1) - a3);                         \
582
+        DST(7, (a4 >> 2) - a7);                         \
583
+    }
584
+
585
+static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
586
+                           uint8_t *src2, int stride, int h)
587
+{
588
+    int16_t dct[8][8];
589
+    int i, sum = 0;
590
+
591
+    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
592
+
593
+#define SRC(x) dct[i][x]
594
+#define DST(x, v) dct[i][x] = v
595
+    for (i = 0; i < 8; i++)
596
+        DCT8_1D
597
+#undef SRC
598
+#undef DST
599
+
600
+#define SRC(x) dct[x][i]
601
+#define DST(x, v) sum += FFABS(v)
602
+        for (i = 0; i < 8; i++)
603
+            DCT8_1D
604
+#undef SRC
605
+#undef DST
606
+            return sum;
607
+}
608
+#endif
609
+
610
+static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
611
+                        uint8_t *src2, int stride, int h)
612
+{
613
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
614
+    int sum = 0, i;
615
+
616
+    av_assert2(h == 8);
617
+
618
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
619
+    s->fdsp.fdct(temp);
620
+
621
+    for (i = 0; i < 64; i++)
622
+        sum = FFMAX(sum, FFABS(temp[i]));
623
+
624
+    return sum;
625
+}
626
+
627
+static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
628
+                           uint8_t *src2, int stride, int h)
629
+{
630
+    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
631
+    int16_t *const bak = temp + 64;
632
+    int sum = 0, i;
633
+
634
+    av_assert2(h == 8);
635
+    s->mb_intra = 0;
636
+
637
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
638
+
639
+    memcpy(bak, temp, 64 * sizeof(int16_t));
640
+
641
+    s->block_last_index[0 /* FIXME */] =
642
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
643
+    s->dct_unquantize_inter(s, temp, 0, s->qscale);
644
+    ff_simple_idct_8(temp); // FIXME
645
+
646
+    for (i = 0; i < 64; i++)
647
+        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
648
+
649
+    return sum;
650
+}
651
+
652
+static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
653
+                   int stride, int h)
654
+{
655
+    const uint8_t *scantable = s->intra_scantable.permutated;
656
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
657
+    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
658
+    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
659
+    int i, last, run, bits, level, distortion, start_i;
660
+    const int esc_length = s->ac_esc_length;
661
+    uint8_t *length, *last_length;
662
+
663
+    av_assert2(h == 8);
664
+
665
+    copy_block8(lsrc1, src1, 8, stride, 8);
666
+    copy_block8(lsrc2, src2, 8, stride, 8);
667
+
668
+    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
669
+
670
+    s->block_last_index[0 /* FIXME */] =
671
+    last                               =
672
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
673
+
674
+    bits = 0;
675
+
676
+    if (s->mb_intra) {
677
+        start_i     = 1;
678
+        length      = s->intra_ac_vlc_length;
679
+        last_length = s->intra_ac_vlc_last_length;
680
+        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
681
+    } else {
682
+        start_i     = 0;
683
+        length      = s->inter_ac_vlc_length;
684
+        last_length = s->inter_ac_vlc_last_length;
685
+    }
686
+
687
+    if (last >= start_i) {
688
+        run = 0;
689
+        for (i = start_i; i < last; i++) {
690
+            int j = scantable[i];
691
+            level = temp[j];
692
+
693
+            if (level) {
694
+                level += 64;
695
+                if ((level & (~127)) == 0)
696
+                    bits += length[UNI_AC_ENC_INDEX(run, level)];
697
+                else
698
+                    bits += esc_length;
699
+                run = 0;
700
+            } else
701
+                run++;
702
+        }
703
+        i = scantable[last];
704
+
705
+        level = temp[i] + 64;
706
+
707
+        av_assert2(level - 64);
708
+
709
+        if ((level & (~127)) == 0) {
710
+            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
711
+        } else
712
+            bits += esc_length;
713
+    }
714
+
715
+    if (last >= 0) {
716
+        if (s->mb_intra)
717
+            s->dct_unquantize_intra(s, temp, 0, s->qscale);
718
+        else
719
+            s->dct_unquantize_inter(s, temp, 0, s->qscale);
720
+    }
721
+
722
+    s->idsp.idct_add(lsrc2, 8, temp);
723
+
724
+    distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8);
725
+
726
+    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
727
+}
728
+
729
+static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
730
+                    int stride, int h)
731
+{
732
+    const uint8_t *scantable = s->intra_scantable.permutated;
733
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
734
+    int i, last, run, bits, level, start_i;
735
+    const int esc_length = s->ac_esc_length;
736
+    uint8_t *length, *last_length;
737
+
738
+    av_assert2(h == 8);
739
+
740
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
741
+
742
+    s->block_last_index[0 /* FIXME */] =
743
+    last                               =
744
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
745
+
746
+    bits = 0;
747
+
748
+    if (s->mb_intra) {
749
+        start_i     = 1;
750
+        length      = s->intra_ac_vlc_length;
751
+        last_length = s->intra_ac_vlc_last_length;
752
+        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
753
+    } else {
754
+        start_i     = 0;
755
+        length      = s->inter_ac_vlc_length;
756
+        last_length = s->inter_ac_vlc_last_length;
757
+    }
758
+
759
+    if (last >= start_i) {
760
+        run = 0;
761
+        for (i = start_i; i < last; i++) {
762
+            int j = scantable[i];
763
+            level = temp[j];
764
+
765
+            if (level) {
766
+                level += 64;
767
+                if ((level & (~127)) == 0)
768
+                    bits += length[UNI_AC_ENC_INDEX(run, level)];
769
+                else
770
+                    bits += esc_length;
771
+                run = 0;
772
+            } else
773
+                run++;
774
+        }
775
+        i = scantable[last];
776
+
777
+        level = temp[i] + 64;
778
+
779
+        av_assert2(level - 64);
780
+
781
+        if ((level & (~127)) == 0)
782
+            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
783
+        else
784
+            bits += esc_length;
785
+    }
786
+
787
+    return bits;
788
+}
789
+
790
+#define VSAD_INTRA(size)                                                \
791
+static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
792
+                                    uint8_t *s, uint8_t *dummy,         \
793
+                                    int stride, int h)                  \
794
+{                                                                       \
795
+    int score = 0, x, y;                                                \
796
+                                                                        \
797
+    for (y = 1; y < h; y++) {                                           \
798
+        for (x = 0; x < size; x += 4) {                                 \
799
+            score += FFABS(s[x]     - s[x + stride])     +              \
800
+                     FFABS(s[x + 1] - s[x + stride + 1]) +              \
801
+                     FFABS(s[x + 2] - s[x + 2 + stride]) +              \
802
+                     FFABS(s[x + 3] - s[x + 3 + stride]);               \
803
+        }                                                               \
804
+        s += stride;                                                    \
805
+    }                                                                   \
806
+                                                                        \
807
+    return score;                                                       \
808
+}
809
+VSAD_INTRA(8)
810
+VSAD_INTRA(16)
811
+
812
+#define VSAD(size)                                                             \
813
+static int vsad ## size ## _c(MpegEncContext *c,                               \
814
+                              uint8_t *s1, uint8_t *s2,                        \
815
+                              int stride, int h)                               \
816
+{                                                                              \
817
+    int score = 0, x, y;                                                       \
818
+                                                                               \
819
+    for (y = 1; y < h; y++) {                                                  \
820
+        for (x = 0; x < size; x++)                                             \
821
+            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
822
+        s1 += stride;                                                          \
823
+        s2 += stride;                                                          \
824
+    }                                                                          \
825
+                                                                               \
826
+    return score;                                                              \
827
+}
828
+VSAD(8)
829
+VSAD(16)
830
+
831
+#define SQ(a) ((a) * (a))
832
+#define VSSE_INTRA(size)                                                \
833
+static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
834
+                                    uint8_t *s, uint8_t *dummy,         \
835
+                                    int stride, int h)                  \
836
+{                                                                       \
837
+    int score = 0, x, y;                                                \
838
+                                                                        \
839
+    for (y = 1; y < h; y++) {                                           \
840
+        for (x = 0; x < size; x += 4) {                                 \
841
+            score += SQ(s[x]     - s[x + stride]) +                     \
842
+                     SQ(s[x + 1] - s[x + stride + 1]) +                 \
843
+                     SQ(s[x + 2] - s[x + stride + 2]) +                 \
844
+                     SQ(s[x + 3] - s[x + stride + 3]);                  \
845
+        }                                                               \
846
+        s += stride;                                                    \
847
+    }                                                                   \
848
+                                                                        \
849
+    return score;                                                       \
850
+}
851
+VSSE_INTRA(8)
852
+VSSE_INTRA(16)
853
+
854
+#define VSSE(size)                                                             \
855
+static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
856
+                    int stride, int h)                                         \
857
+{                                                                              \
858
+    int score = 0, x, y;                                                       \
859
+                                                                               \
860
+    for (y = 1; y < h; y++) {                                                  \
861
+        for (x = 0; x < size; x++)                                             \
862
+            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
863
+        s1 += stride;                                                          \
864
+        s2 += stride;                                                          \
865
+    }                                                                          \
866
+                                                                               \
867
+    return score;                                                              \
868
+}
869
+VSSE(8)
870
+VSSE(16)
871
+
872
+#define WRAPPER8_16_SQ(name8, name16)                                   \
873
+static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
874
+                  int stride, int h)                                    \
875
+{                                                                       \
876
+    int score = 0;                                                      \
877
+                                                                        \
878
+    score += name8(s, dst, src, stride, 8);                             \
879
+    score += name8(s, dst + 8, src + 8, stride, 8);                     \
880
+    if (h == 16) {                                                      \
881
+        dst   += 8 * stride;                                            \
882
+        src   += 8 * stride;                                            \
883
+        score += name8(s, dst, src, stride, 8);                         \
884
+        score += name8(s, dst + 8, src + 8, stride, 8);                 \
885
+    }                                                                   \
886
+    return score;                                                       \
887
+}
888
+
889
+WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
890
+WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
891
+WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
892
+#if CONFIG_GPL
893
+WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
894
+#endif
895
+WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
896
+WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
897
+WRAPPER8_16_SQ(rd8x8_c, rd16_c)
898
+WRAPPER8_16_SQ(bit8x8_c, bit16_c)
899
+
900
+av_cold void ff_me_cmp_init_static(void)
901
+{
902
+    int i;
903
+
904
+    for (i = 0; i < 512; i++)
905
+        ff_square_tab[i] = (i - 256) * (i - 256);
906
+}
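Editorial note, not part of the commit: the table built above maps a signed pixel difference in [-256, 255] to its square, so squared-error comparators can use a table lookup instead of a multiply. Below is a minimal sketch of that usage pattern with an illustrative function name; comparators such as the sse*_c functions registered further down are the kind of code that can use the table this way.

/* Sketch only: summing squared differences over an 8x8 block via ff_square_tab.
 * Biasing the pointer by +256 maps negative differences into the table. */
static int sse8x8_sketch(const uint8_t *a, const uint8_t *b, int stride)
{
    const uint32_t *sq = ff_square_tab + 256;
    int x, y, sum = 0;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += sq[a[x] - b[x]];   /* equals (a[x] - b[x]) * (a[x] - b[x]) */
        a += stride;
        b += stride;
    }
    return sum;
}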
907
+
908
+int ff_check_alignment(void)
909
+{
910
+    static int did_fail = 0;
911
+    LOCAL_ALIGNED_16(int, aligned, [4]);
912
+
913
+    if ((intptr_t)aligned & 15) {
914
+        if (!did_fail) {
915
+#if HAVE_MMX || HAVE_ALTIVEC
916
+            av_log(NULL, AV_LOG_ERROR,
917
+                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
918
+                "and may be very slow or crash. This is not a bug in libavcodec,\n"
919
+                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
920
+                "Do not report crashes to FFmpeg developers.\n");
921
+#endif
922
+            did_fail = 1;
923
+        }
924
+        return -1;
925
+    }
926
+    return 0;
927
+}
928
+
929
+av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
930
+{
931
+    ff_check_alignment();
932
+
933
+    c->sum_abs_dctelem = sum_abs_dctelem_c;
934
+
935
+    /* TODO [0] 16  [1] 8 */
936
+    c->pix_abs[0][0] = pix_abs16_c;
937
+    c->pix_abs[0][1] = pix_abs16_x2_c;
938
+    c->pix_abs[0][2] = pix_abs16_y2_c;
939
+    c->pix_abs[0][3] = pix_abs16_xy2_c;
940
+    c->pix_abs[1][0] = pix_abs8_c;
941
+    c->pix_abs[1][1] = pix_abs8_x2_c;
942
+    c->pix_abs[1][2] = pix_abs8_y2_c;
943
+    c->pix_abs[1][3] = pix_abs8_xy2_c;
944
+
945
+#define SET_CMP_FUNC(name)                      \
946
+    c->name[0] = name ## 16_c;                  \
947
+    c->name[1] = name ## 8x8_c;
948
+
949
+    SET_CMP_FUNC(hadamard8_diff)
950
+    c->hadamard8_diff[4] = hadamard8_intra16_c;
951
+    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
952
+    SET_CMP_FUNC(dct_sad)
953
+    SET_CMP_FUNC(dct_max)
954
+#if CONFIG_GPL
955
+    SET_CMP_FUNC(dct264_sad)
956
+#endif
957
+    c->sad[0] = pix_abs16_c;
958
+    c->sad[1] = pix_abs8_c;
959
+    c->sse[0] = sse16_c;
960
+    c->sse[1] = sse8_c;
961
+    c->sse[2] = sse4_c;
962
+    SET_CMP_FUNC(quant_psnr)
963
+    SET_CMP_FUNC(rd)
964
+    SET_CMP_FUNC(bit)
965
+    c->vsad[0] = vsad16_c;
966
+    c->vsad[1] = vsad8_c;
967
+    c->vsad[4] = vsad_intra16_c;
968
+    c->vsad[5] = vsad_intra8_c;
969
+    c->vsse[0] = vsse16_c;
970
+    c->vsse[1] = vsse8_c;
971
+    c->vsse[4] = vsse_intra16_c;
972
+    c->vsse[5] = vsse_intra8_c;
973
+    c->nsse[0] = nsse16_c;
974
+    c->nsse[1] = nsse8_c;
975
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
976
+    ff_dsputil_init_dwt(c);
977
+#endif
978
+
979
+    if (ARCH_ALPHA)
980
+        ff_me_cmp_init_alpha(c, avctx);
981
+    if (ARCH_ARM)
982
+        ff_me_cmp_init_arm(c, avctx);
983
+    if (ARCH_PPC)
984
+        ff_me_cmp_init_ppc(c, avctx);
985
+    if (ARCH_X86)
986
+        ff_me_cmp_init_x86(c, avctx);
987
+}
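For orientation (editorial note, not part of the commit): a hedged sketch of how encoder code drives the new context, mirroring the call sites changed later in this merge (for example s->mecc.sad[0](NULL, ...) in mpegvideo_enc.c). As the assignments above show, index [0] selects the 16-pixel-wide variant and [1] the 8-pixel-wide one; the names below are illustrative.

/* Hypothetical caller: SAD of two 16x16 blocks after initializing the
 * context once from an AVCodecContext. */
static int block_sad16(AVCodecContext *avctx, uint8_t *cur, uint8_t *ref, int stride)
{
    MECmpContext mecc;

    ff_me_cmp_init(&mecc, avctx);
    /* The first argument may be NULL for the plain SAD/SSE comparators,
     * matching how the changed call sites in this merge invoke them. */
    return mecc.sad[0](NULL, cur, ref, stride, 16);
}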
0 988
new file mode 100644
... ...
@@ -0,0 +1,94 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#ifndef AVCODEC_ME_CMP_H
19
+#define AVCODEC_ME_CMP_H
20
+
21
+#include <stdint.h>
22
+
23
+#include "avcodec.h"
24
+
25
+extern uint32_t ff_square_tab[512];
26
+
27
+
28
+/* minimum alignment rules ;)
29
+ * If you notice errors in the alignment requirements, need more alignment for
30
+ * some ASM code on some CPU, or need to use a function with less strictly
31
+ * aligned data, then send a mail to the ffmpeg-devel mailing list, ...
32
+ *
33
+ * !warning These alignments might not match reality (attribute((aligned))
34
+ * may be missing somewhere).
35
+ * I (Michael) did not check them; these are just the alignments which I think
36
+ * could be reached easily ...
37
+ *
38
+ * !future video codecs might need functions with less strict alignment
39
+ */
40
+
41
+struct MpegEncContext;
42
+/* Motion estimation:
43
+ * h is limited to { width / 2, width, 2 * width },
44
+ * but never larger than 16 and never smaller than 2.
45
+ * Currently, h < 4 is not used, as functions with
46
+ * width < 8 are neither used nor implemented. */
47
+typedef int (*me_cmp_func)(struct MpegEncContext *c,
48
+                           uint8_t *blk1 /* align width (8 or 16) */,
49
+                           uint8_t *blk2 /* align 1 */, int line_size, int h);
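Editorial note, not part of the commit: to make the contract above concrete, a minimal, unoptimized function matching this typedef might look as follows (illustrative name; the real C reference implementations such as pix_abs8_c live in me_cmp.c).

/* Illustration only: an 8-pixel-wide SAD that fits the me_cmp_func signature. */
static int sad8_sketch(struct MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                       int line_size, int h)
{
    int x, y, sum = 0;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += FFABS(blk1[x] - blk2[x]);
        blk1 += line_size;
        blk2 += line_size;
    }
    return sum;
}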
50
+
51
+typedef struct MECmpContext {
52
+    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
53
+
54
+    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
55
+    me_cmp_func sse[6];
56
+    me_cmp_func hadamard8_diff[6];
57
+    me_cmp_func dct_sad[6];
58
+    me_cmp_func quant_psnr[6];
59
+    me_cmp_func bit[6];
60
+    me_cmp_func rd[6];
61
+    me_cmp_func vsad[6];
62
+    me_cmp_func vsse[6];
63
+    me_cmp_func nsse[6];
64
+    me_cmp_func w53[6];
65
+    me_cmp_func w97[6];
66
+    me_cmp_func dct_max[6];
67
+    me_cmp_func dct264_sad[6];
68
+
69
+    me_cmp_func me_pre_cmp[6];
70
+    me_cmp_func me_cmp[6];
71
+    me_cmp_func me_sub_cmp[6];
72
+    me_cmp_func mb_cmp[6];
73
+    me_cmp_func ildct_cmp[6]; // only width 16 used
74
+    me_cmp_func frame_skip_cmp[6]; // only width 8 used
75
+
76
+    me_cmp_func pix_abs[2][4];
77
+} MECmpContext;
78
+
79
+void ff_me_cmp_init_static(void);
80
+
81
+int ff_check_alignment(void);
82
+
83
+void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
84
+void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
85
+void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
86
+void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
87
+void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
88
+
89
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
90
+
91
+void ff_dsputil_init_dwt(MECmpContext *c);
92
+
93
+#endif /* AVCODEC_ME_CMP_H */
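Editorial note, not part of the commit: ff_set_cmp(), declared above and used by the motion-estimation and encoder hunks below, fills a me_cmp_func array from one of the context's tables according to the FF_CMP_* comparator the user selected (avctx->me_cmp, avctx->mb_cmp, and so on). The following is only a rough sketch of that idea; the real function covers every FF_CMP_* value, and the low byte is used because the FF_CMP_CHROMA flag is tested separately by the callers.

/* Hedged sketch of the selection logic behind ff_set_cmp(); not the actual
 * implementation. */
static void set_cmp_sketch(MECmpContext *c, me_cmp_func *cmp, int type)
{
    int i;

    for (i = 0; i < 6; i++) {
        switch (type & 0xFF) {
        case FF_CMP_SAD:  cmp[i] = c->sad[i];            break;
        case FF_CMP_SSE:  cmp[i] = c->sse[i];            break;
        case FF_CMP_SATD: cmp[i] = c->hadamard8_diff[i]; break;
        default:          cmp[i] = c->sad[i];            break;
        }
    }
}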
... ...
@@ -28,7 +28,7 @@
28 28
 #include <string.h>
29 29
 
30 30
 #include "config.h"
31
-#include "dsputil.h"
31
+#include "me_cmp.h"
32 32
 #include "libavutil/internal.h"
33 33
 #include "libavutil/lfg.h"
34 34
 #include "libavutil/mem.h"
... ...
@@ -115,7 +115,7 @@ int main(int argc, char **argv)
115 115
 {
116 116
     AVCodecContext *ctx;
117 117
     int c;
118
-    DSPContext cctx, mmxctx;
118
+    MECmpContext cctx, mmxctx;
119 119
     int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
120 120
     int flags_size = HAVE_MMXEXT ? 2 : 1;
121 121
 
... ...
@@ -130,12 +130,12 @@ int main(int argc, char **argv)
130 130
     ctx->flags |= CODEC_FLAG_BITEXACT;
131 131
     av_force_cpu_flags(0);
132 132
     memset(&cctx, 0, sizeof(cctx));
133
-    ff_dsputil_init(&cctx, ctx);
133
+    ff_me_cmp_init(&cctx, ctx);
134 134
     for (c = 0; c < flags_size; c++) {
135 135
         int x;
136 136
         av_force_cpu_flags(flags[c]);
137 137
         memset(&mmxctx, 0, sizeof(mmxctx));
138
-        ff_dsputil_init(&mmxctx, ctx);
138
+        ff_me_cmp_init(&mmxctx, ctx);
139 139
 
140 140
         for (x = 0; x < 2; x++) {
141 141
             printf("%s for %dx%d pixels\n", c ? "mmx2" : "mmx",
... ...
@@ -316,10 +316,10 @@ int ff_init_me(MpegEncContext *s){
316 316
         av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n");
317 317
     }
318 318
 
319
-    ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, c->avctx->me_pre_cmp);
320
-    ff_set_cmp(&s->dsp, s->dsp.me_cmp, c->avctx->me_cmp);
321
-    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, c->avctx->me_sub_cmp);
322
-    ff_set_cmp(&s->dsp, s->dsp.mb_cmp, c->avctx->mb_cmp);
319
+    ff_set_cmp(&s->mecc, s->mecc.me_pre_cmp, c->avctx->me_pre_cmp);
320
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp,     c->avctx->me_cmp);
321
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, c->avctx->me_sub_cmp);
322
+    ff_set_cmp(&s->mecc, s->mecc.mb_cmp,     c->avctx->mb_cmp);
323 323
 
324 324
     c->flags    = get_flags(c, 0, c->avctx->me_cmp    &FF_CMP_CHROMA);
325 325
     c->sub_flags= get_flags(c, 0, c->avctx->me_sub_cmp&FF_CMP_CHROMA);
... ...
@@ -360,13 +360,11 @@ int ff_init_me(MpegEncContext *s){
360 360
     /* 8x8 fullpel search would need a 4x4 chroma compare, which we do
361 361
      * not have yet, and even if we had, the motion estimation code
362 362
      * does not expect it. */
363
-    if(s->codec_id != AV_CODEC_ID_SNOW){
364
-        if((c->avctx->me_cmp&FF_CMP_CHROMA)/* && !s->dsp.me_cmp[2]*/){
365
-            s->dsp.me_cmp[2]= zero_cmp;
366
-        }
367
-        if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){
368
-            s->dsp.me_sub_cmp[2]= zero_cmp;
369
-        }
363
+    if (s->codec_id != AV_CODEC_ID_SNOW) {
364
+        if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
365
+            s->mecc.me_cmp[2] = zero_cmp;
366
+        if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
367
+            s->mecc.me_sub_cmp[2] = zero_cmp;
370 368
         c->hpel_put[2][0]= c->hpel_put[2][1]=
371 369
         c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
372 370
     }
... ...
@@ -380,7 +378,7 @@ int ff_init_me(MpegEncContext *s){
380 380
 
381 381
 #define CHECK_SAD_HALF_MV(suffix, x, y) \
382 382
 {\
383
-    d= s->dsp.pix_abs[size][(x?1:0)+(y?2:0)](NULL, pix, ptr+((x)>>1), stride, h);\
383
+    d  = s->mecc.pix_abs[size][(x ? 1 : 0) + (y ? 2 : 0)](NULL, pix, ptr + ((x) >> 1), stride, h); \
384 384
     d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\
385 385
     COPY3_IF_LT(dminh, d, dx, x, dy, y)\
386 386
 }
... ...
@@ -633,7 +631,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
633 633
 
634 634
         dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h);
635 635
 
636
-        if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
636
+        if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
637 637
             int dxy;
638 638
             const int offset= ((block&1) + (block>>1)*stride)*8;
639 639
             uint8_t *dest_y = c->scratchpad + offset;
... ...
@@ -675,8 +673,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
675 675
     if(same)
676 676
         return INT_MAX;
677 677
 
678
-    if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
679
-        dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*16*stride, c->scratchpad, stride, 16);
678
+    if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
679
+        dmin_sum += s->mecc.mb_cmp[0](s,
680
+                                      s->new_picture.f->data[0] +
681
+                                      s->mb_x * 16 + s->mb_y * 16 * stride,
682
+                                      c->scratchpad, stride, 16);
680 683
     }
681 684
 
682 685
     if(c->avctx->mb_cmp&FF_CMP_CHROMA){
... ...
@@ -698,8 +699,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
698 698
             s->hdsp.put_pixels_tab       [1][dxy](c->scratchpad + 8, s->last_picture.f->data[2] + offset, s->uvlinesize, 8);
699 699
         }
700 700
 
701
-        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad  , s->uvlinesize, 8);
702
-        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad+8, s->uvlinesize, 8);
701
+        dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad,     s->uvlinesize, 8);
702
+        dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad + 8, s->uvlinesize, 8);
703 703
     }
704 704
 
705 705
     c->pred_x= mx;
... ...
@@ -795,7 +796,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
795 795
             mv_table[xy][0]= mx_i;
796 796
             mv_table[xy][1]= my_i;
797 797
 
798
-            if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
798
+            if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
799 799
                 int dxy;
800 800
 
801 801
                 //FIXME chroma ME
... ...
@@ -807,7 +808,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
807 807
                 }else{
808 808
                     s->hdsp.put_pixels_tab       [size][dxy](c->scratchpad, ref    , stride, h);
809 809
                 }
810
-                dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
810
+                dmin = s->mecc.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
811 811
                 dmin+= (mv_penalty[mx_i-c->pred_x] + mv_penalty[my_i-c->pred_y] + 1)*c->mb_penalty_factor;
812 812
             }else
813 813
                 dmin+= c->mb_penalty_factor; //field_select bits
... ...
@@ -962,7 +963,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
962 962
     /* At this point (mx,my) are full-pell and the relative displacement */
963 963
     ppix = c->ref[0][0] + (my * s->linesize) + mx;
964 964
 
965
-    vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16);
965
+    vard = s->mecc.sse[0](NULL, pix, ppix, s->linesize, 16);
966 966
 
967 967
     pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8;
968 968
     c->mc_mb_var_sum_temp += (vard+128)>>8;
... ...
@@ -1059,7 +1060,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
1059 1059
                 *(uint32_t*)(&c->scratchpad[i*s->linesize+12]) = mean;
1060 1060
             }
1061 1061
 
1062
-            intra_score= s->dsp.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16);
1062
+            intra_score= s->mecc.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16);
1063 1063
         }
1064 1064
         intra_score += c->mb_penalty_factor*16;
1065 1065
 
... ...
@@ -1259,7 +1260,7 @@ static inline int check_bidir_mv(MpegEncContext * s,
1259 1259
 
1260 1260
     fbmin = (mv_penalty_f[motion_fx-pred_fx] + mv_penalty_f[motion_fy-pred_fy])*c->mb_penalty_factor
1261 1261
            +(mv_penalty_b[motion_bx-pred_bx] + mv_penalty_b[motion_by-pred_by])*c->mb_penalty_factor
1262
-           + s->dsp.mb_cmp[size](s, src_data[0], dest_y, stride, h); //FIXME new_pic
1262
+           + s->mecc.mb_cmp[size](s, src_data[0], dest_y, stride, h); // FIXME new_pic
1263 1263
 
1264 1264
     if(c->avctx->mb_cmp&FF_CMP_CHROMA){
1265 1265
     }
... ...
@@ -63,8 +63,8 @@ static int hpel_motion_search(MpegEncContext * s,
63 63
 
64 64
  //FIXME factorize
65 65
 
66
-    cmp_sub= s->dsp.me_sub_cmp[size];
67
-    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
66
+    cmp_sub        = s->mecc.me_sub_cmp[size];
67
+    chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1];
68 68
 
69 69
     if(c->skip){ //FIXME move out of hpel?
70 70
         *mx_ptr = 0;
... ...
@@ -165,7 +165,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
165 165
                                int src_index, int ref_index, int size,
166 166
                                int h, int add_rate)
167 167
 {
168
-//    const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
169 168
     MotionEstContext * const c= &s->me;
170 169
     const int penalty_factor= c->mb_penalty_factor;
171 170
     const int flags= c->mb_flags;
... ...
@@ -178,8 +177,8 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
178 178
 
179 179
  //FIXME factorize
180 180
 
181
-    cmp_sub= s->dsp.mb_cmp[size];
182
-    chroma_cmp_sub= s->dsp.mb_cmp[size+1];
181
+    cmp_sub        = s->mecc.mb_cmp[size];
182
+    chroma_cmp_sub = s->mecc.mb_cmp[size + 1];
183 183
 
184 184
     d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
185 185
     //FIXME check cbp before adding penalty for (0,0) vector
... ...
@@ -222,12 +221,12 @@ static int qpel_motion_search(MpegEncContext * s,
222 222
     LOAD_COMMON
223 223
     int flags= c->sub_flags;
224 224
 
225
-    cmpf= s->dsp.me_cmp[size];
226
-    chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME
225
+    cmpf        = s->mecc.me_cmp[size];
226
+    chroma_cmpf = s->mecc.me_cmp[size + 1]; // FIXME: factorize
227 227
  //FIXME factorize
228 228
 
229
-    cmp_sub= s->dsp.me_sub_cmp[size];
230
-    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
229
+    cmp_sub        = s->mecc.me_sub_cmp[size];
230
+    chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1];
231 231
 
232 232
     if(c->skip){ //FIXME somehow move up (benchmark)
233 233
         *mx_ptr = 0;
... ...
@@ -423,8 +422,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best,
423 423
     LOAD_COMMON2
424 424
     unsigned map_generation = c->map_generation;
425 425
 
426
-    cmpf= s->dsp.me_cmp[size];
427
-    chroma_cmpf= s->dsp.me_cmp[size+1];
426
+    cmpf        = s->mecc.me_cmp[size];
427
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
428 428
 
429 429
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
430 430
         const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
... ...
@@ -464,8 +463,8 @@ static int funny_diamond_search(MpegEncContext * s, int *best, int dmin,
464 464
     LOAD_COMMON2
465 465
     unsigned map_generation = c->map_generation;
466 466
 
467
-    cmpf= s->dsp.me_cmp[size];
468
-    chroma_cmpf= s->dsp.me_cmp[size+1];
467
+    cmpf        = s->mecc.me_cmp[size];
468
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
469 469
 
470 470
     for(dia_size=1; dia_size<=4; dia_size++){
471 471
         int dir;
... ...
@@ -507,8 +506,8 @@ static int hex_search(MpegEncContext * s, int *best, int dmin,
507 507
     int x,y,d;
508 508
     const int dec= dia_size & (dia_size-1);
509 509
 
510
-    cmpf= s->dsp.me_cmp[size];
511
-    chroma_cmpf= s->dsp.me_cmp[size+1];
510
+    cmpf        = s->mecc.me_cmp[size];
511
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
512 512
 
513 513
     for(;dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){
514 514
         do{
... ...
@@ -544,8 +543,8 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin,
544 544
     static const int hex[8][2]={{-2, 0}, {-1,-1}, { 0,-2}, { 1,-1},
545 545
                                 { 2, 0}, { 1, 1}, { 0, 2}, {-1, 1}};
546 546
 
547
-    cmpf= s->dsp.me_cmp[size];
548
-    chroma_cmpf= s->dsp.me_cmp[size+1];
547
+    cmpf        = s->mecc.me_cmp[size];
548
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
549 549
 
550 550
     for(; dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){
551 551
         do{
... ...
@@ -583,8 +582,8 @@ static int umh_search(MpegEncContext * s, int *best, int dmin,
583 583
                                  {-2, 3}, { 0, 4}, { 2, 3},
584 584
                                  {-2,-3}, { 0,-4}, { 2,-3},};
585 585
 
586
-    cmpf= s->dsp.me_cmp[size];
587
-    chroma_cmpf= s->dsp.me_cmp[size+1];
586
+    cmpf        = s->mecc.me_cmp[size];
587
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
588 588
 
589 589
     x= best[0];
590 590
     y= best[1];
... ...
@@ -626,8 +625,8 @@ static int full_search(MpegEncContext * s, int *best, int dmin,
626 626
     int x,y, d;
627 627
     const int dia_size= c->dia_size&0xFF;
628 628
 
629
-    cmpf= s->dsp.me_cmp[size];
630
-    chroma_cmpf= s->dsp.me_cmp[size+1];
629
+    cmpf        = s->mecc.me_cmp[size];
630
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
631 631
 
632 632
     for(y=FFMAX(-dia_size, ymin); y<=FFMIN(dia_size,ymax); y++){
633 633
         for(x=FFMAX(-dia_size, xmin); x<=FFMIN(dia_size,xmax); x++){
... ...
@@ -692,8 +691,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
692 692
 
693 693
     av_assert1(minima_count <= MAX_SAB_SIZE);
694 694
 
695
-    cmpf= s->dsp.me_cmp[size];
696
-    chroma_cmpf= s->dsp.me_cmp[size+1];
695
+    cmpf        = s->mecc.me_cmp[size];
696
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
697 697
 
698 698
     /*Note j<MAX_SAB_SIZE is needed if MAX_SAB_SIZE < ME_MAP_SIZE as j can
699 699
       become larger due to MVs overflowing their ME_MAP_MV_BITS bits space in map
... ...
@@ -777,8 +776,8 @@ static int var_diamond_search(MpegEncContext * s, int *best, int dmin,
777 777
     LOAD_COMMON2
778 778
     unsigned map_generation = c->map_generation;
779 779
 
780
-    cmpf= s->dsp.me_cmp[size];
781
-    chroma_cmpf= s->dsp.me_cmp[size+1];
780
+    cmpf        = s->mecc.me_cmp[size];
781
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
782 782
 
783 783
     for(dia_size=1; dia_size<=c->dia_size; dia_size++){
784 784
         int dir, start, end;
... ...
@@ -878,12 +877,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
878 878
 
879 879
     if(c->pre_pass){
880 880
         penalty_factor= c->pre_penalty_factor;
881
-        cmpf= s->dsp.me_pre_cmp[size];
882
-        chroma_cmpf= s->dsp.me_pre_cmp[size+1];
881
+        cmpf           = s->mecc.me_pre_cmp[size];
882
+        chroma_cmpf    = s->mecc.me_pre_cmp[size + 1];
883 883
     }else{
884 884
         penalty_factor= c->penalty_factor;
885
-        cmpf= s->dsp.me_cmp[size];
886
-        chroma_cmpf= s->dsp.me_cmp[size+1];
885
+        cmpf           = s->mecc.me_cmp[size];
886
+        chroma_cmpf    = s->mecc.me_cmp[size + 1];
887 887
     }
888 888
 
889 889
     map_generation= update_map_generation(c);
... ...
@@ -1007,8 +1006,8 @@ static int epzs_motion_search4(MpegEncContext * s,
1007 1007
     int flags= c->flags;
1008 1008
     LOAD_COMMON2
1009 1009
 
1010
-    cmpf= s->dsp.me_cmp[size];
1011
-    chroma_cmpf= s->dsp.me_cmp[size+1];
1010
+    cmpf        = s->mecc.me_cmp[size];
1011
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
1012 1012
 
1013 1013
     map_generation= update_map_generation(c);
1014 1014
 
... ...
@@ -1066,8 +1065,8 @@ static int epzs_motion_search2(MpegEncContext * s,
1066 1066
     int flags= c->flags;
1067 1067
     LOAD_COMMON2
1068 1068
 
1069
-    cmpf= s->dsp.me_cmp[size];
1070
-    chroma_cmpf= s->dsp.me_cmp[size+1];
1069
+    cmpf        = s->mecc.me_cmp[size];
1070
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
1071 1071
 
1072 1072
     map_generation= update_map_generation(c);
1073 1073
 
... ...
@@ -698,7 +698,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
698 698
                             }
699 699
                             diff = diff * 256 / (xe * ye);
700 700
                         } else {
701
-                            diff = s->dsp.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
701
+                            diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
702 702
                         }
703 703
                         if (diff > s->qscale * 70) {  // FIXME check that 70 is optimal
704 704
                             s->mb_skipped = 0;
... ...
@@ -380,10 +380,10 @@ static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
380 380
 av_cold int ff_dct_common_init(MpegEncContext *s)
381 381
 {
382 382
     ff_blockdsp_init(&s->bdsp, s->avctx);
383
-    ff_dsputil_init(&s->dsp, s->avctx);
384 383
     ff_h264chroma_init(&s->h264chroma, 8); //for lowres
385 384
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
386 385
     ff_idctdsp_init(&s->idsp, s->avctx);
386
+    ff_me_cmp_init(&s->mecc, s->avctx);
387 387
     ff_mpegvideodsp_init(&s->mdsp);
388 388
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
389 389
 
... ...
@@ -1106,7 +1106,7 @@ static int init_er(MpegEncContext *s)
1106 1106
     int i;
1107 1107
 
1108 1108
     er->avctx       = s->avctx;
1109
-    er->dsp         = &s->dsp;
1109
+    er->mecc        = &s->mecc;
1110 1110
 
1111 1111
     er->mb_index2xy = s->mb_index2xy;
1112 1112
     er->mb_num      = s->mb_num;
... ...
@@ -30,7 +30,6 @@
30 30
 
31 31
 #include "avcodec.h"
32 32
 #include "blockdsp.h"
33
-#include "dsputil.h"
34 33
 #include "error_resilience.h"
35 34
 #include "fdctdsp.h"
36 35
 #include "get_bits.h"
... ...
@@ -38,6 +37,7 @@
38 38
 #include "h263dsp.h"
39 39
 #include "hpeldsp.h"
40 40
 #include "idctdsp.h"
41
+#include "me_cmp.h"
41 42
 #include "mpegvideodsp.h"
42 43
 #include "mpegvideoencdsp.h"
43 44
 #include "pixblockdsp.h"
... ...
@@ -365,11 +365,11 @@ typedef struct MpegEncContext {
365 365
     int h263_long_vectors;      ///< use horrible h263v1 long vector mode
366 366
 
367 367
     BlockDSPContext bdsp;
368
-    DSPContext dsp;             ///< pointers for accelerated dsp functions
369 368
     FDCTDSPContext fdsp;
370 369
     H264ChromaContext h264chroma;
371 370
     HpelDSPContext hdsp;
372 371
     IDCTDSPContext idsp;
372
+    MECmpContext mecc;
373 373
     MpegVideoDSPContext mdsp;
374 374
     MpegvideoEncDSPContext mpvencdsp;
375 375
     PixblockDSPContext pdsp;
... ...
@@ -836,6 +836,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)
836 836
         return -1;
837 837
 
838 838
     ff_fdctdsp_init(&s->fdsp, avctx);
839
+    ff_me_cmp_init(&s->mecc, avctx);
839 840
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
840 841
     ff_pixblockdsp_init(&s->pdsp, avctx);
841 842
     ff_qpeldsp_init(&s->qdsp);
... ...
@@ -872,8 +873,8 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)
872 872
 
873 873
     s->quant_precision = 5;
874 874
 
875
-    ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp);
876
-    ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp);
875
+    ff_set_cmp(&s->mecc, s->mecc.ildct_cmp,      s->avctx->ildct_cmp);
876
+    ff_set_cmp(&s->mecc, s->mecc.frame_skip_cmp, s->avctx->frame_skip_cmp);
877 877
 
878 878
     if (CONFIG_H261_ENCODER && s->out_format == FMT_H261)
879 879
         ff_h261_encode_init(s);
... ...
@@ -1027,8 +1028,8 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
1027 1027
     for (y = 0; y < h; y += 16) {
1028 1028
         for (x = 0; x < w; x += 16) {
1029 1029
             int offset = x + y * stride;
1030
-            int sad  = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
1031
-                                     16);
1030
+            int sad  = s->mecc.sad[0](NULL, src + offset, ref + offset,
1031
+                                      stride, 16);
1032 1032
             int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8;
1033 1033
             int sae  = get_sae(src + offset, mean, stride);
1034 1034
 
... ...
@@ -1205,7 +1206,7 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref)
1205 1205
                 int off = p->shared ? 0 : 16;
1206 1206
                 uint8_t *dptr = p->f->data[plane] + 8 * (x + y * stride) + off;
1207 1207
                 uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride);
1208
-                int v   = s->dsp.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
1208
+                int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
1209 1209
 
1210 1210
                 switch (FFABS(s->avctx->frame_skip_exp)) {
1211 1211
                 case 0: score    =  FFMAX(score, v);          break;
... ...
@@ -2089,16 +2090,15 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
2089 2089
             int progressive_score, interlaced_score;
2090 2090
 
2091 2091
             s->interlaced_dct = 0;
2092
-            progressive_score = s->dsp.ildct_cmp[4](s, ptr_y,
2093
-                                                    NULL, wrap_y, 8) +
2094
-                                s->dsp.ildct_cmp[4](s, ptr_y + wrap_y * 8,
2095
-                                                    NULL, wrap_y, 8) - 400;
2092
+            progressive_score = s->mecc.ildct_cmp[4](s, ptr_y, NULL, wrap_y, 8) +
2093
+                                s->mecc.ildct_cmp[4](s, ptr_y + wrap_y * 8,
2094
+                                                     NULL, wrap_y, 8) - 400;
2096 2095
 
2097 2096
             if (progressive_score > 0) {
2098
-                interlaced_score = s->dsp.ildct_cmp[4](s, ptr_y,
2099
-                                                       NULL, wrap_y * 2, 8) +
2100
-                                   s->dsp.ildct_cmp[4](s, ptr_y + wrap_y,
2101
-                                                       NULL, wrap_y * 2, 8);
2097
+                interlaced_score = s->mecc.ildct_cmp[4](s, ptr_y,
2098
+                                                        NULL, wrap_y * 2, 8) +
2099
+                                   s->mecc.ildct_cmp[4](s, ptr_y + wrap_y,
2100
+                                                        NULL, wrap_y * 2, 8);
2102 2101
                 if (progressive_score > interlaced_score) {
2103 2102
                     s->interlaced_dct = 1;
2104 2103
 
... ...
@@ -2169,23 +2169,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
2169 2169
             int progressive_score, interlaced_score;
2170 2170
 
2171 2171
             s->interlaced_dct = 0;
2172
-            progressive_score = s->dsp.ildct_cmp[0](s, dest_y,
2173
-                                                    ptr_y,              wrap_y,
2174
-                                                    8) +
2175
-                                s->dsp.ildct_cmp[0](s, dest_y + wrap_y * 8,
2176
-                                                    ptr_y + wrap_y * 8, wrap_y,
2177
-                                                    8) - 400;
2172
+            progressive_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, wrap_y, 8) +
2173
+                                s->mecc.ildct_cmp[0](s, dest_y + wrap_y * 8,
2174
+                                                     ptr_y + wrap_y * 8,
2175
+                                                     wrap_y, 8) - 400;
2178 2176
 
2179 2177
             if (s->avctx->ildct_cmp == FF_CMP_VSSE)
2180 2178
                 progressive_score -= 400;
2181 2179
 
2182 2180
             if (progressive_score > 0) {
2183
-                interlaced_score = s->dsp.ildct_cmp[0](s, dest_y,
2184
-                                                       ptr_y,
2185
-                                                       wrap_y * 2, 8) +
2186
-                                   s->dsp.ildct_cmp[0](s, dest_y + wrap_y,
2187
-                                                       ptr_y + wrap_y,
2188
-                                                       wrap_y * 2, 8);
2181
+                interlaced_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y,
2182
+                                                        wrap_y * 2, 8) +
2183
+                                   s->mecc.ildct_cmp[0](s, dest_y + wrap_y,
2184
+                                                        ptr_y + wrap_y,
2185
+                                                        wrap_y * 2, 8);
2189 2186
 
2190 2187
                 if (progressive_score > interlaced_score) {
2191 2188
                     s->interlaced_dct = 1;
... ...
@@ -2223,33 +2220,28 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
2223 2223
         if (s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] <
2224 2224
                 2 * s->qscale * s->qscale) {
2225 2225
             // FIXME optimize
2226
-            if (s->dsp.sad[1](NULL, ptr_y , dest_y,
2227
-                              wrap_y, 8) < 20 * s->qscale)
2226
+            if (s->mecc.sad[1](NULL, ptr_y, dest_y, wrap_y, 8) < 20 * s->qscale)
2228 2227
                 skip_dct[0] = 1;
2229
-            if (s->dsp.sad[1](NULL, ptr_y + 8,
2230
-                              dest_y + 8, wrap_y, 8) < 20 * s->qscale)
2228
+            if (s->mecc.sad[1](NULL, ptr_y + 8, dest_y + 8, wrap_y, 8) < 20 * s->qscale)
2231 2229
                 skip_dct[1] = 1;
2232
-            if (s->dsp.sad[1](NULL, ptr_y + dct_offset,
2233
-                              dest_y + dct_offset, wrap_y, 8) < 20 * s->qscale)
2230
+            if (s->mecc.sad[1](NULL, ptr_y + dct_offset, dest_y + dct_offset,
2231
+                               wrap_y, 8) < 20 * s->qscale)
2234 2232
                 skip_dct[2] = 1;
2235
-            if (s->dsp.sad[1](NULL, ptr_y + dct_offset + 8,
2236
-                              dest_y + dct_offset + 8,
2237
-                              wrap_y, 8) < 20 * s->qscale)
2233
+            if (s->mecc.sad[1](NULL, ptr_y + dct_offset + 8, dest_y + dct_offset + 8,
2234
+                               wrap_y, 8) < 20 * s->qscale)
2238 2235
                 skip_dct[3] = 1;
2239
-            if (s->dsp.sad[1](NULL, ptr_cb, dest_cb,
2240
-                              wrap_c, 8) < 20 * s->qscale)
2236
+            if (s->mecc.sad[1](NULL, ptr_cb, dest_cb, wrap_c, 8) < 20 * s->qscale)
2241 2237
                 skip_dct[4] = 1;
2242
-            if (s->dsp.sad[1](NULL, ptr_cr, dest_cr,
2243
-                              wrap_c, 8) < 20 * s->qscale)
2238
+            if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale)
2244 2239
                 skip_dct[5] = 1;
2245 2240
             if (!s->chroma_y_shift) { /* 422 */
2246
-                if (s->dsp.sad[1](NULL, ptr_cb + uv_dct_offset,
2247
-                                  dest_cb + uv_dct_offset,
2248
-                                  wrap_c, 8) < 20 * s->qscale)
2241
+                if (s->mecc.sad[1](NULL, ptr_cb + uv_dct_offset,
2242
+                                   dest_cb + uv_dct_offset,
2243
+                                   wrap_c, 8) < 20 * s->qscale)
2249 2244
                     skip_dct[6] = 1;
2250
-                if (s->dsp.sad[1](NULL, ptr_cr + uv_dct_offset,
2251
-                                  dest_cr + uv_dct_offset,
2252
-                                  wrap_c, 8) < 20 * s->qscale)
2245
+                if (s->mecc.sad[1](NULL, ptr_cr + uv_dct_offset,
2246
+                                   dest_cr + uv_dct_offset,
2247
+                                   wrap_c, 8) < 20 * s->qscale)
2253 2248
                     skip_dct[7] = 1;
2254 2249
             }
2255 2250
         }
... ...
@@ -2522,9 +2514,9 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in
2522 2522
     int x,y;
2523 2523
 
2524 2524
     if(w==16 && h==16)
2525
-        return s->dsp.sse[0](NULL, src1, src2, stride, 16);
2525
+        return s->mecc.sse[0](NULL, src1, src2, stride, 16);
2526 2526
     else if(w==8 && h==8)
2527
-        return s->dsp.sse[1](NULL, src1, src2, stride, 8);
2527
+        return s->mecc.sse[1](NULL, src1, src2, stride, 8);
2528 2528
 
2529 2529
     for(y=0; y<h; y++){
2530 2530
         for(x=0; x<w; x++){
... ...
@@ -2546,13 +2538,13 @@ static int sse_mb(MpegEncContext *s){
2546 2546
 
2547 2547
     if(w==16 && h==16)
2548 2548
       if(s->avctx->mb_cmp == FF_CMP_NSSE){
2549
-        return  s->dsp.nsse[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
2550
-               +s->dsp.nsse[1](s, s->new_picture.f->data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
2551
-               +s->dsp.nsse[1](s, s->new_picture.f->data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
2549
+        return s->mecc.nsse[0](s, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize   * 16, s->dest[0], s->linesize,   16) +
2550
+               s->mecc.nsse[1](s, s->new_picture.f->data[1] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[1], s->uvlinesize,  8) +
2551
+               s->mecc.nsse[1](s, s->new_picture.f->data[2] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[2], s->uvlinesize,  8);
2552 2552
       }else{
2553
-        return  s->dsp.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
2554
-               +s->dsp.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
2555
-               +s->dsp.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
2553
+        return s->mecc.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize   * 16, s->dest[0], s->linesize,   16) +
2554
+               s->mecc.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[1], s->uvlinesize,  8) +
2555
+               s->mecc.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[2], s->uvlinesize,  8);
2556 2556
       }
2557 2557
     else
2558 2558
         return  sse(s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize)
... ...
@@ -25,8 +25,8 @@
25 25
 #include "libavutil/attributes.h"
26 26
 #include "libavutil/imgutils.h"
27 27
 #include "avcodec.h"
28
-#include "dsputil.h"
29 28
 #include "imgconvert.h"
29
+#include "me_cmp.h"
30 30
 #include "mpegvideoencdsp.h"
31 31
 
32 32
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
... ...
@@ -2,7 +2,6 @@ OBJS                                   += ppc/fmtconvert_altivec.o      \
2 2
 
3 3
 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
4 4
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
5
-OBJS-$(CONFIG_DSPUTIL)                 += ppc/dsputil_altivec.o
6 5
 OBJS-$(CONFIG_FFT)                     += ppc/fft_altivec.o
7 6
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
8 7
 OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o ppc/hpeldsp_altivec.o
... ...
@@ -11,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
11 11
 OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
12 12
 OBJS-$(CONFIG_FDCTDSP)                 += ppc/fdctdsp.o
13 13
 OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
14
+OBJS-$(CONFIG_ME_CMP)                  += ppc/me_cmp.o
14 15
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
15 16
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
16 17
                                           ppc/mpegvideodsp.o
17 18
deleted file mode 100644
... ...
@@ -1,767 +0,0 @@
1
-/*
2
- * Copyright (c) 2002 Brian Foley
3
- * Copyright (c) 2002 Dieter Shirley
4
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5
- *
6
- * This file is part of FFmpeg.
7
- *
8
- * FFmpeg is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * FFmpeg is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with FFmpeg; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-#include "config.h"
24
-#if HAVE_ALTIVEC_H
25
-#include <altivec.h>
26
-#endif
27
-
28
-#include "libavutil/attributes.h"
29
-#include "libavutil/cpu.h"
30
-#include "libavutil/ppc/cpu.h"
31
-#include "libavutil/ppc/types_altivec.h"
32
-#include "libavutil/ppc/util_altivec.h"
33
-#include "libavcodec/avcodec.h"
34
-#include "libavcodec/dsputil.h"
35
-#include "libavcodec/mpegvideo.h"
36
-
37
-#if HAVE_ALTIVEC
38
-static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39
-                            int line_size, int h)
40
-{
41
-    int i, s = 0;
42
-    const vector unsigned char zero =
43
-        (const vector unsigned char) vec_splat_u8(0);
44
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
45
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
46
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
47
-    vector signed int sumdiffs;
48
-
49
-    for (i = 0; i < h; i++) {
50
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
51
-         * pix1v: pix1[0] - pix1[15]
52
-         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
53
-        vector unsigned char pix1v  = vec_ld(0,  pix1);
54
-        vector unsigned char pix2l  = vec_ld(0,  pix2);
55
-        vector unsigned char pix2r  = vec_ld(16, pix2);
56
-        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
57
-        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
58
-
59
-        /* Calculate the average vector. */
60
-        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
61
-
62
-        /* Calculate a sum of abs differences vector. */
63
-        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
64
-                                          vec_min(pix1v, avgv));
65
-
66
-        /* Add each 4 pixel group together and put 4 results into sad. */
67
-        sad = vec_sum4s(t5, sad);
68
-
69
-        pix1 += line_size;
70
-        pix2 += line_size;
71
-    }
72
-    /* Sum up the four partial sums, and put the result into s. */
73
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
74
-    sumdiffs = vec_splat(sumdiffs, 3);
75
-    vec_ste(sumdiffs, 0, &s);
76
-
77
-    return s;
78
-}
79
-
80
-static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
81
-                            int line_size, int h)
82
-{
83
-    int i, s = 0;
84
-    const vector unsigned char zero =
85
-        (const vector unsigned char) vec_splat_u8(0);
86
-    vector unsigned char perm = vec_lvsl(0, pix2);
87
-    vector unsigned char pix1v, pix3v, avgv, t5;
88
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
89
-    vector signed int sumdiffs;
90
-    uint8_t *pix3 = pix2 + line_size;
91
-
92
-    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
93
-     * iteration becomes pix2 in the next iteration. We can use this
94
-     * fact to avoid a potentially expensive unaligned read, each
95
-     * time around the loop.
96
-     * Read unaligned pixels into our vectors. The vectors are as follows:
97
-     * pix2v: pix2[0] - pix2[15]
98
-     * Split the pixel vectors into shorts. */
99
-    vector unsigned char pix2l = vec_ld(0,  pix2);
100
-    vector unsigned char pix2r = vec_ld(15, pix2);
101
-    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
102
-
103
-    for (i = 0; i < h; i++) {
104
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
105
-         * pix1v: pix1[0] - pix1[15]
106
-         * pix3v: pix3[0] - pix3[15] */
107
-        pix1v = vec_ld(0,  pix1);
108
-
109
-        pix2l = vec_ld(0,  pix3);
110
-        pix2r = vec_ld(15, pix3);
111
-        pix3v = vec_perm(pix2l, pix2r, perm);
112
-
113
-        /* Calculate the average vector. */
114
-        avgv = vec_avg(pix2v, pix3v);
115
-
116
-        /* Calculate a sum of abs differences vector. */
117
-        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
118
-
119
-        /* Add each 4 pixel group together and put 4 results into sad. */
120
-        sad = vec_sum4s(t5, sad);
121
-
122
-        pix1 += line_size;
123
-        pix2v = pix3v;
124
-        pix3 += line_size;
125
-    }
126
-
127
-    /* Sum up the four partial sums, and put the result into s. */
128
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
129
-    sumdiffs = vec_splat(sumdiffs, 3);
130
-    vec_ste(sumdiffs, 0, &s);
131
-    return s;
132
-}
133
-
134
-static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
135
-                             int line_size, int h)
136
-{
137
-    int i, s = 0;
138
-    uint8_t *pix3 = pix2 + line_size;
139
-    const vector unsigned char zero =
140
-        (const vector unsigned char) vec_splat_u8(0);
141
-    const vector unsigned short two =
142
-        (const vector unsigned short) vec_splat_u16(2);
143
-    vector unsigned char avgv, t5;
144
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
145
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
146
-    vector unsigned char pix1v, pix3v, pix3iv;
147
-    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
148
-    vector unsigned short avghv, avglv;
149
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
150
-    vector signed int sumdiffs;
151
-
152
-    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
153
-     * iteration becomes pix2 in the next iteration. We can use this
154
-     * fact to avoid a potentially expensive unaligned read, as well
155
-     * as some splitting, and vector addition each time around the loop.
156
-     * Read unaligned pixels into our vectors. The vectors are as follows:
157
-     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
158
-     * Split the pixel vectors into shorts. */
159
-    vector unsigned char pix2l  = vec_ld(0,  pix2);
160
-    vector unsigned char pix2r  = vec_ld(16, pix2);
161
-    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
162
-    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
163
-
164
-    vector unsigned short pix2hv  =
165
-        (vector unsigned short) vec_mergeh(zero, pix2v);
166
-    vector unsigned short pix2lv  =
167
-        (vector unsigned short) vec_mergel(zero, pix2v);
168
-    vector unsigned short pix2ihv =
169
-        (vector unsigned short) vec_mergeh(zero, pix2iv);
170
-    vector unsigned short pix2ilv =
171
-        (vector unsigned short) vec_mergel(zero, pix2iv);
172
-    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
173
-    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
174
-    vector unsigned short t3, t4;
175
-
176
-    for (i = 0; i < h; i++) {
177
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
178
-         * pix1v: pix1[0] - pix1[15]
179
-         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
180
-        pix1v  = vec_ld(0, pix1);
181
-
182
-        pix2l  = vec_ld(0, pix3);
183
-        pix2r  = vec_ld(16, pix3);
184
-        pix3v  = vec_perm(pix2l, pix2r, perm1);
185
-        pix3iv = vec_perm(pix2l, pix2r, perm2);
186
-
187
-        /* Note that AltiVec does have vec_avg, but this works on vector pairs
188
-         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
189
-         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
190
-         * it should be 1. Instead, we have to split the pixel vectors into
191
-         * vectors of shorts and do the averaging by hand. */
192
-
193
-        /* Split the pixel vectors into shorts. */
194
-        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
195
-        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
196
-        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
197
-        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
198
-
199
-        /* Do the averaging on them. */
200
-        t3 = vec_add(pix3hv, pix3ihv);
201
-        t4 = vec_add(pix3lv, pix3ilv);
202
-
203
-        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
204
-        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
205
-
206
-        /* Pack the shorts back into a result. */
207
-        avgv = vec_pack(avghv, avglv);
208
-
209
-        /* Calculate a sum of abs differences vector. */
210
-        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
211
-
212
-        /* Add each 4 pixel group together and put 4 results into sad. */
213
-        sad = vec_sum4s(t5, sad);
214
-
215
-        pix1 += line_size;
216
-        pix3 += line_size;
217
-        /* Transfer the calculated values for pix3 into pix2. */
218
-        t1 = t3;
219
-        t2 = t4;
220
-    }
221
-    /* Sum up the four partial sums, and put the result into s. */
222
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
223
-    sumdiffs = vec_splat(sumdiffs, 3);
224
-    vec_ste(sumdiffs, 0, &s);
225
-
226
-    return s;
227
-}
228
-
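The vec_avg caveat in the comment above is easiest to see with numbers. Take the four source samples a = 3, b = 0, c = 0, d = 1: the correct half-pel value is (3 + 0 + 0 + 1 + 2) >> 2 = 1, but the two-stage form avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2, because each vec_avg rounds up on its own. That is why the code widens to unsigned shorts and averages explicitly; per pixel it amounts to:

    /* Scalar view of avghv/avglv (illustrative, not from the patch): */
    /* avg = (pix2[j] + pix2[j + 1] + pix3[j] + pix3[j + 1] + 2) >> 2; */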
229
-static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
230
-                         int line_size, int h)
231
-{
232
-    int i, s;
233
-    const vector unsigned int zero =
234
-        (const vector unsigned int) vec_splat_u32(0);
235
-    vector unsigned char perm = vec_lvsl(0, pix2);
236
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
237
-    vector signed int sumdiffs;
238
-
239
-    for (i = 0; i < h; i++) {
240
-        /* Read potentially unaligned pixels into t1 and t2. */
241
-        vector unsigned char pix2l = vec_ld(0,  pix2);
242
-        vector unsigned char pix2r = vec_ld(15, pix2);
243
-        vector unsigned char t1 = vec_ld(0, pix1);
244
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
245
-
246
-        /* Calculate a sum of abs differences vector. */
247
-        vector unsigned char t3 = vec_max(t1, t2);
248
-        vector unsigned char t4 = vec_min(t1, t2);
249
-        vector unsigned char t5 = vec_sub(t3, t4);
250
-
251
-        /* Add each 4 pixel group together and put 4 results into sad. */
252
-        sad = vec_sum4s(t5, sad);
253
-
254
-        pix1 += line_size;
255
-        pix2 += line_size;
256
-    }
257
-
258
-    /* Sum up the four partial sums, and put the result into s. */
259
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
260
-    sumdiffs = vec_splat(sumdiffs, 3);
261
-    vec_ste(sumdiffs, 0, &s);
262
-
263
-    return s;
264
-}
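Stripped of the alignment handling, sad16_altivec computes a plain 16 x h sum of absolute differences. A minimal scalar sketch (illustrative only, not part of the commit):

    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++) {
            int d = pix1[j] - pix2[j];
            s += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

The vector code gets |a - b| as vec_sub(vec_max(a, b), vec_min(a, b)) because the operands are unsigned chars, and vec_sum4s/vec_sums then fold the 16 byte differences per row into the single integer s.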
265
-
266
-static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
267
-                        int line_size, int h)
268
-{
269
-    int i, s;
270
-    const vector unsigned int zero =
271
-        (const vector unsigned int) vec_splat_u32(0);
272
-    const vector unsigned char permclear =
273
-        (vector unsigned char)
274
-        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
275
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
276
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
277
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
278
-    vector signed int sumdiffs;
279
-
280
-    for (i = 0; i < h; i++) {
281
-        /* Read potentially unaligned pixels into t1 and t2.
282
-         * Since we're reading 16 pixels, and actually only want 8,
283
-         * mask out the last 8 pixels. The 0s don't change the sum. */
284
-        vector unsigned char pix1l = vec_ld(0, pix1);
285
-        vector unsigned char pix1r = vec_ld(7, pix1);
286
-        vector unsigned char pix2l = vec_ld(0, pix2);
287
-        vector unsigned char pix2r = vec_ld(7, pix2);
288
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
289
-                                          permclear);
290
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
291
-                                          permclear);
292
-
293
-        /* Calculate a sum of abs differences vector. */
294
-        vector unsigned char t3 = vec_max(t1, t2);
295
-        vector unsigned char t4 = vec_min(t1, t2);
296
-        vector unsigned char t5 = vec_sub(t3, t4);
297
-
298
-        /* Add each 4 pixel group together and put 4 results into sad. */
299
-        sad = vec_sum4s(t5, sad);
300
-
301
-        pix1 += line_size;
302
-        pix2 += line_size;
303
-    }
304
-
305
-    /* Sum up the four partial sums, and put the result into s. */
306
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
307
-    sumdiffs = vec_splat(sumdiffs, 3);
308
-    vec_ste(sumdiffs, 0, &s);
309
-
310
-    return s;
311
-}
312
-
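sad8_altivec reuses the 16-wide data path: each row is still loaded as a full 16-byte vector, then ANDed with permclear so that bytes 8..15 become 0 in both operands and contribute |0 - 0| = 0 to the sum. Roughly:

    /* After the AND (illustrative): t1 = { pix1[0..7], 0,0,0,0,0,0,0,0 },
     * t2 = { pix2[0..7], 0,0,0,0,0,0,0,0 }, so only the first 8 bytes count. */

The same masking reappears in sse8_altivec below, where the zeroed bytes add 0 * 0 to the squared sum.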
313
-/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
314
- * It's the sad8_altivec code above w/ squaring added. */
315
-static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
316
-                        int line_size, int h)
317
-{
318
-    int i, s;
319
-    const vector unsigned int zero =
320
-        (const vector unsigned int) vec_splat_u32(0);
321
-    const vector unsigned char permclear =
322
-        (vector unsigned char)
323
-        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
324
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
325
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
326
-    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
327
-    vector signed int sumsqr;
328
-
329
-    for (i = 0; i < h; i++) {
330
-        /* Read potentially unaligned pixels into t1 and t2.
331
-         * Since we're reading 16 pixels, and actually only want 8,
332
-         * mask out the last 8 pixels. The 0s don't change the sum. */
333
-        vector unsigned char pix1l = vec_ld(0, pix1);
334
-        vector unsigned char pix1r = vec_ld(7, pix1);
335
-        vector unsigned char pix2l = vec_ld(0, pix2);
336
-        vector unsigned char pix2r = vec_ld(7, pix2);
337
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
338
-                                          permclear);
339
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
340
-                                          permclear);
341
-
342
-        /* Since we want to use unsigned chars, we can take advantage
343
-         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
344
-
345
-        /* Calculate abs differences vector. */
346
-        vector unsigned char t3 = vec_max(t1, t2);
347
-        vector unsigned char t4 = vec_min(t1, t2);
348
-        vector unsigned char t5 = vec_sub(t3, t4);
349
-
350
-        /* Square the values and add them to our sum. */
351
-        sum = vec_msum(t5, t5, sum);
352
-
353
-        pix1 += line_size;
354
-        pix2 += line_size;
355
-    }
356
-
357
-    /* Sum up the four partial sums, and put the result into s. */
358
-    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
359
-    sumsqr = vec_splat(sumsqr, 3);
360
-    vec_ste(sumsqr, 0, &s);
361
-
362
-    return s;
363
-}
364
-
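Because the inputs are unsigned chars, |a - b|^2 == (a - b)^2, so the absolute difference can be formed with max/min exactly as in the SAD routines and vec_msum(t5, t5, sum) then accumulates the squares. A scalar sketch of the same 8-wide SSE (illustrative, not from the patch):

    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;              /* (a - b)^2 == |a - b|^2 */
        }
        pix1 += line_size;
        pix2 += line_size;
    }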
365
-/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
366
- * It's the sad16_altivec code above w/ squaring added. */
367
-static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
368
-                         int line_size, int h)
369
-{
370
-    int i, s;
371
-    const vector unsigned int zero =
372
-        (const vector unsigned int) vec_splat_u32(0);
373
-    vector unsigned char perm = vec_lvsl(0, pix2);
374
-    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
375
-    vector signed int sumsqr;
376
-
377
-    for (i = 0; i < h; i++) {
378
-        /* Read potentially unaligned pixels into t1 and t2. */
379
-        vector unsigned char pix2l = vec_ld(0,  pix2);
380
-        vector unsigned char pix2r = vec_ld(15, pix2);
381
-        vector unsigned char t1 = vec_ld(0, pix1);
382
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
383
-
384
-        /* Since we want to use unsigned chars, we can take advantage
385
-         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
386
-
387
-        /* Calculate abs differences vector. */
388
-        vector unsigned char t3 = vec_max(t1, t2);
389
-        vector unsigned char t4 = vec_min(t1, t2);
390
-        vector unsigned char t5 = vec_sub(t3, t4);
391
-
392
-        /* Square the values and add them to our sum. */
393
-        sum = vec_msum(t5, t5, sum);
394
-
395
-        pix1 += line_size;
396
-        pix2 += line_size;
397
-    }
398
-
399
-    /* Sum up the four partial sums, and put the result into s. */
400
-    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
401
-    sumsqr = vec_splat(sumsqr, 3);
402
-    vec_ste(sumsqr, 0, &s);
403
-
404
-    return s;
405
-}
406
-
407
-static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
408
-                                     uint8_t *src, int stride, int h)
409
-{
410
-    int sum;
411
-    register const vector unsigned char vzero =
412
-        (const vector unsigned char) vec_splat_u8(0);
413
-    register vector signed short temp0, temp1, temp2, temp3, temp4,
414
-                                 temp5, temp6, temp7;
415
-    {
416
-        register const vector signed short vprod1 =
417
-            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
418
-        register const vector signed short vprod2 =
419
-            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
420
-        register const vector signed short vprod3 =
421
-            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
422
-        register const vector unsigned char perm1 =
423
-            (const vector unsigned char)
424
-            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
425
-              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
426
-        register const vector unsigned char perm2 =
427
-            (const vector unsigned char)
428
-            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
429
-              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
430
-        register const vector unsigned char perm3 =
431
-            (const vector unsigned char)
432
-            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
433
-              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
434
-
435
-#define ONEITERBUTTERFLY(i, res)                                            \
436
-    {                                                                       \
437
-        register vector unsigned char src1 = vec_ld(stride * i, src);       \
438
-        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
439
-        register vector unsigned char srcO =                                \
440
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
441
-        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
442
-        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
443
-        register vector unsigned char dstO =                                \
444
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
445
-                                                                            \
446
-        /* Promote the unsigned chars to signed shorts. */                  \
447
-        /* We're in the 8x8 function, we only care for the first 8. */      \
448
-        register vector signed short srcV =                                 \
449
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
450
-                                             (vector signed char) srcO);    \
451
-        register vector signed short dstV =                                 \
452
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
453
-                                             (vector signed char) dstO);    \
454
-                                                                            \
455
-        /* subtractions inside the first butterfly */                       \
456
-        register vector signed short but0 = vec_sub(srcV, dstV);            \
457
-        register vector signed short op1  = vec_perm(but0, but0, perm1);    \
458
-        register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
459
-        register vector signed short op2  = vec_perm(but1, but1, perm2);    \
460
-        register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
461
-        register vector signed short op3  = vec_perm(but2, but2, perm3);    \
462
-        res  = vec_mladd(but2, vprod3, op3);                                \
463
-    }
464
-        ONEITERBUTTERFLY(0, temp0);
465
-        ONEITERBUTTERFLY(1, temp1);
466
-        ONEITERBUTTERFLY(2, temp2);
467
-        ONEITERBUTTERFLY(3, temp3);
468
-        ONEITERBUTTERFLY(4, temp4);
469
-        ONEITERBUTTERFLY(5, temp5);
470
-        ONEITERBUTTERFLY(6, temp6);
471
-        ONEITERBUTTERFLY(7, temp7);
472
-    }
473
-#undef ONEITERBUTTERFLY
474
-    {
475
-        register vector signed int vsum;
476
-        register vector signed short line0  = vec_add(temp0, temp1);
477
-        register vector signed short line1  = vec_sub(temp0, temp1);
478
-        register vector signed short line2  = vec_add(temp2, temp3);
479
-        register vector signed short line3  = vec_sub(temp2, temp3);
480
-        register vector signed short line4  = vec_add(temp4, temp5);
481
-        register vector signed short line5  = vec_sub(temp4, temp5);
482
-        register vector signed short line6  = vec_add(temp6, temp7);
483
-        register vector signed short line7  = vec_sub(temp6, temp7);
484
-
485
-        register vector signed short line0B = vec_add(line0, line2);
486
-        register vector signed short line2B = vec_sub(line0, line2);
487
-        register vector signed short line1B = vec_add(line1, line3);
488
-        register vector signed short line3B = vec_sub(line1, line3);
489
-        register vector signed short line4B = vec_add(line4, line6);
490
-        register vector signed short line6B = vec_sub(line4, line6);
491
-        register vector signed short line5B = vec_add(line5, line7);
492
-        register vector signed short line7B = vec_sub(line5, line7);
493
-
494
-        register vector signed short line0C = vec_add(line0B, line4B);
495
-        register vector signed short line4C = vec_sub(line0B, line4B);
496
-        register vector signed short line1C = vec_add(line1B, line5B);
497
-        register vector signed short line5C = vec_sub(line1B, line5B);
498
-        register vector signed short line2C = vec_add(line2B, line6B);
499
-        register vector signed short line6C = vec_sub(line2B, line6B);
500
-        register vector signed short line3C = vec_add(line3B, line7B);
501
-        register vector signed short line7C = vec_sub(line3B, line7B);
502
-
503
-        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
504
-        vsum = vec_sum4s(vec_abs(line1C), vsum);
505
-        vsum = vec_sum4s(vec_abs(line2C), vsum);
506
-        vsum = vec_sum4s(vec_abs(line3C), vsum);
507
-        vsum = vec_sum4s(vec_abs(line4C), vsum);
508
-        vsum = vec_sum4s(vec_abs(line5C), vsum);
509
-        vsum = vec_sum4s(vec_abs(line6C), vsum);
510
-        vsum = vec_sum4s(vec_abs(line7C), vsum);
511
-        vsum = vec_sums(vsum, (vector signed int) vzero);
512
-        vsum = vec_splat(vsum, 3);
513
-        vec_ste(vsum, 0, &sum);
514
-    }
515
-    return sum;
516
-}
517
-
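In other words, hadamard8_diff8x8_altivec computes an 8x8 SATD: the difference block src - dst is run through a horizontal 8-point Hadamard transform (the three perm/mladd stages in ONEITERBUTTERFLY, with perm1/perm2/perm3 pairing elements at distance 1, 2 and 4), then through the same three butterfly stages vertically (the line*, line*B, line*C adds and subs across temp0..temp7), and finally the absolute values of all 64 coefficients are summed. One scalar butterfly stage at distance d looks like this (illustrative only):

    /* Radix-2 Hadamard stage over an 8-element row x[], d in {1, 2, 4}. */
    for (int j = 0; j < 8; j++) {
        if (!(j & d)) {
            int a = x[j], b = x[j + d];
            x[j]     = a + b;
            x[j + d] = a - b;
        }
    }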
518
-/*
519
- * 16x8 works with 16 elements; it makes it possible to avoid replicating loads and
520
- * gives the compiler more room for scheduling. It's only used from
521
- * inside hadamard8_diff16_altivec.
522
- *
523
- * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
524
- * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
525
- * registers by itself. The following code includes hand-made register
526
- * allocation. It's not clean, but on a 7450 the resulting code is much faster
527
- * (best case falls from 700+ cycles to 550).
528
- *
529
- * xlc doesn't add spill code, but it doesn't know how to schedule for the
530
- * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
531
- * 25% fewer instructions...)
532
- *
533
- * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
534
- * but xlc goes to around 660 on the regular C code...
535
- */
536
-static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
537
-                                      uint8_t *src, int stride, int h)
538
-{
539
-    int sum;
540
-    register vector signed short
541
-        temp0 __asm__ ("v0"),
542
-        temp1 __asm__ ("v1"),
543
-        temp2 __asm__ ("v2"),
544
-        temp3 __asm__ ("v3"),
545
-        temp4 __asm__ ("v4"),
546
-        temp5 __asm__ ("v5"),
547
-        temp6 __asm__ ("v6"),
548
-        temp7 __asm__ ("v7");
549
-    register vector signed short
550
-        temp0S __asm__ ("v8"),
551
-        temp1S __asm__ ("v9"),
552
-        temp2S __asm__ ("v10"),
553
-        temp3S __asm__ ("v11"),
554
-        temp4S __asm__ ("v12"),
555
-        temp5S __asm__ ("v13"),
556
-        temp6S __asm__ ("v14"),
557
-        temp7S __asm__ ("v15");
558
-    register const vector unsigned char vzero __asm__ ("v31") =
559
-        (const vector unsigned char) vec_splat_u8(0);
560
-    {
561
-        register const vector signed short vprod1 __asm__ ("v16") =
562
-            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
563
-
564
-        register const vector signed short vprod2 __asm__ ("v17") =
565
-            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
566
-
567
-        register const vector signed short vprod3 __asm__ ("v18") =
568
-            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
569
-
570
-        register const vector unsigned char perm1 __asm__ ("v19") =
571
-            (const vector unsigned char)
572
-            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
573
-              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
574
-
575
-        register const vector unsigned char perm2 __asm__ ("v20") =
576
-            (const vector unsigned char)
577
-            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
578
-              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
579
-
580
-        register const vector unsigned char perm3 __asm__ ("v21") =
581
-            (const vector unsigned char)
582
-            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
583
-              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
584
-
585
-#define ONEITERBUTTERFLY(i, res1, res2)                                     \
586
-    {                                                                       \
587
-        register vector unsigned char src1 __asm__ ("v22") =                \
588
-            vec_ld(stride * i, src);                                        \
589
-        register vector unsigned char src2 __asm__ ("v23") =                \
590
-            vec_ld(stride * i + 16, src);                                   \
591
-        register vector unsigned char srcO __asm__ ("v22") =                \
592
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
593
-        register vector unsigned char dst1 __asm__ ("v24") =                \
594
-            vec_ld(stride * i, dst);                                        \
595
-        register vector unsigned char dst2 __asm__ ("v25") =                \
596
-            vec_ld(stride * i + 16, dst);                                   \
597
-        register vector unsigned char dstO __asm__ ("v23") =                \
598
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
599
-                                                                            \
600
-        /* Promote the unsigned chars to signed shorts. */                  \
601
-        register vector signed short srcV __asm__ ("v24") =                 \
602
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
603
-                                             (vector signed char) srcO);    \
604
-        register vector signed short dstV __asm__ ("v25") =                 \
605
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
606
-                                             (vector signed char) dstO);    \
607
-        register vector signed short srcW __asm__ ("v26") =                 \
608
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
609
-                                             (vector signed char) srcO);    \
610
-        register vector signed short dstW __asm__ ("v27") =                 \
611
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
612
-                                             (vector signed char) dstO);    \
613
-                                                                            \
614
-        /* subtractions inside the first butterfly */                       \
615
-        register vector signed short but0  __asm__ ("v28") =                \
616
-            vec_sub(srcV, dstV);                                            \
617
-        register vector signed short but0S __asm__ ("v29") =                \
618
-            vec_sub(srcW, dstW);                                            \
619
-        register vector signed short op1   __asm__ ("v30") =                \
620
-            vec_perm(but0, but0, perm1);                                    \
621
-        register vector signed short but1  __asm__ ("v22") =                \
622
-            vec_mladd(but0, vprod1, op1);                                   \
623
-        register vector signed short op1S  __asm__ ("v23") =                \
624
-            vec_perm(but0S, but0S, perm1);                                  \
625
-        register vector signed short but1S __asm__ ("v24") =                \
626
-            vec_mladd(but0S, vprod1, op1S);                                 \
627
-        register vector signed short op2   __asm__ ("v25") =                \
628
-            vec_perm(but1, but1, perm2);                                    \
629
-        register vector signed short but2  __asm__ ("v26") =                \
630
-            vec_mladd(but1, vprod2, op2);                                   \
631
-        register vector signed short op2S  __asm__ ("v27") =                \
632
-            vec_perm(but1S, but1S, perm2);                                  \
633
-        register vector signed short but2S __asm__ ("v28") =                \
634
-            vec_mladd(but1S, vprod2, op2S);                                 \
635
-        register vector signed short op3   __asm__ ("v29") =                \
636
-            vec_perm(but2, but2, perm3);                                    \
637
-        register vector signed short op3S  __asm__ ("v30") =                \
638
-            vec_perm(but2S, but2S, perm3);                                  \
639
-        res1 = vec_mladd(but2, vprod3, op3);                                \
640
-        res2 = vec_mladd(but2S, vprod3, op3S);                              \
641
-    }
642
-        ONEITERBUTTERFLY(0, temp0, temp0S);
643
-        ONEITERBUTTERFLY(1, temp1, temp1S);
644
-        ONEITERBUTTERFLY(2, temp2, temp2S);
645
-        ONEITERBUTTERFLY(3, temp3, temp3S);
646
-        ONEITERBUTTERFLY(4, temp4, temp4S);
647
-        ONEITERBUTTERFLY(5, temp5, temp5S);
648
-        ONEITERBUTTERFLY(6, temp6, temp6S);
649
-        ONEITERBUTTERFLY(7, temp7, temp7S);
650
-    }
651
-#undef ONEITERBUTTERFLY
652
-    {
653
-        register vector signed int vsum;
654
-
655
-        register vector signed short line0  = vec_add(temp0, temp1);
656
-        register vector signed short line1  = vec_sub(temp0, temp1);
657
-        register vector signed short line2  = vec_add(temp2, temp3);
658
-        register vector signed short line3  = vec_sub(temp2, temp3);
659
-        register vector signed short line4  = vec_add(temp4, temp5);
660
-        register vector signed short line5  = vec_sub(temp4, temp5);
661
-        register vector signed short line6  = vec_add(temp6, temp7);
662
-        register vector signed short line7  = vec_sub(temp6, temp7);
663
-
664
-        register vector signed short line0B = vec_add(line0, line2);
665
-        register vector signed short line2B = vec_sub(line0, line2);
666
-        register vector signed short line1B = vec_add(line1, line3);
667
-        register vector signed short line3B = vec_sub(line1, line3);
668
-        register vector signed short line4B = vec_add(line4, line6);
669
-        register vector signed short line6B = vec_sub(line4, line6);
670
-        register vector signed short line5B = vec_add(line5, line7);
671
-        register vector signed short line7B = vec_sub(line5, line7);
672
-
673
-        register vector signed short line0C = vec_add(line0B, line4B);
674
-        register vector signed short line4C = vec_sub(line0B, line4B);
675
-        register vector signed short line1C = vec_add(line1B, line5B);
676
-        register vector signed short line5C = vec_sub(line1B, line5B);
677
-        register vector signed short line2C = vec_add(line2B, line6B);
678
-        register vector signed short line6C = vec_sub(line2B, line6B);
679
-        register vector signed short line3C = vec_add(line3B, line7B);
680
-        register vector signed short line7C = vec_sub(line3B, line7B);
681
-
682
-        register vector signed short line0S = vec_add(temp0S, temp1S);
683
-        register vector signed short line1S = vec_sub(temp0S, temp1S);
684
-        register vector signed short line2S = vec_add(temp2S, temp3S);
685
-        register vector signed short line3S = vec_sub(temp2S, temp3S);
686
-        register vector signed short line4S = vec_add(temp4S, temp5S);
687
-        register vector signed short line5S = vec_sub(temp4S, temp5S);
688
-        register vector signed short line6S = vec_add(temp6S, temp7S);
689
-        register vector signed short line7S = vec_sub(temp6S, temp7S);
690
-
691
-        register vector signed short line0BS = vec_add(line0S, line2S);
692
-        register vector signed short line2BS = vec_sub(line0S, line2S);
693
-        register vector signed short line1BS = vec_add(line1S, line3S);
694
-        register vector signed short line3BS = vec_sub(line1S, line3S);
695
-        register vector signed short line4BS = vec_add(line4S, line6S);
696
-        register vector signed short line6BS = vec_sub(line4S, line6S);
697
-        register vector signed short line5BS = vec_add(line5S, line7S);
698
-        register vector signed short line7BS = vec_sub(line5S, line7S);
699
-
700
-        register vector signed short line0CS = vec_add(line0BS, line4BS);
701
-        register vector signed short line4CS = vec_sub(line0BS, line4BS);
702
-        register vector signed short line1CS = vec_add(line1BS, line5BS);
703
-        register vector signed short line5CS = vec_sub(line1BS, line5BS);
704
-        register vector signed short line2CS = vec_add(line2BS, line6BS);
705
-        register vector signed short line6CS = vec_sub(line2BS, line6BS);
706
-        register vector signed short line3CS = vec_add(line3BS, line7BS);
707
-        register vector signed short line7CS = vec_sub(line3BS, line7BS);
708
-
709
-        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
710
-        vsum = vec_sum4s(vec_abs(line1C), vsum);
711
-        vsum = vec_sum4s(vec_abs(line2C), vsum);
712
-        vsum = vec_sum4s(vec_abs(line3C), vsum);
713
-        vsum = vec_sum4s(vec_abs(line4C), vsum);
714
-        vsum = vec_sum4s(vec_abs(line5C), vsum);
715
-        vsum = vec_sum4s(vec_abs(line6C), vsum);
716
-        vsum = vec_sum4s(vec_abs(line7C), vsum);
717
-
718
-        vsum = vec_sum4s(vec_abs(line0CS), vsum);
719
-        vsum = vec_sum4s(vec_abs(line1CS), vsum);
720
-        vsum = vec_sum4s(vec_abs(line2CS), vsum);
721
-        vsum = vec_sum4s(vec_abs(line3CS), vsum);
722
-        vsum = vec_sum4s(vec_abs(line4CS), vsum);
723
-        vsum = vec_sum4s(vec_abs(line5CS), vsum);
724
-        vsum = vec_sum4s(vec_abs(line6CS), vsum);
725
-        vsum = vec_sum4s(vec_abs(line7CS), vsum);
726
-        vsum = vec_sums(vsum, (vector signed int) vzero);
727
-        vsum = vec_splat(vsum, 3);
728
-        vec_ste(vsum, 0, &sum);
729
-    }
730
-    return sum;
731
-}
732
-
733
-static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
734
-                                    uint8_t *src, int stride, int h)
735
-{
736
-    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
737
-
738
-    if (h == 16) {
739
-        dst   += 8 * stride;
740
-        src   += 8 * stride;
741
-        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
742
-    }
743
-    return score;
744
-}
745
-#endif /* HAVE_ALTIVEC */
746
-
747
-av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
748
-{
749
-#if HAVE_ALTIVEC
750
-    if (!PPC_ALTIVEC(av_get_cpu_flags()))
751
-        return;
752
-
753
-    c->pix_abs[0][1] = sad16_x2_altivec;
754
-    c->pix_abs[0][2] = sad16_y2_altivec;
755
-    c->pix_abs[0][3] = sad16_xy2_altivec;
756
-    c->pix_abs[0][0] = sad16_altivec;
757
-    c->pix_abs[1][0] = sad8_altivec;
758
-
759
-    c->sad[0] = sad16_altivec;
760
-    c->sad[1] = sad8_altivec;
761
-    c->sse[0] = sse16_altivec;
762
-    c->sse[1] = sse8_altivec;
763
-
764
-    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
765
-    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
766
-#endif /* HAVE_ALTIVEC */
767
-}
768 1
new file mode 100644
... ...
@@ -0,0 +1,767 @@
0
+/*
1
+ * Copyright (c) 2002 Brian Foley
2
+ * Copyright (c) 2002 Dieter Shirley
3
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
4
+ *
5
+ * This file is part of FFmpeg.
6
+ *
7
+ * FFmpeg is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * FFmpeg is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with FFmpeg; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#include "config.h"
23
+#if HAVE_ALTIVEC_H
24
+#include <altivec.h>
25
+#endif
26
+
27
+#include "libavutil/attributes.h"
28
+#include "libavutil/cpu.h"
29
+#include "libavutil/ppc/cpu.h"
30
+#include "libavutil/ppc/types_altivec.h"
31
+#include "libavutil/ppc/util_altivec.h"
32
+#include "libavcodec/avcodec.h"
33
+#include "libavcodec/mpegvideo.h"
34
+#include "libavcodec/me_cmp.h"
35
+
36
+#if HAVE_ALTIVEC
37
+static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38
+                            int line_size, int h)
39
+{
40
+    int i, s = 0;
41
+    const vector unsigned char zero =
42
+        (const vector unsigned char) vec_splat_u8(0);
43
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
44
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
45
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
46
+    vector signed int sumdiffs;
47
+
48
+    for (i = 0; i < h; i++) {
49
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
50
+         * pix1v: pix1[0] - pix1[15]
51
+         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
52
+        vector unsigned char pix1v  = vec_ld(0,  pix1);
53
+        vector unsigned char pix2l  = vec_ld(0,  pix2);
54
+        vector unsigned char pix2r  = vec_ld(16, pix2);
55
+        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
56
+        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
57
+
58
+        /* Calculate the average vector. */
59
+        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
60
+
61
+        /* Calculate a sum of abs differences vector. */
62
+        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
63
+                                          vec_min(pix1v, avgv));
64
+
65
+        /* Add each 4 pixel group together and put 4 results into sad. */
66
+        sad = vec_sum4s(t5, sad);
67
+
68
+        pix1 += line_size;
69
+        pix2 += line_size;
70
+    }
71
+    /* Sum up the four partial sums, and put the result into s. */
72
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
73
+    sumdiffs = vec_splat(sumdiffs, 3);
74
+    vec_ste(sumdiffs, 0, &s);
75
+
76
+    return s;
77
+}
78
+
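Per column, the routine above scores the horizontally half-pel interpolated reference: pix1[j] is compared against (pix2[j] + pix2[j + 1] + 1) >> 1, with the rounding up supplied by vec_avg. That is why both pix2v and the one-byte-shifted pix2iv are extracted from the same pair of aligned loads. An illustrative per-pixel form (not from the patch):

    int avg = (pix2[j] + pix2[j + 1] + 1) >> 1;   /* horizontal half-pel, rounds up */
    int d   = pix1[j] - avg;
    sad    += d < 0 ? -d : d;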
79
+static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
80
+                            int line_size, int h)
81
+{
82
+    int i, s = 0;
83
+    const vector unsigned char zero =
84
+        (const vector unsigned char) vec_splat_u8(0);
85
+    vector unsigned char perm = vec_lvsl(0, pix2);
86
+    vector unsigned char pix1v, pix3v, avgv, t5;
87
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
88
+    vector signed int sumdiffs;
89
+    uint8_t *pix3 = pix2 + line_size;
90
+
91
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
92
+     * iteration becomes pix2 in the next iteration. We can use this
93
+     * fact to avoid a potentially expensive unaligned read, each
94
+     * time around the loop.
95
+     * Read unaligned pixels into our vectors. The vectors are as follows:
96
+     * pix2v: pix2[0] - pix2[15]
97
+     * Split the pixel vectors into shorts. */
98
+    vector unsigned char pix2l = vec_ld(0,  pix2);
99
+    vector unsigned char pix2r = vec_ld(15, pix2);
100
+    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
101
+
102
+    for (i = 0; i < h; i++) {
103
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
104
+         * pix1v: pix1[0] - pix1[15]
105
+         * pix3v: pix3[0] - pix3[15] */
106
+        pix1v = vec_ld(0,  pix1);
107
+
108
+        pix2l = vec_ld(0,  pix3);
109
+        pix2r = vec_ld(15, pix3);
110
+        pix3v = vec_perm(pix2l, pix2r, perm);
111
+
112
+        /* Calculate the average vector. */
113
+        avgv = vec_avg(pix2v, pix3v);
114
+
115
+        /* Calculate a sum of abs differences vector. */
116
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
117
+
118
+        /* Add each 4 pixel group together and put 4 results into sad. */
119
+        sad = vec_sum4s(t5, sad);
120
+
121
+        pix1 += line_size;
122
+        pix2v = pix3v;
123
+        pix3 += line_size;
124
+    }
125
+
126
+    /* Sum up the four partial sums, and put the result into s. */
127
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
128
+    sumdiffs = vec_splat(sumdiffs, 3);
129
+    vec_ste(sumdiffs, 0, &s);
130
+    return s;
131
+}
132
+
133
+static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
134
+                             int line_size, int h)
135
+{
136
+    int i, s = 0;
137
+    uint8_t *pix3 = pix2 + line_size;
138
+    const vector unsigned char zero =
139
+        (const vector unsigned char) vec_splat_u8(0);
140
+    const vector unsigned short two =
141
+        (const vector unsigned short) vec_splat_u16(2);
142
+    vector unsigned char avgv, t5;
143
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
144
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
145
+    vector unsigned char pix1v, pix3v, pix3iv;
146
+    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
147
+    vector unsigned short avghv, avglv;
148
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
149
+    vector signed int sumdiffs;
150
+
151
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
152
+     * iteration becomes pix2 in the next iteration. We can use this
153
+     * fact to avoid a potentially expensive unaligned read, as well
154
+     * as some splitting, and vector addition each time around the loop.
155
+     * Read unaligned pixels into our vectors. The vectors are as follows:
156
+     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
157
+     * Split the pixel vectors into shorts. */
158
+    vector unsigned char pix2l  = vec_ld(0,  pix2);
159
+    vector unsigned char pix2r  = vec_ld(16, pix2);
160
+    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
161
+    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
162
+
163
+    vector unsigned short pix2hv  =
164
+        (vector unsigned short) vec_mergeh(zero, pix2v);
165
+    vector unsigned short pix2lv  =
166
+        (vector unsigned short) vec_mergel(zero, pix2v);
167
+    vector unsigned short pix2ihv =
168
+        (vector unsigned short) vec_mergeh(zero, pix2iv);
169
+    vector unsigned short pix2ilv =
170
+        (vector unsigned short) vec_mergel(zero, pix2iv);
171
+    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
172
+    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
173
+    vector unsigned short t3, t4;
174
+
175
+    for (i = 0; i < h; i++) {
176
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
177
+         * pix1v: pix1[0] - pix1[15]
178
+         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
179
+        pix1v  = vec_ld(0, pix1);
180
+
181
+        pix2l  = vec_ld(0, pix3);
182
+        pix2r  = vec_ld(16, pix3);
183
+        pix3v  = vec_perm(pix2l, pix2r, perm1);
184
+        pix3iv = vec_perm(pix2l, pix2r, perm2);
185
+
186
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
187
+         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
188
+         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
189
+         * it should be 1. Instead, we have to split the pixel vectors into
190
+         * vectors of shorts and do the averaging by hand. */
191
+
192
+        /* Split the pixel vectors into shorts. */
193
+        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
194
+        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
195
+        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
196
+        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
197
+
198
+        /* Do the averaging on them. */
199
+        t3 = vec_add(pix3hv, pix3ihv);
200
+        t4 = vec_add(pix3lv, pix3ilv);
201
+
202
+        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
203
+        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
204
+
205
+        /* Pack the shorts back into a result. */
206
+        avgv = vec_pack(avghv, avglv);
207
+
208
+        /* Calculate a sum of abs differences vector. */
209
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
210
+
211
+        /* Add each 4 pixel group together and put 4 results into sad. */
212
+        sad = vec_sum4s(t5, sad);
213
+
214
+        pix1 += line_size;
215
+        pix3 += line_size;
216
+        /* Transfer the calculated values for pix3 into pix2. */
217
+        t1 = t3;
218
+        t2 = t4;
219
+    }
220
+    /* Sum up the four partial sums, and put the result into s. */
221
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
222
+    sumdiffs = vec_splat(sumdiffs, 3);
223
+    vec_ste(sumdiffs, 0, &s);
224
+
225
+    return s;
226
+}
227
+
228
+static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
229
+                         int line_size, int h)
230
+{
231
+    int i, s;
232
+    const vector unsigned int zero =
233
+        (const vector unsigned int) vec_splat_u32(0);
234
+    vector unsigned char perm = vec_lvsl(0, pix2);
235
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
236
+    vector signed int sumdiffs;
237
+
238
+    for (i = 0; i < h; i++) {
239
+        /* Read potentially unaligned pixels into t1 and t2. */
240
+        vector unsigned char pix2l = vec_ld(0,  pix2);
241
+        vector unsigned char pix2r = vec_ld(15, pix2);
242
+        vector unsigned char t1 = vec_ld(0, pix1);
243
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
244
+
245
+        /* Calculate a sum of abs differences vector. */
246
+        vector unsigned char t3 = vec_max(t1, t2);
247
+        vector unsigned char t4 = vec_min(t1, t2);
248
+        vector unsigned char t5 = vec_sub(t3, t4);
249
+
250
+        /* Add each 4 pixel group together and put 4 results into sad. */
251
+        sad = vec_sum4s(t5, sad);
252
+
253
+        pix1 += line_size;
254
+        pix2 += line_size;
255
+    }
256
+
257
+    /* Sum up the four partial sums, and put the result into s. */
258
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
259
+    sumdiffs = vec_splat(sumdiffs, 3);
260
+    vec_ste(sumdiffs, 0, &s);
261
+
262
+    return s;
263
+}
264
+
265
+static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
266
+                        int line_size, int h)
267
+{
268
+    int i, s;
269
+    const vector unsigned int zero =
270
+        (const vector unsigned int) vec_splat_u32(0);
271
+    const vector unsigned char permclear =
272
+        (vector unsigned char)
273
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
274
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
275
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
276
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
277
+    vector signed int sumdiffs;
278
+
279
+    for (i = 0; i < h; i++) {
280
+        /* Read potentially unaligned pixels into t1 and t2.
281
+         * Since we're reading 16 pixels, and actually only want 8,
282
+         * mask out the last 8 pixels. The 0s don't change the sum. */
283
+        vector unsigned char pix1l = vec_ld(0, pix1);
284
+        vector unsigned char pix1r = vec_ld(7, pix1);
285
+        vector unsigned char pix2l = vec_ld(0, pix2);
286
+        vector unsigned char pix2r = vec_ld(7, pix2);
287
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
288
+                                          permclear);
289
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
290
+                                          permclear);
291
+
292
+        /* Calculate a sum of abs differences vector. */
293
+        vector unsigned char t3 = vec_max(t1, t2);
294
+        vector unsigned char t4 = vec_min(t1, t2);
295
+        vector unsigned char t5 = vec_sub(t3, t4);
296
+
297
+        /* Add each 4 pixel group together and put 4 results into sad. */
298
+        sad = vec_sum4s(t5, sad);
299
+
300
+        pix1 += line_size;
301
+        pix2 += line_size;
302
+    }
303
+
304
+    /* Sum up the four partial sums, and put the result into s. */
305
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
306
+    sumdiffs = vec_splat(sumdiffs, 3);
307
+    vec_ste(sumdiffs, 0, &s);
308
+
309
+    return s;
310
+}
311
+
312
+/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
313
+ * It's the sad8_altivec code above w/ squaring added. */
314
+static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
315
+                        int line_size, int h)
316
+{
317
+    int i, s;
318
+    const vector unsigned int zero =
319
+        (const vector unsigned int) vec_splat_u32(0);
320
+    const vector unsigned char permclear =
321
+        (vector unsigned char)
322
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
323
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
324
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
325
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
326
+    vector signed int sumsqr;
327
+
328
+    for (i = 0; i < h; i++) {
329
+        /* Read potentially unaligned pixels into t1 and t2.
330
+         * Since we're reading 16 pixels, and actually only want 8,
331
+         * mask out the last 8 pixels. The 0s don't change the sum. */
332
+        vector unsigned char pix1l = vec_ld(0, pix1);
333
+        vector unsigned char pix1r = vec_ld(7, pix1);
334
+        vector unsigned char pix2l = vec_ld(0, pix2);
335
+        vector unsigned char pix2r = vec_ld(7, pix2);
336
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
337
+                                          permclear);
338
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
339
+                                          permclear);
340
+
341
+        /* Since we want to use unsigned chars, we can take advantage
342
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
343
+
344
+        /* Calculate abs differences vector. */
345
+        vector unsigned char t3 = vec_max(t1, t2);
346
+        vector unsigned char t4 = vec_min(t1, t2);
347
+        vector unsigned char t5 = vec_sub(t3, t4);
348
+
349
+        /* Square the values and add them to our sum. */
350
+        sum = vec_msum(t5, t5, sum);
351
+
352
+        pix1 += line_size;
353
+        pix2 += line_size;
354
+    }
355
+
356
+    /* Sum up the four partial sums, and put the result into s. */
357
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
358
+    sumsqr = vec_splat(sumsqr, 3);
359
+    vec_ste(sumsqr, 0, &s);
360
+
361
+    return s;
362
+}
363
+
364
+/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
365
+ * It's the sad16_altivec code above w/ squaring added. */
366
+static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
367
+                         int line_size, int h)
368
+{
369
+    int i, s;
370
+    const vector unsigned int zero =
371
+        (const vector unsigned int) vec_splat_u32(0);
372
+    vector unsigned char perm = vec_lvsl(0, pix2);
373
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
374
+    vector signed int sumsqr;
375
+
376
+    for (i = 0; i < h; i++) {
377
+        /* Read potentially unaligned pixels into t1 and t2. */
378
+        vector unsigned char pix2l = vec_ld(0,  pix2);
379
+        vector unsigned char pix2r = vec_ld(15, pix2);
380
+        vector unsigned char t1 = vec_ld(0, pix1);
381
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
382
+
383
+        /* Since we want to use unsigned chars, we can take advantage
384
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
385
+
386
+        /* Calculate abs differences vector. */
387
+        vector unsigned char t3 = vec_max(t1, t2);
388
+        vector unsigned char t4 = vec_min(t1, t2);
389
+        vector unsigned char t5 = vec_sub(t3, t4);
390
+
391
+        /* Square the values and add them to our sum. */
392
+        sum = vec_msum(t5, t5, sum);
393
+
394
+        pix1 += line_size;
395
+        pix2 += line_size;
396
+    }
397
+
398
+    /* Sum up the four partial sums, and put the result into s. */
399
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
400
+    sumsqr = vec_splat(sumsqr, 3);
401
+    vec_ste(sumsqr, 0, &s);
402
+
403
+    return s;
404
+}
405
+
406
+static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
407
+                                     uint8_t *src, int stride, int h)
408
+{
409
+    int sum;
410
+    register const vector unsigned char vzero =
411
+        (const vector unsigned char) vec_splat_u8(0);
412
+    register vector signed short temp0, temp1, temp2, temp3, temp4,
413
+                                 temp5, temp6, temp7;
414
+    {
415
+        register const vector signed short vprod1 =
416
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
417
+        register const vector signed short vprod2 =
418
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
419
+        register const vector signed short vprod3 =
420
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
421
+        register const vector unsigned char perm1 =
422
+            (const vector unsigned char)
423
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
424
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
425
+        register const vector unsigned char perm2 =
426
+            (const vector unsigned char)
427
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
428
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
429
+        register const vector unsigned char perm3 =
430
+            (const vector unsigned char)
431
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
432
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
433
+
434
+#define ONEITERBUTTERFLY(i, res)                                            \
435
+    {                                                                       \
436
+        register vector unsigned char src1 = vec_ld(stride * i, src);       \
437
+        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
438
+        register vector unsigned char srcO =                                \
439
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
440
+        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
441
+        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
442
+        register vector unsigned char dstO =                                \
443
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
444
+                                                                            \
445
+        /* Promote the unsigned chars to signed shorts. */                  \
446
+        /* We're in the 8x8 function, we only care for the first 8. */      \
447
+        register vector signed short srcV =                                 \
448
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
449
+                                             (vector signed char) srcO);    \
450
+        register vector signed short dstV =                                 \
451
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
452
+                                             (vector signed char) dstO);    \
453
+                                                                            \
454
+        /* subtractions inside the first butterfly */                       \
455
+        register vector signed short but0 = vec_sub(srcV, dstV);            \
456
+        register vector signed short op1  = vec_perm(but0, but0, perm1);    \
457
+        register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
458
+        register vector signed short op2  = vec_perm(but1, but1, perm2);    \
459
+        register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
460
+        register vector signed short op3  = vec_perm(but2, but2, perm3);    \
461
+        res  = vec_mladd(but2, vprod3, op3);                                \
462
+    }
463
+        ONEITERBUTTERFLY(0, temp0);
464
+        ONEITERBUTTERFLY(1, temp1);
465
+        ONEITERBUTTERFLY(2, temp2);
466
+        ONEITERBUTTERFLY(3, temp3);
467
+        ONEITERBUTTERFLY(4, temp4);
468
+        ONEITERBUTTERFLY(5, temp5);
469
+        ONEITERBUTTERFLY(6, temp6);
470
+        ONEITERBUTTERFLY(7, temp7);
471
+    }
472
+#undef ONEITERBUTTERFLY
473
+    {
474
+        register vector signed int vsum;
475
+        register vector signed short line0  = vec_add(temp0, temp1);
476
+        register vector signed short line1  = vec_sub(temp0, temp1);
477
+        register vector signed short line2  = vec_add(temp2, temp3);
478
+        register vector signed short line3  = vec_sub(temp2, temp3);
479
+        register vector signed short line4  = vec_add(temp4, temp5);
480
+        register vector signed short line5  = vec_sub(temp4, temp5);
481
+        register vector signed short line6  = vec_add(temp6, temp7);
482
+        register vector signed short line7  = vec_sub(temp6, temp7);
483
+
484
+        register vector signed short line0B = vec_add(line0, line2);
485
+        register vector signed short line2B = vec_sub(line0, line2);
486
+        register vector signed short line1B = vec_add(line1, line3);
487
+        register vector signed short line3B = vec_sub(line1, line3);
488
+        register vector signed short line4B = vec_add(line4, line6);
489
+        register vector signed short line6B = vec_sub(line4, line6);
490
+        register vector signed short line5B = vec_add(line5, line7);
491
+        register vector signed short line7B = vec_sub(line5, line7);
492
+
493
+        register vector signed short line0C = vec_add(line0B, line4B);
494
+        register vector signed short line4C = vec_sub(line0B, line4B);
495
+        register vector signed short line1C = vec_add(line1B, line5B);
496
+        register vector signed short line5C = vec_sub(line1B, line5B);
497
+        register vector signed short line2C = vec_add(line2B, line6B);
498
+        register vector signed short line6C = vec_sub(line2B, line6B);
499
+        register vector signed short line3C = vec_add(line3B, line7B);
500
+        register vector signed short line7C = vec_sub(line3B, line7B);
501
+
502
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
503
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
504
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
505
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
506
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
507
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
508
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
509
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
510
+        vsum = vec_sums(vsum, (vector signed int) vzero);
511
+        vsum = vec_splat(vsum, 3);
512
+        vec_ste(vsum, 0, &sum);
513
+    }
514
+    return sum;
515
+}
516
+
517
+/*
518
+ * 16x8 works with 16 elements; it avoids replicating loads and
519
+ * gives the compiler more room for scheduling. It's only used from
520
+ * inside hadamard8_diff16_altivec.
521
+ *
522
+ * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
523
+ * a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
524
+ * registers by itself. The following code includes hand-made register
525
+ * allocation. It's not clean, but on a 7450 the resulting code is much faster
526
+ * (best case falls from 700+ cycles to 550).
527
+ *
528
+ * xlc doesn't add spill code, but it doesn't know how to schedule for the
529
+ * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
530
+ * 25% fewer instructions...)
531
+ *
532
+ * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
533
+ * but xlc goes to around 660 on the regular C code...
534
+ */
535
+static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
536
+                                      uint8_t *src, int stride, int h)
537
+{
538
+    int sum;
539
+    register vector signed short
540
+        temp0 __asm__ ("v0"),
541
+        temp1 __asm__ ("v1"),
542
+        temp2 __asm__ ("v2"),
543
+        temp3 __asm__ ("v3"),
544
+        temp4 __asm__ ("v4"),
545
+        temp5 __asm__ ("v5"),
546
+        temp6 __asm__ ("v6"),
547
+        temp7 __asm__ ("v7");
548
+    register vector signed short
549
+        temp0S __asm__ ("v8"),
550
+        temp1S __asm__ ("v9"),
551
+        temp2S __asm__ ("v10"),
552
+        temp3S __asm__ ("v11"),
553
+        temp4S __asm__ ("v12"),
554
+        temp5S __asm__ ("v13"),
555
+        temp6S __asm__ ("v14"),
556
+        temp7S __asm__ ("v15");
557
+    register const vector unsigned char vzero __asm__ ("v31") =
558
+        (const vector unsigned char) vec_splat_u8(0);
559
+    {
560
+        register const vector signed short vprod1 __asm__ ("v16") =
561
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
562
+
563
+        register const vector signed short vprod2 __asm__ ("v17") =
564
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
565
+
566
+        register const vector signed short vprod3 __asm__ ("v18") =
567
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
568
+
569
+        register const vector unsigned char perm1 __asm__ ("v19") =
570
+            (const vector unsigned char)
571
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
572
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
573
+
574
+        register const vector unsigned char perm2 __asm__ ("v20") =
575
+            (const vector unsigned char)
576
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
577
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
578
+
579
+        register const vector unsigned char perm3 __asm__ ("v21") =
580
+            (const vector unsigned char)
581
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
582
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
583
+
584
+#define ONEITERBUTTERFLY(i, res1, res2)                                     \
585
+    {                                                                       \
586
+        register vector unsigned char src1 __asm__ ("v22") =                \
587
+            vec_ld(stride * i, src);                                        \
588
+        register vector unsigned char src2 __asm__ ("v23") =                \
589
+            vec_ld(stride * i + 16, src);                                   \
590
+        register vector unsigned char srcO __asm__ ("v22") =                \
591
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
592
+        register vector unsigned char dst1 __asm__ ("v24") =                \
593
+            vec_ld(stride * i, dst);                                        \
594
+        register vector unsigned char dst2 __asm__ ("v25") =                \
595
+            vec_ld(stride * i + 16, dst);                                   \
596
+        register vector unsigned char dstO __asm__ ("v23") =                \
597
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
598
+                                                                            \
599
+        /* Promote the unsigned chars to signed shorts. */                  \
600
+        register vector signed short srcV __asm__ ("v24") =                 \
601
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
602
+                                             (vector signed char) srcO);    \
603
+        register vector signed short dstV __asm__ ("v25") =                 \
604
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
605
+                                             (vector signed char) dstO);    \
606
+        register vector signed short srcW __asm__ ("v26") =                 \
607
+            (vector signed short) vec_mergel((vector signed char) vzero,    \
608
+                                             (vector signed char) srcO);    \
609
+        register vector signed short dstW __asm__ ("v27") =                 \
610
+            (vector signed short) vec_mergel((vector signed char) vzero,    \
611
+                                             (vector signed char) dstO);    \
612
+                                                                            \
613
+        /* subtractions inside the first butterfly */                       \
614
+        register vector signed short but0  __asm__ ("v28") =                \
615
+            vec_sub(srcV, dstV);                                            \
616
+        register vector signed short but0S __asm__ ("v29") =                \
617
+            vec_sub(srcW, dstW);                                            \
618
+        register vector signed short op1   __asm__ ("v30") =                \
619
+            vec_perm(but0, but0, perm1);                                    \
620
+        register vector signed short but1  __asm__ ("v22") =                \
621
+            vec_mladd(but0, vprod1, op1);                                   \
622
+        register vector signed short op1S  __asm__ ("v23") =                \
623
+            vec_perm(but0S, but0S, perm1);                                  \
624
+        register vector signed short but1S __asm__ ("v24") =                \
625
+            vec_mladd(but0S, vprod1, op1S);                                 \
626
+        register vector signed short op2   __asm__ ("v25") =                \
627
+            vec_perm(but1, but1, perm2);                                    \
628
+        register vector signed short but2  __asm__ ("v26") =                \
629
+            vec_mladd(but1, vprod2, op2);                                   \
630
+        register vector signed short op2S  __asm__ ("v27") =                \
631
+            vec_perm(but1S, but1S, perm2);                                  \
632
+        register vector signed short but2S __asm__ ("v28") =                \
633
+            vec_mladd(but1S, vprod2, op2S);                                 \
634
+        register vector signed short op3   __asm__ ("v29") =                \
635
+            vec_perm(but2, but2, perm3);                                    \
636
+        register vector signed short op3S  __asm__ ("v30") =                \
637
+            vec_perm(but2S, but2S, perm3);                                  \
638
+        res1 = vec_mladd(but2, vprod3, op3);                                \
639
+        res2 = vec_mladd(but2S, vprod3, op3S);                              \
640
+    }
641
+        ONEITERBUTTERFLY(0, temp0, temp0S);
642
+        ONEITERBUTTERFLY(1, temp1, temp1S);
643
+        ONEITERBUTTERFLY(2, temp2, temp2S);
644
+        ONEITERBUTTERFLY(3, temp3, temp3S);
645
+        ONEITERBUTTERFLY(4, temp4, temp4S);
646
+        ONEITERBUTTERFLY(5, temp5, temp5S);
647
+        ONEITERBUTTERFLY(6, temp6, temp6S);
648
+        ONEITERBUTTERFLY(7, temp7, temp7S);
649
+    }
650
+#undef ONEITERBUTTERFLY
651
+    {
652
+        register vector signed int vsum;
653
+
654
+        register vector signed short line0  = vec_add(temp0, temp1);
655
+        register vector signed short line1  = vec_sub(temp0, temp1);
656
+        register vector signed short line2  = vec_add(temp2, temp3);
657
+        register vector signed short line3  = vec_sub(temp2, temp3);
658
+        register vector signed short line4  = vec_add(temp4, temp5);
659
+        register vector signed short line5  = vec_sub(temp4, temp5);
660
+        register vector signed short line6  = vec_add(temp6, temp7);
661
+        register vector signed short line7  = vec_sub(temp6, temp7);
662
+
663
+        register vector signed short line0B = vec_add(line0, line2);
664
+        register vector signed short line2B = vec_sub(line0, line2);
665
+        register vector signed short line1B = vec_add(line1, line3);
666
+        register vector signed short line3B = vec_sub(line1, line3);
667
+        register vector signed short line4B = vec_add(line4, line6);
668
+        register vector signed short line6B = vec_sub(line4, line6);
669
+        register vector signed short line5B = vec_add(line5, line7);
670
+        register vector signed short line7B = vec_sub(line5, line7);
671
+
672
+        register vector signed short line0C = vec_add(line0B, line4B);
673
+        register vector signed short line4C = vec_sub(line0B, line4B);
674
+        register vector signed short line1C = vec_add(line1B, line5B);
675
+        register vector signed short line5C = vec_sub(line1B, line5B);
676
+        register vector signed short line2C = vec_add(line2B, line6B);
677
+        register vector signed short line6C = vec_sub(line2B, line6B);
678
+        register vector signed short line3C = vec_add(line3B, line7B);
679
+        register vector signed short line7C = vec_sub(line3B, line7B);
680
+
681
+        register vector signed short line0S = vec_add(temp0S, temp1S);
682
+        register vector signed short line1S = vec_sub(temp0S, temp1S);
683
+        register vector signed short line2S = vec_add(temp2S, temp3S);
684
+        register vector signed short line3S = vec_sub(temp2S, temp3S);
685
+        register vector signed short line4S = vec_add(temp4S, temp5S);
686
+        register vector signed short line5S = vec_sub(temp4S, temp5S);
687
+        register vector signed short line6S = vec_add(temp6S, temp7S);
688
+        register vector signed short line7S = vec_sub(temp6S, temp7S);
689
+
690
+        register vector signed short line0BS = vec_add(line0S, line2S);
691
+        register vector signed short line2BS = vec_sub(line0S, line2S);
692
+        register vector signed short line1BS = vec_add(line1S, line3S);
693
+        register vector signed short line3BS = vec_sub(line1S, line3S);
694
+        register vector signed short line4BS = vec_add(line4S, line6S);
695
+        register vector signed short line6BS = vec_sub(line4S, line6S);
696
+        register vector signed short line5BS = vec_add(line5S, line7S);
697
+        register vector signed short line7BS = vec_sub(line5S, line7S);
698
+
699
+        register vector signed short line0CS = vec_add(line0BS, line4BS);
700
+        register vector signed short line4CS = vec_sub(line0BS, line4BS);
701
+        register vector signed short line1CS = vec_add(line1BS, line5BS);
702
+        register vector signed short line5CS = vec_sub(line1BS, line5BS);
703
+        register vector signed short line2CS = vec_add(line2BS, line6BS);
704
+        register vector signed short line6CS = vec_sub(line2BS, line6BS);
705
+        register vector signed short line3CS = vec_add(line3BS, line7BS);
706
+        register vector signed short line7CS = vec_sub(line3BS, line7BS);
707
+
708
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
709
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
710
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
711
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
712
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
713
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
714
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
715
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
716
+
717
+        vsum = vec_sum4s(vec_abs(line0CS), vsum);
718
+        vsum = vec_sum4s(vec_abs(line1CS), vsum);
719
+        vsum = vec_sum4s(vec_abs(line2CS), vsum);
720
+        vsum = vec_sum4s(vec_abs(line3CS), vsum);
721
+        vsum = vec_sum4s(vec_abs(line4CS), vsum);
722
+        vsum = vec_sum4s(vec_abs(line5CS), vsum);
723
+        vsum = vec_sum4s(vec_abs(line6CS), vsum);
724
+        vsum = vec_sum4s(vec_abs(line7CS), vsum);
725
+        vsum = vec_sums(vsum, (vector signed int) vzero);
726
+        vsum = vec_splat(vsum, 3);
727
+        vec_ste(vsum, 0, &sum);
728
+    }
729
+    return sum;
730
+}
731
+
732
+static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
733
+                                    uint8_t *src, int stride, int h)
734
+{
735
+    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
736
+
737
+    if (h == 16) {
738
+        dst   += 8 * stride;
739
+        src   += 8 * stride;
740
+        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
741
+    }
742
+    return score;
743
+}
744
+#endif /* HAVE_ALTIVEC */
745
+
746
+av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
747
+{
748
+#if HAVE_ALTIVEC
749
+    if (!PPC_ALTIVEC(av_get_cpu_flags()))
750
+        return;
751
+
752
+    c->pix_abs[0][1] = sad16_x2_altivec;
753
+    c->pix_abs[0][2] = sad16_y2_altivec;
754
+    c->pix_abs[0][3] = sad16_xy2_altivec;
755
+    c->pix_abs[0][0] = sad16_altivec;
756
+    c->pix_abs[1][0] = sad8_altivec;
757
+
758
+    c->sad[0] = sad16_altivec;
759
+    c->sad[1] = sad8_altivec;
760
+    c->sse[0] = sse16_altivec;
761
+    c->sse[1] = sse8_altivec;
762
+
763
+    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
764
+    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
765
+#endif /* HAVE_ALTIVEC */
766
+}
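
For orientation, here is a minimal consumer-side sketch (not part of this commit) of how the pointers installed above are reached: ff_me_cmp_init() fills in the C defaults and dispatches to the per-arch init such as ff_me_cmp_init_ppc(), and callers like snowenc.c then pick a metric with ff_set_cmp(). Function and field names follow me_cmp.h as of this merge; the helper itself is hypothetical.

    #include <stdint.h>
    #include "avcodec.h"
    #include "me_cmp.h"
    #include "mpegvideo.h"

    /* Hypothetical in-tree helper: score one 16x16 block with whichever
     * me_cmp implementation (C, AltiVec, MMX, ...) ff_me_cmp_init() picked. */
    static int score_block(MpegEncContext *s, AVCodecContext *avctx,
                           uint8_t *cur, uint8_t *ref, int stride)
    {
        MECmpContext mecc;

        ff_me_cmp_init(&mecc, avctx);                  /* C defaults + arch override */
        ff_set_cmp(&mecc, mecc.me_cmp, avctx->me_cmp); /* e.g. FF_CMP_SAD, FF_CMP_HADAMARD */

        /* Index 0 is the 16x16 variant, index 1 the 8x8 one; some metrics
         * (e.g. nsse) read fields of the MpegEncContext passed in. */
        return mecc.me_cmp[0](s, cur, ref, stride, 16);
    }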
... ...
@@ -22,7 +22,6 @@
22 22
 #ifndef AVCODEC_PRORESDEC_H
23 23
 #define AVCODEC_PRORESDEC_H
24 24
 
25
-#include "dsputil.h"
26 25
 #include "blockdsp.h"
27 26
 #include "proresdsp.h"
28 27
 
... ...
@@ -36,7 +35,6 @@ typedef struct {
36 36
 } SliceContext;
37 37
 
38 38
 typedef struct {
39
-    DSPContext dsp;
40 39
     BlockDSPContext bdsp;
41 40
     ProresDSPContext prodsp;
42 41
     AVFrame *frame;
... ...
@@ -50,7 +50,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
50 50
 
51 51
     avctx->bits_per_raw_sample = 10;
52 52
 
53
-    ff_dsputil_init(&ctx->dsp, avctx);
54 53
     ff_blockdsp_init(&ctx->bdsp, avctx);
55 54
     ff_proresdsp_init(&ctx->prodsp, avctx);
56 55
 
... ...
@@ -24,7 +24,6 @@
24 24
 #define AVCODEC_PRORESDSP_H
25 25
 
26 26
 #include <stdint.h>
27
-#include "dsputil.h"
28 27
 
29 28
 #define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder
30 29
 
... ...
@@ -31,7 +31,6 @@
31 31
 #include "internal.h"
32 32
 #include "put_bits.h"
33 33
 #include "bytestream.h"
34
-#include "dsputil.h"
35 34
 #include "fdctdsp.h"
36 35
 
37 36
 #define DEFAULT_SLICE_MB_WIDTH 8
... ...
@@ -22,7 +22,7 @@
22 22
 #include "libavutil/log.h"
23 23
 #include "libavutil/opt.h"
24 24
 #include "avcodec.h"
25
-#include "dsputil.h"
25
+#include "me_cmp.h"
26 26
 #include "snow_dwt.h"
27 27
 #include "internal.h"
28 28
 #include "snow.h"
... ...
@@ -435,7 +435,7 @@ av_cold int ff_snow_common_init(AVCodecContext *avctx){
435 435
     s->avctx= avctx;
436 436
     s->max_ref_frames=1; //just make sure it's not an invalid value in case of no initial keyframe
437 437
 
438
-    ff_dsputil_init(&s->dsp, avctx);
438
+    ff_me_cmp_init(&s->mecc, avctx);
439 439
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
440 440
     ff_videodsp_init(&s->vdsp, 8);
441 441
     ff_dwt_init(&s->dwt);
... ...
@@ -22,8 +22,8 @@
22 22
 #ifndef AVCODEC_SNOW_H
23 23
 #define AVCODEC_SNOW_H
24 24
 
25
-#include "dsputil.h"
26 25
 #include "hpeldsp.h"
26
+#include "me_cmp.h"
27 27
 #include "qpeldsp.h"
28 28
 #include "snow_dwt.h"
29 29
 
... ...
@@ -110,7 +110,7 @@ typedef struct SnowContext{
110 110
     AVClass *class;
111 111
     AVCodecContext *avctx;
112 112
     RangeCoder c;
113
-    DSPContext dsp;
113
+    MECmpContext mecc;
114 114
     HpelDSPContext hdsp;
115 115
     QpelDSPContext qdsp;
116 116
     VideoDSPContext vdsp;
... ...
@@ -22,7 +22,7 @@
22 22
 #include "libavutil/attributes.h"
23 23
 #include "libavutil/avassert.h"
24 24
 #include "libavutil/common.h"
25
-#include "dsputil.h"
25
+#include "me_cmp.h"
26 26
 #include "snow_dwt.h"
27 27
 
28 28
 int ff_slice_buffer_init(slice_buffer *buf, int line_count,
... ...
@@ -844,7 +844,7 @@ int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line
844 844
     return w_c(v, pix1, pix2, line_size, 32, h, 0);
845 845
 }
846 846
 
847
-void ff_dsputil_init_dwt(DSPContext *c)
847
+void ff_dsputil_init_dwt(MECmpContext *c)
848 848
 {
849 849
     c->w53[0] = w53_16_c;
850 850
     c->w53[1] = w53_8_c;
... ...
@@ -22,7 +22,6 @@
22 22
 #include "libavutil/log.h"
23 23
 #include "libavutil/opt.h"
24 24
 #include "avcodec.h"
25
-#include "dsputil.h"
26 25
 #include "snow_dwt.h"
27 26
 #include "internal.h"
28 27
 #include "snow.h"
... ...
@@ -22,7 +22,6 @@
22 22
 #include "libavutil/log.h"
23 23
 #include "libavutil/opt.h"
24 24
 #include "avcodec.h"
25
-#include "dsputil.h"
26 25
 #include "internal.h"
27 26
 #include "snow_dwt.h"
28 27
 #include "snow.h"
... ...
@@ -121,8 +120,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
121 121
     }
122 122
     avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
123 123
 
124
-    ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
125
-    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
124
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp, s->avctx->me_cmp);
125
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, s->avctx->me_sub_cmp);
126 126
 
127 127
     s->input_picture = av_frame_alloc();
128 128
     if (!s->input_picture)
... ...
@@ -669,12 +668,12 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uin
669 669
             distortion = 0;
670 670
             for(i=0; i<4; i++){
671 671
                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
672
-                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
672
+                distortion += s->mecc.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
673 673
             }
674 674
         }
675 675
     }else{
676 676
         av_assert2(block_w==8);
677
-        distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
677
+        distortion = s->mecc.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
678 678
     }
679 679
 
680 680
     if(plane_index==0){
... ...
@@ -738,7 +737,7 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
738 738
         }
739 739
 
740 740
         av_assert1(block_w== 8 || block_w==16);
741
-        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
741
+        distortion += s->mecc.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
742 742
     }
743 743
 
744 744
     if(plane_index==0){
... ...
@@ -1660,12 +1659,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
1660 1660
         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
1661 1661
         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
1662 1662
 
1663
-        s->m.dsp= s->dsp; //move
1663
+        s->m.mecc= s->mecc; //move
1664 1664
         s->m.qdsp= s->qdsp; //move
1665 1665
         s->m.hdsp = s->hdsp;
1666 1666
         ff_init_me(&s->m);
1667 1667
         s->hdsp = s->m.hdsp;
1668
-        s->dsp= s->m.dsp;
1668
+        s->mecc= s->m.mecc;
1669 1669
     }
1670 1670
 
1671 1671
     if(s->pass1_rc){
... ...
@@ -27,8 +27,8 @@
27 27
  */
28 28
 
29 29
 #include "avcodec.h"
30
-#include "dsputil.h"
31 30
 #include "hpeldsp.h"
31
+#include "me_cmp.h"
32 32
 #include "mpegvideo.h"
33 33
 #include "h263.h"
34 34
 #include "internal.h"
... ...
@@ -314,7 +314,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
314 314
         s->m.current_picture.motion_val[0]   = s->motion_val8[plane] + 2;
315 315
         s->m.p_mv_table                      = s->motion_val16[plane] +
316 316
                                                s->m.mb_stride + 1;
317
-        s->m.dsp                             = s->dsp; // move
317
+        s->m.mecc                            = s->mecc; // move
318 318
         ff_init_me(&s->m);
319 319
 
320 320
         s->m.me.dia_size      = s->avctx->dia_size;
... ...
@@ -437,8 +437,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
437 437
                     best      = score[1] <= score[0];
438 438
 
439 439
                     vlc       = ff_svq1_block_type_vlc[SVQ1_BLOCK_SKIP];
440
-                    score[2]  = s->dsp.sse[0](NULL, src + 16 * x, ref,
441
-                                              stride, 16);
440
+                    score[2]  = s->mecc.sse[0](NULL, src + 16 * x, ref,
441
+                                               stride, 16);
442 442
                     score[2] += vlc[1] * lambda;
443 443
                     if (score[2] < score[best] && mx == 0 && my == 0) {
444 444
                         best = 2;
... ...
@@ -515,8 +515,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
515 515
     SVQ1EncContext *const s = avctx->priv_data;
516 516
     int ret;
517 517
 
518
-    ff_dsputil_init(&s->dsp, avctx);
519 518
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
519
+    ff_me_cmp_init(&s->mecc, avctx);
520 520
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
521 521
 
522 522
     avctx->coded_frame = av_frame_alloc();
... ...
@@ -25,9 +25,9 @@
25 25
 
26 26
 #include "libavutil/frame.h"
27 27
 #include "avcodec.h"
28
-#include "dsputil.h"
29 28
 #include "get_bits.h"
30 29
 #include "hpeldsp.h"
30
+#include "me_cmp.h"
31 31
 #include "mpegvideo.h"
32 32
 #include "put_bits.h"
33 33
 
... ...
@@ -37,7 +37,7 @@ typedef struct SVQ1EncContext {
37 37
      * of MpegEncContext, so this will be removed then. */
38 38
     MpegEncContext m;
39 39
     AVCodecContext *avctx;
40
-    DSPContext dsp;
40
+    MECmpContext mecc;
41 41
     HpelDSPContext hdsp;
42 42
     AVFrame *current_picture;
43 43
     AVFrame *last_picture;
... ...
@@ -41,8 +41,8 @@
41 41
 #include "libavutil/samplefmt.h"
42 42
 #include "libavutil/dict.h"
43 43
 #include "avcodec.h"
44
-#include "dsputil.h"
45 44
 #include "libavutil/opt.h"
45
+#include "me_cmp.h"
46 46
 #include "mpegvideo.h"
47 47
 #include "thread.h"
48 48
 #include "frame_thread_encoder.h"
... ...
@@ -195,8 +195,8 @@ static av_cold void avcodec_init(void)
195 195
         return;
196 196
     initialized = 1;
197 197
 
198
-    if (CONFIG_DSPUTIL)
199
-        ff_dsputil_static_init();
198
+    if (CONFIG_ME_CMP)
199
+        ff_me_cmp_init_static();
200 200
 }
201 201
 
202 202
 int av_codec_is_encoder(const AVCodec *codec)
... ...
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
6 6
 OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
7 7
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
8 8
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
9
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
10 9
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
11 10
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
12 11
 OBJS-$(CONFIG_FLAC_DECODER)            += x86/flacdsp_init.o
... ...
@@ -24,6 +23,7 @@ OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
24 24
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
25 25
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
26 26
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
27
+OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
27 28
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
28 29
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
29 30
                                           x86/mpegvideodsp.o
... ...
@@ -80,7 +80,6 @@ YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
80 80
 YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
81 81
                                           x86/dwt_yasm.o
82 82
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
83
-YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputilenc.o
84 83
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
85 84
 YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
86 85
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
... ...
@@ -107,6 +106,7 @@ YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
107 107
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
108 108
 YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
109 109
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
110
+YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
110 111
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
111 112
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
112 113
 YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
113 114
deleted file mode 100644
... ...
@@ -1,845 +0,0 @@
1
-/*
2
- * MMX optimized DSP utils
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
- *
8
- * This file is part of FFmpeg.
9
- *
10
- * FFmpeg is free software; you can redistribute it and/or
11
- * modify it under the terms of the GNU Lesser General Public
12
- * License as published by the Free Software Foundation; either
13
- * version 2.1 of the License, or (at your option) any later version.
14
- *
15
- * FFmpeg is distributed in the hope that it will be useful,
16
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
- * Lesser General Public License for more details.
19
- *
20
- * You should have received a copy of the GNU Lesser General Public
21
- * License along with FFmpeg; if not, write to the Free Software
22
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
- */
24
-
25
-#include "libavutil/attributes.h"
26
-#include "libavutil/cpu.h"
27
-#include "libavutil/x86/asm.h"
28
-#include "libavutil/x86/cpu.h"
29
-#include "libavcodec/dsputil.h"
30
-#include "libavcodec/mpegvideo.h"
31
-
32
-int ff_sum_abs_dctelem_mmx(int16_t *block);
33
-int ff_sum_abs_dctelem_mmxext(int16_t *block);
34
-int ff_sum_abs_dctelem_sse2(int16_t *block);
35
-int ff_sum_abs_dctelem_ssse3(int16_t *block);
36
-int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
37
-                int line_size, int h);
38
-int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39
-                 int line_size, int h);
40
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
41
-                  int line_size, int h);
42
-int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
43
-int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
44
-
45
-#define hadamard_func(cpu)                                              \
46
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
47
-                                  uint8_t *src2, int stride, int h);    \
48
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
49
-                                    uint8_t *src2, int stride, int h);
50
-
51
-hadamard_func(mmx)
52
-hadamard_func(mmxext)
53
-hadamard_func(sse2)
54
-hadamard_func(ssse3)
55
-
56
-#if HAVE_YASM
57
-static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
58
-                      int line_size, int h)
59
-{
60
-    int score1, score2;
61
-
62
-    if (c)
63
-        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
64
-    else
65
-        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
66
-    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
67
-           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
68
-
69
-    if (c)
70
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
71
-    else
72
-        return score1 + FFABS(score2) * 8;
73
-}
74
-
75
-static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
76
-                     int line_size, int h)
77
-{
78
-    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
79
-    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
80
-                 ff_hf_noise8_mmx(pix2, line_size, h);
81
-
82
-    if (c)
83
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
84
-    else
85
-        return score1 + FFABS(score2) * 8;
86
-}
87
-
88
-#endif /* HAVE_YASM */
89
-
90
-#if HAVE_INLINE_ASM
91
-
92
-static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
93
-                            int line_size, int h)
94
-{
95
-    int tmp;
96
-
97
-    av_assert2((((int) pix) & 7) == 0);
98
-    av_assert2((line_size & 7) == 0);
99
-
100
-#define SUM(in0, in1, out0, out1)               \
101
-    "movq (%0), %%mm2\n"                        \
102
-    "movq 8(%0), %%mm3\n"                       \
103
-    "add %2,%0\n"                               \
104
-    "movq %%mm2, " #out0 "\n"                   \
105
-    "movq %%mm3, " #out1 "\n"                   \
106
-    "psubusb " #in0 ", %%mm2\n"                 \
107
-    "psubusb " #in1 ", %%mm3\n"                 \
108
-    "psubusb " #out0 ", " #in0 "\n"             \
109
-    "psubusb " #out1 ", " #in1 "\n"             \
110
-    "por %%mm2, " #in0 "\n"                     \
111
-    "por %%mm3, " #in1 "\n"                     \
112
-    "movq " #in0 ", %%mm2\n"                    \
113
-    "movq " #in1 ", %%mm3\n"                    \
114
-    "punpcklbw %%mm7, " #in0 "\n"               \
115
-    "punpcklbw %%mm7, " #in1 "\n"               \
116
-    "punpckhbw %%mm7, %%mm2\n"                  \
117
-    "punpckhbw %%mm7, %%mm3\n"                  \
118
-    "paddw " #in1 ", " #in0 "\n"                \
119
-    "paddw %%mm3, %%mm2\n"                      \
120
-    "paddw %%mm2, " #in0 "\n"                   \
121
-    "paddw " #in0 ", %%mm6\n"
122
-
123
-
124
-    __asm__ volatile (
125
-        "movl    %3, %%ecx\n"
126
-        "pxor %%mm6, %%mm6\n"
127
-        "pxor %%mm7, %%mm7\n"
128
-        "movq  (%0), %%mm0\n"
129
-        "movq 8(%0), %%mm1\n"
130
-        "add %2, %0\n"
131
-        "jmp 2f\n"
132
-        "1:\n"
133
-
134
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
135
-        "2:\n"
136
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
137
-
138
-        "subl $2, %%ecx\n"
139
-        "jnz 1b\n"
140
-
141
-        "movq  %%mm6, %%mm0\n"
142
-        "psrlq $32,   %%mm6\n"
143
-        "paddw %%mm6, %%mm0\n"
144
-        "movq  %%mm0, %%mm6\n"
145
-        "psrlq $16,   %%mm0\n"
146
-        "paddw %%mm6, %%mm0\n"
147
-        "movd  %%mm0, %1\n"
148
-        : "+r" (pix), "=r" (tmp)
149
-        : "r" ((x86_reg) line_size), "m" (h)
150
-        : "%ecx");
151
-
152
-    return tmp & 0xFFFF;
153
-}
154
-#undef SUM
155
-
156
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
157
-                               int line_size, int h)
158
-{
159
-    int tmp;
160
-
161
-    av_assert2((((int) pix) & 7) == 0);
162
-    av_assert2((line_size & 7) == 0);
163
-
164
-#define SUM(in0, in1, out0, out1)               \
165
-    "movq (%0), " #out0 "\n"                    \
166
-    "movq 8(%0), " #out1 "\n"                   \
167
-    "add %2, %0\n"                              \
168
-    "psadbw " #out0 ", " #in0 "\n"              \
169
-    "psadbw " #out1 ", " #in1 "\n"              \
170
-    "paddw " #in1 ", " #in0 "\n"                \
171
-    "paddw " #in0 ", %%mm6\n"
172
-
173
-    __asm__ volatile (
174
-        "movl %3, %%ecx\n"
175
-        "pxor %%mm6, %%mm6\n"
176
-        "pxor %%mm7, %%mm7\n"
177
-        "movq (%0), %%mm0\n"
178
-        "movq 8(%0), %%mm1\n"
179
-        "add %2, %0\n"
180
-        "jmp 2f\n"
181
-        "1:\n"
182
-
183
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
184
-        "2:\n"
185
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
186
-
187
-        "subl $2, %%ecx\n"
188
-        "jnz 1b\n"
189
-
190
-        "movd %%mm6, %1\n"
191
-        : "+r" (pix), "=r" (tmp)
192
-        : "r" ((x86_reg) line_size), "m" (h)
193
-        : "%ecx");
194
-
195
-    return tmp;
196
-}
197
-#undef SUM
198
-
199
-static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
200
-                      int line_size, int h)
201
-{
202
-    int tmp;
203
-
204
-    av_assert2((((int) pix1) & 7) == 0);
205
-    av_assert2((((int) pix2) & 7) == 0);
206
-    av_assert2((line_size & 7) == 0);
207
-
208
-#define SUM(in0, in1, out0, out1)       \
209
-    "movq (%0), %%mm2\n"                \
210
-    "movq (%1), " #out0 "\n"            \
211
-    "movq 8(%0), %%mm3\n"               \
212
-    "movq 8(%1), " #out1 "\n"           \
213
-    "add %3, %0\n"                      \
214
-    "add %3, %1\n"                      \
215
-    "psubb " #out0 ", %%mm2\n"          \
216
-    "psubb " #out1 ", %%mm3\n"          \
217
-    "pxor %%mm7, %%mm2\n"               \
218
-    "pxor %%mm7, %%mm3\n"               \
219
-    "movq %%mm2, " #out0 "\n"           \
220
-    "movq %%mm3, " #out1 "\n"           \
221
-    "psubusb " #in0 ", %%mm2\n"         \
222
-    "psubusb " #in1 ", %%mm3\n"         \
223
-    "psubusb " #out0 ", " #in0 "\n"     \
224
-    "psubusb " #out1 ", " #in1 "\n"     \
225
-    "por %%mm2, " #in0 "\n"             \
226
-    "por %%mm3, " #in1 "\n"             \
227
-    "movq " #in0 ", %%mm2\n"            \
228
-    "movq " #in1 ", %%mm3\n"            \
229
-    "punpcklbw %%mm7, " #in0 "\n"       \
230
-    "punpcklbw %%mm7, " #in1 "\n"       \
231
-    "punpckhbw %%mm7, %%mm2\n"          \
232
-    "punpckhbw %%mm7, %%mm3\n"          \
233
-    "paddw " #in1 ", " #in0 "\n"        \
234
-    "paddw %%mm3, %%mm2\n"              \
235
-    "paddw %%mm2, " #in0 "\n"           \
236
-    "paddw " #in0 ", %%mm6\n"
237
-
238
-
239
-    __asm__ volatile (
240
-        "movl %4, %%ecx\n"
241
-        "pxor %%mm6, %%mm6\n"
242
-        "pcmpeqw %%mm7, %%mm7\n"
243
-        "psllw $15, %%mm7\n"
244
-        "packsswb %%mm7, %%mm7\n"
245
-        "movq (%0), %%mm0\n"
246
-        "movq (%1), %%mm2\n"
247
-        "movq 8(%0), %%mm1\n"
248
-        "movq 8(%1), %%mm3\n"
249
-        "add %3, %0\n"
250
-        "add %3, %1\n"
251
-        "psubb %%mm2, %%mm0\n"
252
-        "psubb %%mm3, %%mm1\n"
253
-        "pxor %%mm7, %%mm0\n"
254
-        "pxor %%mm7, %%mm1\n"
255
-        "jmp 2f\n"
256
-        "1:\n"
257
-
258
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
259
-        "2:\n"
260
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
261
-
262
-        "subl $2, %%ecx\n"
263
-        "jnz 1b\n"
264
-
265
-        "movq %%mm6, %%mm0\n"
266
-        "psrlq $32, %%mm6\n"
267
-        "paddw %%mm6, %%mm0\n"
268
-        "movq %%mm0, %%mm6\n"
269
-        "psrlq $16, %%mm0\n"
270
-        "paddw %%mm6, %%mm0\n"
271
-        "movd %%mm0, %2\n"
272
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
273
-        : "r" ((x86_reg) line_size), "m" (h)
274
-        : "%ecx");
275
-
276
-    return tmp & 0x7FFF;
277
-}
278
-#undef SUM
279
-
280
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
281
-                         int line_size, int h)
282
-{
283
-    int tmp;
284
-
285
-    av_assert2((((int) pix1) & 7) == 0);
286
-    av_assert2((((int) pix2) & 7) == 0);
287
-    av_assert2((line_size & 7) == 0);
288
-
289
-#define SUM(in0, in1, out0, out1)               \
290
-    "movq (%0), " #out0 "\n"                    \
291
-    "movq (%1), %%mm2\n"                        \
292
-    "movq 8(%0), " #out1 "\n"                   \
293
-    "movq 8(%1), %%mm3\n"                       \
294
-    "add %3, %0\n"                              \
295
-    "add %3, %1\n"                              \
296
-    "psubb %%mm2, " #out0 "\n"                  \
297
-    "psubb %%mm3, " #out1 "\n"                  \
298
-    "pxor %%mm7, " #out0 "\n"                   \
299
-    "pxor %%mm7, " #out1 "\n"                   \
300
-    "psadbw " #out0 ", " #in0 "\n"              \
301
-    "psadbw " #out1 ", " #in1 "\n"              \
302
-    "paddw " #in1 ", " #in0 "\n"                \
303
-    "paddw " #in0 ", %%mm6\n    "
304
-
305
-    __asm__ volatile (
306
-        "movl %4, %%ecx\n"
307
-        "pxor %%mm6, %%mm6\n"
308
-        "pcmpeqw %%mm7, %%mm7\n"
309
-        "psllw $15, %%mm7\n"
310
-        "packsswb %%mm7, %%mm7\n"
311
-        "movq (%0), %%mm0\n"
312
-        "movq (%1), %%mm2\n"
313
-        "movq 8(%0), %%mm1\n"
314
-        "movq 8(%1), %%mm3\n"
315
-        "add %3, %0\n"
316
-        "add %3, %1\n"
317
-        "psubb %%mm2, %%mm0\n"
318
-        "psubb %%mm3, %%mm1\n"
319
-        "pxor %%mm7, %%mm0\n"
320
-        "pxor %%mm7, %%mm1\n"
321
-        "jmp 2f\n"
322
-        "1:\n"
323
-
324
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
325
-        "2:\n"
326
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
327
-
328
-        "subl $2, %%ecx\n"
329
-        "jnz 1b\n"
330
-
331
-        "movd %%mm6, %2\n"
332
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
333
-        : "r" ((x86_reg) line_size), "m" (h)
334
-        : "%ecx");
335
-
336
-    return tmp;
337
-}
338
-#undef SUM
339
-
340
-
341
-
342
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
343
-    0x0000000000000000ULL,
344
-    0x0001000100010001ULL,
345
-    0x0002000200020002ULL,
346
-};
347
-
348
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
349
-
350
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
351
-{
352
-    x86_reg len = -(x86_reg)stride * h;
353
-    __asm__ volatile (
354
-        ".p2align 4                     \n\t"
355
-        "1:                             \n\t"
356
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
357
-        "movq (%2, %%"REG_a"), %%mm2    \n\t"
358
-        "movq (%2, %%"REG_a"), %%mm4    \n\t"
359
-        "add %3, %%"REG_a"              \n\t"
360
-        "psubusb %%mm0, %%mm2           \n\t"
361
-        "psubusb %%mm4, %%mm0           \n\t"
362
-        "movq (%1, %%"REG_a"), %%mm1    \n\t"
363
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
364
-        "movq (%2, %%"REG_a"), %%mm5    \n\t"
365
-        "psubusb %%mm1, %%mm3           \n\t"
366
-        "psubusb %%mm5, %%mm1           \n\t"
367
-        "por %%mm2, %%mm0               \n\t"
368
-        "por %%mm1, %%mm3               \n\t"
369
-        "movq %%mm0, %%mm1              \n\t"
370
-        "movq %%mm3, %%mm2              \n\t"
371
-        "punpcklbw %%mm7, %%mm0         \n\t"
372
-        "punpckhbw %%mm7, %%mm1         \n\t"
373
-        "punpcklbw %%mm7, %%mm3         \n\t"
374
-        "punpckhbw %%mm7, %%mm2         \n\t"
375
-        "paddw %%mm1, %%mm0             \n\t"
376
-        "paddw %%mm3, %%mm2             \n\t"
377
-        "paddw %%mm2, %%mm0             \n\t"
378
-        "paddw %%mm0, %%mm6             \n\t"
379
-        "add %3, %%"REG_a"              \n\t"
380
-        " js 1b                         \n\t"
381
-        : "+a" (len)
382
-        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
383
-}
384
-
385
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
386
-                                 int stride, int h)
387
-{
388
-    __asm__ volatile (
389
-        ".p2align 4                     \n\t"
390
-        "1:                             \n\t"
391
-        "movq (%1), %%mm0               \n\t"
392
-        "movq (%1, %3), %%mm1           \n\t"
393
-        "psadbw (%2), %%mm0             \n\t"
394
-        "psadbw (%2, %3), %%mm1         \n\t"
395
-        "paddw %%mm0, %%mm6             \n\t"
396
-        "paddw %%mm1, %%mm6             \n\t"
397
-        "lea (%1,%3,2), %1              \n\t"
398
-        "lea (%2,%3,2), %2              \n\t"
399
-        "sub $2, %0                     \n\t"
400
-        " jg 1b                         \n\t"
401
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
402
-        : "r" ((x86_reg) stride));
403
-}
404
-
405
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
406
-                      int stride, int h)
407
-{
408
-    int ret;
409
-    __asm__ volatile (
410
-        "pxor %%xmm2, %%xmm2            \n\t"
411
-        ".p2align 4                     \n\t"
412
-        "1:                             \n\t"
413
-        "movdqu (%1), %%xmm0            \n\t"
414
-        "movdqu (%1, %4), %%xmm1        \n\t"
415
-        "psadbw (%2), %%xmm0            \n\t"
416
-        "psadbw (%2, %4), %%xmm1        \n\t"
417
-        "paddw %%xmm0, %%xmm2           \n\t"
418
-        "paddw %%xmm1, %%xmm2           \n\t"
419
-        "lea (%1,%4,2), %1              \n\t"
420
-        "lea (%2,%4,2), %2              \n\t"
421
-        "sub $2, %0                     \n\t"
422
-        " jg 1b                         \n\t"
423
-        "movhlps %%xmm2, %%xmm0         \n\t"
424
-        "paddw   %%xmm0, %%xmm2         \n\t"
425
-        "movd    %%xmm2, %3             \n\t"
426
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
427
-        : "r" ((x86_reg) stride));
428
-    return ret;
429
-}
430
-
431
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
432
-                                   int stride, int h)
433
-{
434
-    __asm__ volatile (
435
-        ".p2align 4                     \n\t"
436
-        "1:                             \n\t"
437
-        "movq (%1), %%mm0               \n\t"
438
-        "movq (%1, %3), %%mm1           \n\t"
439
-        "pavgb 1(%1), %%mm0             \n\t"
440
-        "pavgb 1(%1, %3), %%mm1         \n\t"
441
-        "psadbw (%2), %%mm0             \n\t"
442
-        "psadbw (%2, %3), %%mm1         \n\t"
443
-        "paddw %%mm0, %%mm6             \n\t"
444
-        "paddw %%mm1, %%mm6             \n\t"
445
-        "lea (%1,%3,2), %1              \n\t"
446
-        "lea (%2,%3,2), %2              \n\t"
447
-        "sub $2, %0                     \n\t"
448
-        " jg 1b                         \n\t"
449
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
450
-        : "r" ((x86_reg) stride));
451
-}
452
-
453
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
454
-                                   int stride, int h)
455
-{
456
-    __asm__ volatile (
457
-        "movq (%1), %%mm0               \n\t"
458
-        "add %3, %1                     \n\t"
459
-        ".p2align 4                     \n\t"
460
-        "1:                             \n\t"
461
-        "movq (%1), %%mm1               \n\t"
462
-        "movq (%1, %3), %%mm2           \n\t"
463
-        "pavgb %%mm1, %%mm0             \n\t"
464
-        "pavgb %%mm2, %%mm1             \n\t"
465
-        "psadbw (%2), %%mm0             \n\t"
466
-        "psadbw (%2, %3), %%mm1         \n\t"
467
-        "paddw %%mm0, %%mm6             \n\t"
468
-        "paddw %%mm1, %%mm6             \n\t"
469
-        "movq %%mm2, %%mm0              \n\t"
470
-        "lea (%1,%3,2), %1              \n\t"
471
-        "lea (%2,%3,2), %2              \n\t"
472
-        "sub $2, %0                     \n\t"
473
-        " jg 1b                         \n\t"
474
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
475
-        : "r" ((x86_reg) stride));
476
-}
477
-
478
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
479
-                                 int stride, int h)
480
-{
481
-    __asm__ volatile (
482
-        "movq "MANGLE(bone)", %%mm5     \n\t"
483
-        "movq (%1), %%mm0               \n\t"
484
-        "pavgb 1(%1), %%mm0             \n\t"
485
-        "add %3, %1                     \n\t"
486
-        ".p2align 4                     \n\t"
487
-        "1:                             \n\t"
488
-        "movq (%1), %%mm1               \n\t"
489
-        "movq (%1,%3), %%mm2            \n\t"
490
-        "pavgb 1(%1), %%mm1             \n\t"
491
-        "pavgb 1(%1,%3), %%mm2          \n\t"
492
-        "psubusb %%mm5, %%mm1           \n\t"
493
-        "pavgb %%mm1, %%mm0             \n\t"
494
-        "pavgb %%mm2, %%mm1             \n\t"
495
-        "psadbw (%2), %%mm0             \n\t"
496
-        "psadbw (%2,%3), %%mm1          \n\t"
497
-        "paddw %%mm0, %%mm6             \n\t"
498
-        "paddw %%mm1, %%mm6             \n\t"
499
-        "movq %%mm2, %%mm0              \n\t"
500
-        "lea (%1,%3,2), %1              \n\t"
501
-        "lea (%2,%3,2), %2              \n\t"
502
-        "sub $2, %0                     \n\t"
503
-        " jg 1b                         \n\t"
504
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
505
-        : "r" ((x86_reg) stride)
506
-          NAMED_CONSTRAINTS_ADD(bone));
507
-}
508
-
509
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
510
-                              int stride, int h)
511
-{
512
-    x86_reg len = -(x86_reg)stride * h;
513
-    __asm__ volatile (
514
-        ".p2align 4                     \n\t"
515
-        "1:                             \n\t"
516
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
517
-        "movq (%2, %%"REG_a"), %%mm1    \n\t"
518
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
519
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
520
-        "punpcklbw %%mm7, %%mm0         \n\t"
521
-        "punpcklbw %%mm7, %%mm1         \n\t"
522
-        "punpckhbw %%mm7, %%mm2         \n\t"
523
-        "punpckhbw %%mm7, %%mm3         \n\t"
524
-        "paddw %%mm0, %%mm1             \n\t"
525
-        "paddw %%mm2, %%mm3             \n\t"
526
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
527
-        "movq (%3, %%"REG_a"), %%mm2    \n\t"
528
-        "paddw %%mm5, %%mm1             \n\t"
529
-        "paddw %%mm5, %%mm3             \n\t"
530
-        "psrlw $1, %%mm1                \n\t"
531
-        "psrlw $1, %%mm3                \n\t"
532
-        "packuswb %%mm3, %%mm1          \n\t"
533
-        "psubusb %%mm1, %%mm4           \n\t"
534
-        "psubusb %%mm2, %%mm1           \n\t"
535
-        "por %%mm4, %%mm1               \n\t"
536
-        "movq %%mm1, %%mm0              \n\t"
537
-        "punpcklbw %%mm7, %%mm0         \n\t"
538
-        "punpckhbw %%mm7, %%mm1         \n\t"
539
-        "paddw %%mm1, %%mm0             \n\t"
540
-        "paddw %%mm0, %%mm6             \n\t"
541
-        "add %4, %%"REG_a"              \n\t"
542
-        " js 1b                         \n\t"
543
-        : "+a" (len)
544
-        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
545
-          "r" ((x86_reg) stride));
546
-}
547
-
548
-static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
549
-{
550
-    x86_reg len = -(x86_reg)stride * h;
551
-    __asm__ volatile (
552
-        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
553
-        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
554
-        "movq %%mm0, %%mm1              \n\t"
555
-        "movq %%mm2, %%mm3              \n\t"
556
-        "punpcklbw %%mm7, %%mm0         \n\t"
557
-        "punpckhbw %%mm7, %%mm1         \n\t"
558
-        "punpcklbw %%mm7, %%mm2         \n\t"
559
-        "punpckhbw %%mm7, %%mm3         \n\t"
560
-        "paddw %%mm2, %%mm0             \n\t"
561
-        "paddw %%mm3, %%mm1             \n\t"
562
-        ".p2align 4                     \n\t"
563
-        "1:                             \n\t"
564
-        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
565
-        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
566
-        "movq %%mm2, %%mm3              \n\t"
567
-        "movq %%mm4, %%mm5              \n\t"
568
-        "punpcklbw %%mm7, %%mm2         \n\t"
569
-        "punpckhbw %%mm7, %%mm3         \n\t"
570
-        "punpcklbw %%mm7, %%mm4         \n\t"
571
-        "punpckhbw %%mm7, %%mm5         \n\t"
572
-        "paddw %%mm4, %%mm2             \n\t"
573
-        "paddw %%mm5, %%mm3             \n\t"
574
-        "movq %5, %%mm5                 \n\t"
575
-        "paddw %%mm2, %%mm0             \n\t"
576
-        "paddw %%mm3, %%mm1             \n\t"
577
-        "paddw %%mm5, %%mm0             \n\t"
578
-        "paddw %%mm5, %%mm1             \n\t"
579
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
580
-        "movq (%3, %%"REG_a"), %%mm5    \n\t"
581
-        "psrlw $2, %%mm0                \n\t"
582
-        "psrlw $2, %%mm1                \n\t"
583
-        "packuswb %%mm1, %%mm0          \n\t"
584
-        "psubusb %%mm0, %%mm4           \n\t"
585
-        "psubusb %%mm5, %%mm0           \n\t"
586
-        "por %%mm4, %%mm0               \n\t"
587
-        "movq %%mm0, %%mm4              \n\t"
588
-        "punpcklbw %%mm7, %%mm0         \n\t"
589
-        "punpckhbw %%mm7, %%mm4         \n\t"
590
-        "paddw %%mm0, %%mm6             \n\t"
591
-        "paddw %%mm4, %%mm6             \n\t"
592
-        "movq  %%mm2, %%mm0             \n\t"
593
-        "movq  %%mm3, %%mm1             \n\t"
594
-        "add %4, %%"REG_a"              \n\t"
595
-        " js 1b                         \n\t"
596
-        : "+a" (len)
597
-        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
598
-          "r" ((x86_reg) stride), "m" (round_tab[2]));
599
-}
600
-
601
-static inline int sum_mmx(void)
602
-{
603
-    int ret;
604
-    __asm__ volatile (
605
-        "movq %%mm6, %%mm0              \n\t"
606
-        "psrlq $32, %%mm6               \n\t"
607
-        "paddw %%mm0, %%mm6             \n\t"
608
-        "movq %%mm6, %%mm0              \n\t"
609
-        "psrlq $16, %%mm6               \n\t"
610
-        "paddw %%mm0, %%mm6             \n\t"
611
-        "movd %%mm6, %0                 \n\t"
612
-        : "=r" (ret));
613
-    return ret & 0xFFFF;
614
-}
615
-
616
-static inline int sum_mmxext(void)
617
-{
618
-    int ret;
619
-    __asm__ volatile (
620
-        "movd %%mm6, %0                 \n\t"
621
-        : "=r" (ret));
622
-    return ret;
623
-}
624
-
625
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
626
-{
627
-    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
628
-}
629
-
630
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
631
-{
632
-    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
633
-}
634
-
635
-#define PIX_SAD(suf)                                                    \
636
-static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
637
-                        uint8_t *blk1, int stride, int h)               \
638
-{                                                                       \
639
-    av_assert2(h == 8);                                                     \
640
-    __asm__ volatile (                                                  \
641
-        "pxor %%mm7, %%mm7     \n\t"                                    \
642
-        "pxor %%mm6, %%mm6     \n\t"                                    \
643
-        :);                                                             \
644
-                                                                        \
645
-    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
646
-                                                                        \
647
-    return sum_ ## suf();                                               \
648
-}                                                                       \
649
-                                                                        \
650
-static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
651
-                           uint8_t *blk1, int stride, int h)            \
652
-{                                                                       \
653
-    av_assert2(h == 8);                                                     \
654
-    __asm__ volatile (                                                  \
655
-        "pxor %%mm7, %%mm7     \n\t"                                    \
656
-        "pxor %%mm6, %%mm6     \n\t"                                    \
657
-        "movq %0, %%mm5        \n\t"                                    \
658
-        :: "m" (round_tab[1]));                                         \
659
-                                                                        \
660
-    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
661
-                                                                        \
662
-    return sum_ ## suf();                                               \
663
-}                                                                       \
664
-                                                                        \
665
-static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
666
-                           uint8_t *blk1, int stride, int h)            \
667
-{                                                                       \
668
-    av_assert2(h == 8);                                                     \
669
-    __asm__ volatile (                                                  \
670
-        "pxor %%mm7, %%mm7     \n\t"                                    \
671
-        "pxor %%mm6, %%mm6     \n\t"                                    \
672
-        "movq %0, %%mm5        \n\t"                                    \
673
-        :: "m" (round_tab[1]));                                         \
674
-                                                                        \
675
-    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
676
-                                                                        \
677
-    return sum_ ## suf();                                               \
678
-}                                                                       \
679
-                                                                        \
680
-static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
681
-                            uint8_t *blk1, int stride, int h)           \
682
-{                                                                       \
683
-    av_assert2(h == 8);                                                     \
684
-    __asm__ volatile (                                                  \
685
-        "pxor %%mm7, %%mm7     \n\t"                                    \
686
-        "pxor %%mm6, %%mm6     \n\t"                                    \
687
-        ::);                                                            \
688
-                                                                        \
689
-    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
690
-                                                                        \
691
-    return sum_ ## suf();                                               \
692
-}                                                                       \
693
-                                                                        \
694
-static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
695
-                         uint8_t *blk1, int stride, int h)              \
696
-{                                                                       \
697
-    __asm__ volatile (                                                  \
698
-        "pxor %%mm7, %%mm7     \n\t"                                    \
699
-        "pxor %%mm6, %%mm6     \n\t"                                    \
700
-        :);                                                             \
701
-                                                                        \
702
-    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
703
-    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
704
-                                                                        \
705
-    return sum_ ## suf();                                               \
706
-}                                                                       \
707
-                                                                        \
708
-static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
709
-                            uint8_t *blk1, int stride, int h)           \
710
-{                                                                       \
711
-    __asm__ volatile (                                                  \
712
-        "pxor %%mm7, %%mm7     \n\t"                                    \
713
-        "pxor %%mm6, %%mm6     \n\t"                                    \
714
-        "movq %0, %%mm5        \n\t"                                    \
715
-        :: "m" (round_tab[1]));                                         \
716
-                                                                        \
717
-    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
718
-    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
719
-                                                                        \
720
-    return sum_ ## suf();                                               \
721
-}                                                                       \
722
-                                                                        \
723
-static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
724
-                            uint8_t *blk1, int stride, int h)           \
725
-{                                                                       \
726
-    __asm__ volatile (                                                  \
727
-        "pxor %%mm7, %%mm7     \n\t"                                    \
728
-        "pxor %%mm6, %%mm6     \n\t"                                    \
729
-        "movq %0, %%mm5        \n\t"                                    \
730
-        :: "m" (round_tab[1]));                                         \
731
-                                                                        \
732
-    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
733
-    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
734
-                                                                        \
735
-    return sum_ ## suf();                                               \
736
-}                                                                       \
737
-                                                                        \
738
-static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
739
-                             uint8_t *blk1, int stride, int h)          \
740
-{                                                                       \
741
-    __asm__ volatile (                                                  \
742
-        "pxor %%mm7, %%mm7     \n\t"                                    \
743
-        "pxor %%mm6, %%mm6     \n\t"                                    \
744
-        ::);                                                            \
745
-                                                                        \
746
-    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
747
-    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
748
-                                                                        \
749
-    return sum_ ## suf();                                               \
750
-}                                                                       \
751
-
752
-PIX_SAD(mmx)
753
-PIX_SAD(mmxext)
754
-
755
-#endif /* HAVE_INLINE_ASM */
756
-
757
-av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
758
-{
759
-    int cpu_flags = av_get_cpu_flags();
760
-
761
-#if HAVE_INLINE_ASM
762
-    if (INLINE_MMX(cpu_flags)) {
763
-        c->pix_abs[0][0] = sad16_mmx;
764
-        c->pix_abs[0][1] = sad16_x2_mmx;
765
-        c->pix_abs[0][2] = sad16_y2_mmx;
766
-        c->pix_abs[0][3] = sad16_xy2_mmx;
767
-        c->pix_abs[1][0] = sad8_mmx;
768
-        c->pix_abs[1][1] = sad8_x2_mmx;
769
-        c->pix_abs[1][2] = sad8_y2_mmx;
770
-        c->pix_abs[1][3] = sad8_xy2_mmx;
771
-
772
-        c->sad[0] = sad16_mmx;
773
-        c->sad[1] = sad8_mmx;
774
-
775
-        c->vsad[4] = vsad_intra16_mmx;
776
-
777
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
778
-            c->vsad[0] = vsad16_mmx;
779
-        }
780
-    }
781
-
782
-    if (INLINE_MMXEXT(cpu_flags)) {
783
-        c->vsad[4] = vsad_intra16_mmxext;
784
-
785
-        c->pix_abs[0][0] = sad16_mmxext;
786
-        c->pix_abs[1][0] = sad8_mmxext;
787
-
788
-        c->sad[0] = sad16_mmxext;
789
-        c->sad[1] = sad8_mmxext;
790
-
791
-        c->pix_abs[0][1] = sad16_x2_mmxext;
792
-        c->pix_abs[0][2] = sad16_y2_mmxext;
793
-        c->pix_abs[1][1] = sad8_x2_mmxext;
794
-        c->pix_abs[1][2] = sad8_y2_mmxext;
795
-
796
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
797
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
798
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
799
-
800
-            c->vsad[0] = vsad16_mmxext;
801
-        }
802
-    }
803
-
804
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
805
-        c->sad[0] = sad16_sse2;
806
-    }
807
-
808
-#endif /* HAVE_INLINE_ASM */
809
-
810
-    if (EXTERNAL_MMX(cpu_flags)) {
811
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
812
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
813
-        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
814
-        c->sse[0]            = ff_sse16_mmx;
815
-        c->sse[1]            = ff_sse8_mmx;
816
-#if HAVE_YASM
817
-        c->nsse[0]           = nsse16_mmx;
818
-        c->nsse[1]           = nsse8_mmx;
819
-#endif
820
-    }
821
-
822
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
823
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
824
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
825
-        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
826
-    }
827
-
828
-    if (EXTERNAL_SSE2(cpu_flags)) {
829
-        c->sse[0] = ff_sse16_sse2;
830
-        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
831
-
832
-#if HAVE_ALIGNED_STACK
833
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
834
-        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
835
-#endif
836
-    }
837
-
838
-    if (EXTERNAL_SSSE3(cpu_flags)) {
839
-        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
840
-#if HAVE_ALIGNED_STACK
841
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
842
-        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
843
-#endif
844
-    }
845
-}
846 1
deleted file mode 100644
... ...
@@ -1,467 +0,0 @@
1
-;*****************************************************************************
2
-;* MMX optimized DSP utils
3
-;*****************************************************************************
4
-;* Copyright (c) 2000, 2001 Fabrice Bellard
5
-;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6
-;*
7
-;* This file is part of FFmpeg.
8
-;*
9
-;* FFmpeg is free software; you can redistribute it and/or
10
-;* modify it under the terms of the GNU Lesser General Public
11
-;* License as published by the Free Software Foundation; either
12
-;* version 2.1 of the License, or (at your option) any later version.
13
-;*
14
-;* FFmpeg is distributed in the hope that it will be useful,
15
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
-;* Lesser General Public License for more details.
18
-;*
19
-;* You should have received a copy of the GNU Lesser General Public
20
-;* License along with FFmpeg; if not, write to the Free Software
21
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
-;*****************************************************************************
23
-
24
-%include "libavutil/x86/x86util.asm"
25
-
26
-SECTION .text
27
-
28
-%macro DIFF_PIXELS_1 4
29
-    movh            %1, %3
30
-    movh            %2, %4
31
-    punpcklbw       %2, %1
32
-    punpcklbw       %1, %1
33
-    psubw           %1, %2
34
-%endmacro
35
-
36
-; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
37
-; %6=temporary storage location
38
-; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
39
-%macro DIFF_PIXELS_8 6
40
-    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
41
-    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
42
-    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
43
-    add             %1, %5
44
-    add             %2, %5
45
-    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
46
-    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
47
-    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
48
-    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
49
-%ifdef m8
50
-    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
51
-%else
52
-    mova          [%6], m0
53
-    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
54
-    mova            m0, [%6]
55
-%endif
56
-    sub             %1, %5
57
-    sub             %2, %5
58
-%endmacro
59
-
60
-%macro HADAMARD8 0
61
-    SUMSUB_BADC       w, 0, 1, 2, 3
62
-    SUMSUB_BADC       w, 4, 5, 6, 7
63
-    SUMSUB_BADC       w, 0, 2, 1, 3
64
-    SUMSUB_BADC       w, 4, 6, 5, 7
65
-    SUMSUB_BADC       w, 0, 4, 1, 5
66
-    SUMSUB_BADC       w, 2, 6, 3, 7
67
-%endmacro
68
-
69
-%macro ABS1_SUM 3
70
-    ABS1            %1, %2
71
-    paddusw         %3, %1
72
-%endmacro
73
-
74
-%macro ABS2_SUM 6
75
-    ABS2            %1, %2, %3, %4
76
-    paddusw         %5, %1
77
-    paddusw         %6, %2
78
-%endmacro
79
-
80
-%macro ABS_SUM_8x8_64 1
81
-    ABS2            m0, m1, m8, m9
82
-    ABS2_SUM        m2, m3, m8, m9, m0, m1
83
-    ABS2_SUM        m4, m5, m8, m9, m0, m1
84
-    ABS2_SUM        m6, m7, m8, m9, m0, m1
85
-    paddusw         m0, m1
86
-%endmacro
87
-
88
-%macro ABS_SUM_8x8_32 1
89
-    mova          [%1], m7
90
-    ABS1            m0, m7
91
-    ABS1            m1, m7
92
-    ABS1_SUM        m2, m7, m0
93
-    ABS1_SUM        m3, m7, m1
94
-    ABS1_SUM        m4, m7, m0
95
-    ABS1_SUM        m5, m7, m1
96
-    ABS1_SUM        m6, m7, m0
97
-    mova            m2, [%1]
98
-    ABS1_SUM        m2, m7, m1
99
-    paddusw         m0, m1
100
-%endmacro
101
-
102
-; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
103
-; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
104
-; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
105
-%macro HSUM 3
106
-%if cpuflag(sse2)
107
-    movhlps         %2, %1
108
-    paddusw         %1, %2
109
-    pshuflw         %2, %1, 0xE
110
-    paddusw         %1, %2
111
-    pshuflw         %2, %1, 0x1
112
-    paddusw         %1, %2
113
-    movd            %3, %1
114
-%elif cpuflag(mmxext)
115
-    pshufw          %2, %1, 0xE
116
-    paddusw         %1, %2
117
-    pshufw          %2, %1, 0x1
118
-    paddusw         %1, %2
119
-    movd            %3, %1
120
-%elif cpuflag(mmx)
121
-    mova            %2, %1
122
-    psrlq           %1, 32
123
-    paddusw         %1, %2
124
-    mova            %2, %1
125
-    psrlq           %1, 16
126
-    paddusw         %1, %2
127
-    movd            %3, %1
128
-%endif
129
-%endmacro
130
-
131
-%macro STORE4 5
132
-    mova [%1+mmsize*0], %2
133
-    mova [%1+mmsize*1], %3
134
-    mova [%1+mmsize*2], %4
135
-    mova [%1+mmsize*3], %5
136
-%endmacro
137
-
138
-%macro LOAD4 5
139
-    mova            %2, [%1+mmsize*0]
140
-    mova            %3, [%1+mmsize*1]
141
-    mova            %4, [%1+mmsize*2]
142
-    mova            %5, [%1+mmsize*3]
143
-%endmacro
144
-
145
-%macro hadamard8_16_wrapper 2
146
-cglobal hadamard8_diff, 4, 4, %1
147
-%ifndef m8
148
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
149
-    SUB            rsp, pad
150
-%endif
151
-    call hadamard8x8_diff %+ SUFFIX
152
-%ifndef m8
153
-    ADD            rsp, pad
154
-%endif
155
-    RET
156
-
157
-cglobal hadamard8_diff16, 5, 6, %1
158
-%ifndef m8
159
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
160
-    SUB            rsp, pad
161
-%endif
162
-
163
-    call hadamard8x8_diff %+ SUFFIX
164
-    mov            r5d, eax
165
-
166
-    add             r1, 8
167
-    add             r2, 8
168
-    call hadamard8x8_diff %+ SUFFIX
169
-    add            r5d, eax
170
-
171
-    cmp            r4d, 16
172
-    jne .done
173
-
174
-    lea             r1, [r1+r3*8-8]
175
-    lea             r2, [r2+r3*8-8]
176
-    call hadamard8x8_diff %+ SUFFIX
177
-    add            r5d, eax
178
-
179
-    add             r1, 8
180
-    add             r2, 8
181
-    call hadamard8x8_diff %+ SUFFIX
182
-    add            r5d, eax
183
-
184
-.done:
185
-    mov            eax, r5d
186
-%ifndef m8
187
-    ADD            rsp, pad
188
-%endif
189
-    RET
190
-%endmacro
191
-
192
-%macro HADAMARD8_DIFF 0-1
193
-%if cpuflag(sse2)
194
-hadamard8x8_diff %+ SUFFIX:
195
-    lea                          r0, [r3*3]
196
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
197
-    HADAMARD8
198
-%if ARCH_X86_64
199
-    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
200
-%else
201
-    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
202
-%endif
203
-    HADAMARD8
204
-    ABS_SUM_8x8         rsp+gprsize
205
-    HSUM                        m0, m1, eax
206
-    and                         eax, 0xFFFF
207
-    ret
208
-
209
-hadamard8_16_wrapper %1, 3
210
-%elif cpuflag(mmx)
211
-ALIGN 16
212
-; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
213
-;                               uint8_t *src2, int stride, int h)
214
-; r0 = void *s = unused, int h = unused (always 8)
215
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
216
-; can simply call this 2x2x (and that's why we access rsp+gprsize
217
-; everywhere, which is rsp of calling func
218
-hadamard8x8_diff %+ SUFFIX:
219
-    lea                          r0, [r3*3]
220
-
221
-    ; first 4x8 pixels
222
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
223
-    HADAMARD8
224
-    mova         [rsp+gprsize+0x60], m7
225
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
226
-    STORE4              rsp+gprsize, m0, m1, m2, m3
227
-    mova                         m7, [rsp+gprsize+0x60]
228
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
229
-    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
230
-
231
-    ; second 4x8 pixels
232
-    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
233
-    HADAMARD8
234
-    mova         [rsp+gprsize+0x60], m7
235
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
236
-    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
237
-    mova                         m7, [rsp+gprsize+0x60]
238
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
239
-
240
-    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
241
-    HADAMARD8
242
-    ABS_SUM_8x8_32 rsp+gprsize+0x60
243
-    mova         [rsp+gprsize+0x60], m0
244
-
245
-    LOAD4          rsp+gprsize     , m0, m1, m2, m3
246
-    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
247
-    HADAMARD8
248
-    ABS_SUM_8x8_32 rsp+gprsize
249
-    paddusw                      m0, [rsp+gprsize+0x60]
250
-
251
-    HSUM                         m0, m1, eax
252
-    and                         rax, 0xFFFF
253
-    ret
254
-
255
-hadamard8_16_wrapper 0, 14
256
-%endif
257
-%endmacro
258
-
259
-INIT_MMX mmx
260
-HADAMARD8_DIFF
261
-
262
-INIT_MMX mmxext
263
-HADAMARD8_DIFF
264
-
265
-INIT_XMM sse2
266
-%if ARCH_X86_64
267
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
268
-%else
269
-%define ABS_SUM_8x8 ABS_SUM_8x8_32
270
-%endif
271
-HADAMARD8_DIFF 10
272
-
273
-INIT_XMM ssse3
274
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
275
-HADAMARD8_DIFF 9
276
-
277
-; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
278
-;               int line_size, int h)
279
-
280
-%macro SUM_SQUARED_ERRORS 1
281
-cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
282
-%if %1 == mmsize
283
-    shr       hd, 1
284
-%endif
285
-    pxor      m0, m0         ; mm0 = 0
286
-    pxor      m7, m7         ; mm7 holds the sum
287
-
288
-.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
289
-    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
290
-    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
291
-%if %1 == mmsize
292
-    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
293
-    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
294
-%else  ; %1 / 2 == mmsize; mmx only
295
-    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
296
-    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
297
-%endif
298
-
299
-    ; todo: mm1-mm2, mm3-mm4
300
-    ; algo: subtract mm1 from mm2 with saturation and vice versa
301
-    ;       OR the result to get the absolute difference
302
-    mova      m5, m1
303
-    mova      m6, m3
304
-    psubusb   m1, m2
305
-    psubusb   m3, m4
306
-    psubusb   m2, m5
307
-    psubusb   m4, m6
308
-
309
-    por       m2, m1
310
-    por       m4, m3
311
-
312
-    ; now convert to 16-bit vectors so we can square them
313
-    mova      m1, m2
314
-    mova      m3, m4
315
-
316
-    punpckhbw m2, m0
317
-    punpckhbw m4, m0
318
-    punpcklbw m1, m0         ; mm1 not spread over (mm1,mm2)
319
-    punpcklbw m3, m0         ; mm4 not spread over (mm3,mm4)
320
-
321
-    pmaddwd   m2, m2
322
-    pmaddwd   m4, m4
323
-    pmaddwd   m1, m1
324
-    pmaddwd   m3, m3
325
-
326
-    paddd     m1, m2
327
-    paddd     m3, m4
328
-    paddd     m7, m1
329
-    paddd     m7, m3
330
-
331
-%if %1 == mmsize
332
-    lea    pix1q, [pix1q + 2*lsizeq]
333
-    lea    pix2q, [pix2q + 2*lsizeq]
334
-%else
335
-    add    pix1q, lsizeq
336
-    add    pix2q, lsizeq
337
-%endif
338
-    dec       hd
339
-    jnz .next2lines
340
-
341
-    HADDD     m7, m1
342
-    movd     eax, m7         ; return value
343
-    RET
344
-%endmacro
345
-
346
-INIT_MMX mmx
347
-SUM_SQUARED_ERRORS 8
348
-
349
-INIT_MMX mmx
350
-SUM_SQUARED_ERRORS 16
351
-
352
-INIT_XMM sse2
353
-SUM_SQUARED_ERRORS 16
354
-
355
-;-----------------------------------------------
356
-;int ff_sum_abs_dctelem(int16_t *block)
357
-;-----------------------------------------------
358
-; %1 = number of xmm registers used
359
-; %2 = number of inline loops
360
-
361
-%macro SUM_ABS_DCTELEM 2
362
-cglobal sum_abs_dctelem, 1, 1, %1, block
363
-    pxor    m0, m0
364
-    pxor    m1, m1
365
-%assign %%i 0
366
-%rep %2
367
-    mova      m2, [blockq+mmsize*(0+%%i)]
368
-    mova      m3, [blockq+mmsize*(1+%%i)]
369
-    mova      m4, [blockq+mmsize*(2+%%i)]
370
-    mova      m5, [blockq+mmsize*(3+%%i)]
371
-    ABS1_SUM  m2, m6, m0
372
-    ABS1_SUM  m3, m6, m1
373
-    ABS1_SUM  m4, m6, m0
374
-    ABS1_SUM  m5, m6, m1
375
-%assign %%i %%i+4
376
-%endrep
377
-    paddusw m0, m1
378
-    HSUM    m0, m1, eax
379
-    and     eax, 0xFFFF
380
-    RET
381
-%endmacro
382
-
383
-INIT_MMX mmx
384
-SUM_ABS_DCTELEM 0, 4
385
-INIT_MMX mmxext
386
-SUM_ABS_DCTELEM 0, 4
387
-INIT_XMM sse2
388
-SUM_ABS_DCTELEM 7, 2
389
-INIT_XMM ssse3
390
-SUM_ABS_DCTELEM 6, 2
391
-
392
-;------------------------------------------------------------------------------
393
-; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
394
-;------------------------------------------------------------------------------
395
-; %1 = 8/16. %2-5=m#
396
-%macro HF_NOISE_PART1 5
397
-    mova      m%2, [pix1q]
398
-%if %1 == 8
399
-    mova      m%3, m%2
400
-    psllq     m%2, 8
401
-    psrlq     m%3, 8
402
-    psrlq     m%2, 8
403
-%else
404
-    mova      m%3, [pix1q+1]
405
-%endif
406
-    mova      m%4, m%2
407
-    mova      m%5, m%3
408
-    punpcklbw m%2, m7
409
-    punpcklbw m%3, m7
410
-    punpckhbw m%4, m7
411
-    punpckhbw m%5, m7
412
-    psubw     m%2, m%3
413
-    psubw     m%4, m%5
414
-%endmacro
415
-
416
-; %1-2 = m#
417
-%macro HF_NOISE_PART2 4
418
-    psubw     m%1, m%3
419
-    psubw     m%2, m%4
420
-    pxor       m3, m3
421
-    pxor       m1, m1
422
-    pcmpgtw    m3, m%1
423
-    pcmpgtw    m1, m%2
424
-    pxor      m%1, m3
425
-    pxor      m%2, m1
426
-    psubw     m%1, m3
427
-    psubw     m%2, m1
428
-    paddw     m%2, m%1
429
-    paddw      m6, m%2
430
-%endmacro
431
-
432
-; %1 = 8/16
433
-%macro HF_NOISE 1
434
-cglobal hf_noise%1, 3,3,0, pix1, lsize, h
435
-    movsxdifnidn lsizeq, lsized
436
-    sub        hd, 2
437
-    pxor       m7, m7
438
-    pxor       m6, m6
439
-    HF_NOISE_PART1 %1, 0, 1, 2, 3
440
-    add     pix1q, lsizeq
441
-    HF_NOISE_PART1 %1, 4, 1, 5, 3
442
-    HF_NOISE_PART2     0, 2, 4, 5
443
-    add     pix1q, lsizeq
444
-.loop:
445
-    HF_NOISE_PART1 %1, 0, 1, 2, 3
446
-    HF_NOISE_PART2     4, 5, 0, 2
447
-    add     pix1q, lsizeq
448
-    HF_NOISE_PART1 %1, 4, 1, 5, 3
449
-    HF_NOISE_PART2     0, 2, 4, 5
450
-    add     pix1q, lsizeq
451
-    sub        hd, 2
452
-        jne .loop
453
-
454
-    mova       m0, m6
455
-    punpcklwd  m0, m7
456
-    punpckhwd  m6, m7
457
-    paddd      m6, m0
458
-    mova       m0, m6
459
-    psrlq      m6, 32
460
-    paddd      m0, m6
461
-    movd      eax, m0   ; eax = result of hf_noise8;
462
-    REP_RET                 ; return eax;
463
-%endmacro
464
-
465
-INIT_MMX mmx
466
-HF_NOISE 8
467
-HF_NOISE 16
468 1
new file mode 100644
... ...
@@ -0,0 +1,467 @@
0
+;*****************************************************************************
1
+;* SIMD-optimized motion compensation estimation
2
+;*****************************************************************************
3
+;* Copyright (c) 2000, 2001 Fabrice Bellard
4
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
+;*
6
+;* This file is part of FFmpeg.
7
+;*
8
+;* FFmpeg is free software; you can redistribute it and/or
9
+;* modify it under the terms of the GNU Lesser General Public
10
+;* License as published by the Free Software Foundation; either
11
+;* version 2.1 of the License, or (at your option) any later version.
12
+;*
13
+;* FFmpeg is distributed in the hope that it will be useful,
14
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
+;* Lesser General Public License for more details.
17
+;*
18
+;* You should have received a copy of the GNU Lesser General Public
19
+;* License along with FFmpeg; if not, write to the Free Software
20
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+;*****************************************************************************
22
+
23
+%include "libavutil/x86/x86util.asm"
24
+
25
+SECTION .text
26
+
27
+%macro DIFF_PIXELS_1 4
28
+    movh            %1, %3
29
+    movh            %2, %4
30
+    punpcklbw       %2, %1
31
+    punpcklbw       %1, %1
32
+    psubw           %1, %2
33
+%endmacro
34
+
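An aside on what DIFF_PIXELS_1 computes: interleaving %1 with itself gives words of the form p1 + (p1 << 8), interleaving %2 with %1 gives p2 + (p1 << 8), so the final psubw leaves the signed difference p1 - p2 in each word lane without needing a zero register. A scalar sketch (the helper name is mine, not part of this commit):

    #include <stdint.h>

    /* Scalar equivalent of DIFF_PIXELS_1 for one row of 8 pixels:
     * dst[i] = pix1[i] - pix2[i], widened to signed 16 bits. */
    static void diff_pixels_row(int16_t dst[8],
                                const uint8_t *pix1, const uint8_t *pix2)
    {
        for (int i = 0; i < 8; i++)
            dst[i] = (int16_t)(pix1[i] - pix2[i]);
    }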
35
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
36
+; %6=temporary storage location
37
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
38
+%macro DIFF_PIXELS_8 6
39
+    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
40
+    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
41
+    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
42
+    add             %1, %5
43
+    add             %2, %5
44
+    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
45
+    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
46
+    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
47
+    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
48
+%ifdef m8
49
+    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
50
+%else
51
+    mova          [%6], m0
52
+    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
53
+    mova            m0, [%6]
54
+%endif
55
+    sub             %1, %5
56
+    sub             %2, %5
57
+%endmacro
58
+
59
+%macro HADAMARD8 0
60
+    SUMSUB_BADC       w, 0, 1, 2, 3
61
+    SUMSUB_BADC       w, 4, 5, 6, 7
62
+    SUMSUB_BADC       w, 0, 2, 1, 3
63
+    SUMSUB_BADC       w, 4, 6, 5, 7
64
+    SUMSUB_BADC       w, 0, 4, 1, 5
65
+    SUMSUB_BADC       w, 2, 6, 3, 7
66
+%endmacro
67
+
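HADAMARD8 is three rounds of paired SUMSUB_BADC butterflies across eight word registers, i.e. an unnormalized 8-point Hadamard transform applied lane-wise. A scalar sketch of the same butterfly structure (hypothetical helper; sign and ordering conventions aside):

    /* Unnormalized 8-point Hadamard transform: butterflies with step 1, 2, 4,
     * matching the three SUMSUB_BADC rounds above. */
    static void hadamard8_1d(int v[8])
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int i = 0; i < 8; i += 2 * step)
                for (int j = i; j < i + step; j++) {
                    int a = v[j], b = v[j + step];
                    v[j]        = a + b;
                    v[j + step] = a - b;
                }
    }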
68
+%macro ABS1_SUM 3
69
+    ABS1            %1, %2
70
+    paddusw         %3, %1
71
+%endmacro
72
+
73
+%macro ABS2_SUM 6
74
+    ABS2            %1, %2, %3, %4
75
+    paddusw         %5, %1
76
+    paddusw         %6, %2
77
+%endmacro
78
+
79
+%macro ABS_SUM_8x8_64 1
80
+    ABS2            m0, m1, m8, m9
81
+    ABS2_SUM        m2, m3, m8, m9, m0, m1
82
+    ABS2_SUM        m4, m5, m8, m9, m0, m1
83
+    ABS2_SUM        m6, m7, m8, m9, m0, m1
84
+    paddusw         m0, m1
85
+%endmacro
86
+
87
+%macro ABS_SUM_8x8_32 1
88
+    mova          [%1], m7
89
+    ABS1            m0, m7
90
+    ABS1            m1, m7
91
+    ABS1_SUM        m2, m7, m0
92
+    ABS1_SUM        m3, m7, m1
93
+    ABS1_SUM        m4, m7, m0
94
+    ABS1_SUM        m5, m7, m1
95
+    ABS1_SUM        m6, m7, m0
96
+    mova            m2, [%1]
97
+    ABS1_SUM        m2, m7, m1
98
+    paddusw         m0, m1
99
+%endmacro
100
+
101
+; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
102
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
103
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
104
+%macro HSUM 3
105
+%if cpuflag(sse2)
106
+    movhlps         %2, %1
107
+    paddusw         %1, %2
108
+    pshuflw         %2, %1, 0xE
109
+    paddusw         %1, %2
110
+    pshuflw         %2, %1, 0x1
111
+    paddusw         %1, %2
112
+    movd            %3, %1
113
+%elif cpuflag(mmxext)
114
+    pshufw          %2, %1, 0xE
115
+    paddusw         %1, %2
116
+    pshufw          %2, %1, 0x1
117
+    paddusw         %1, %2
118
+    movd            %3, %1
119
+%elif cpuflag(mmx)
120
+    mova            %2, %1
121
+    psrlq           %1, 32
122
+    paddusw         %1, %2
123
+    mova            %2, %1
124
+    psrlq           %1, 16
125
+    paddusw         %1, %2
126
+    movd            %3, %1
127
+%endif
128
+%endmacro
129
+
130
+%macro STORE4 5
131
+    mova [%1+mmsize*0], %2
132
+    mova [%1+mmsize*1], %3
133
+    mova [%1+mmsize*2], %4
134
+    mova [%1+mmsize*3], %5
135
+%endmacro
136
+
137
+%macro LOAD4 5
138
+    mova            %2, [%1+mmsize*0]
139
+    mova            %3, [%1+mmsize*1]
140
+    mova            %4, [%1+mmsize*2]
141
+    mova            %5, [%1+mmsize*3]
142
+%endmacro
143
+
144
+%macro hadamard8_16_wrapper 2
145
+cglobal hadamard8_diff, 4, 4, %1
146
+%ifndef m8
147
+    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
148
+    SUB            rsp, pad
149
+%endif
150
+    call hadamard8x8_diff %+ SUFFIX
151
+%ifndef m8
152
+    ADD            rsp, pad
153
+%endif
154
+    RET
155
+
156
+cglobal hadamard8_diff16, 5, 6, %1
157
+%ifndef m8
158
+    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
159
+    SUB            rsp, pad
160
+%endif
161
+
162
+    call hadamard8x8_diff %+ SUFFIX
163
+    mov            r5d, eax
164
+
165
+    add             r1, 8
166
+    add             r2, 8
167
+    call hadamard8x8_diff %+ SUFFIX
168
+    add            r5d, eax
169
+
170
+    cmp            r4d, 16
171
+    jne .done
172
+
173
+    lea             r1, [r1+r3*8-8]
174
+    lea             r2, [r2+r3*8-8]
175
+    call hadamard8x8_diff %+ SUFFIX
176
+    add            r5d, eax
177
+
178
+    add             r1, 8
179
+    add             r2, 8
180
+    call hadamard8x8_diff %+ SUFFIX
181
+    add            r5d, eax
182
+
183
+.done:
184
+    mov            eax, r5d
185
+%ifndef m8
186
+    ADD            rsp, pad
187
+%endif
188
+    RET
189
+%endmacro
190
+
191
+%macro HADAMARD8_DIFF 0-1
192
+%if cpuflag(sse2)
193
+hadamard8x8_diff %+ SUFFIX:
194
+    lea                          r0, [r3*3]
195
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
196
+    HADAMARD8
197
+%if ARCH_X86_64
198
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
199
+%else
200
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
201
+%endif
202
+    HADAMARD8
203
+    ABS_SUM_8x8         rsp+gprsize
204
+    HSUM                        m0, m1, eax
205
+    and                         eax, 0xFFFF
206
+    ret
207
+
208
+hadamard8_16_wrapper %1, 3
209
+%elif cpuflag(mmx)
210
+ALIGN 16
211
+; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
212
+;                               uint8_t *src2, int stride, int h)
213
+; r0 = void *s = unused, int h = unused (always 8)
214
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16
215
+; can simply call this 2x2 times (and that's why we access rsp+gprsize
216
+; everywhere, which is rsp of the calling func)
217
+hadamard8x8_diff %+ SUFFIX:
218
+    lea                          r0, [r3*3]
219
+
220
+    ; first 4x8 pixels
221
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
222
+    HADAMARD8
223
+    mova         [rsp+gprsize+0x60], m7
224
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
225
+    STORE4              rsp+gprsize, m0, m1, m2, m3
226
+    mova                         m7, [rsp+gprsize+0x60]
227
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
228
+    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
229
+
230
+    ; second 4x8 pixels
231
+    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
232
+    HADAMARD8
233
+    mova         [rsp+gprsize+0x60], m7
234
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
235
+    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
236
+    mova                         m7, [rsp+gprsize+0x60]
237
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
238
+
239
+    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
240
+    HADAMARD8
241
+    ABS_SUM_8x8_32 rsp+gprsize+0x60
242
+    mova         [rsp+gprsize+0x60], m0
243
+
244
+    LOAD4          rsp+gprsize     , m0, m1, m2, m3
245
+    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
246
+    HADAMARD8
247
+    ABS_SUM_8x8_32 rsp+gprsize
248
+    paddusw                      m0, [rsp+gprsize+0x60]
249
+
250
+    HSUM                         m0, m1, eax
251
+    and                         rax, 0xFFFF
252
+    ret
253
+
254
+hadamard8_16_wrapper 0, 14
255
+%endif
256
+%endmacro
257
+
258
+INIT_MMX mmx
259
+HADAMARD8_DIFF
260
+
261
+INIT_MMX mmxext
262
+HADAMARD8_DIFF
263
+
264
+INIT_XMM sse2
265
+%if ARCH_X86_64
266
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
267
+%else
268
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
269
+%endif
270
+HADAMARD8_DIFF 10
271
+
272
+INIT_XMM ssse3
273
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
274
+HADAMARD8_DIFF 9
275
+
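Taken together, hadamard8_diff is an 8x8 SATD: difference the two blocks, run the Hadamard transform over rows and columns, and sum the absolute coefficients (the final and eax, 0xFFFF caps the result at 16 bits). A rough scalar reference, reusing the hadamard8_1d sketch above (helper names are mine, not the codec's):

    /* Scalar 8x8 SATD sketch; the SIMD code above additionally saturates
     * intermediate sums at 16 bits (see the HSUM comment). */
    static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
                                  int stride)
    {
        int m[8][8], sum = 0;

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                m[y][x] = src1[y * stride + x] - src2[y * stride + x];

        for (int y = 0; y < 8; y++)          /* transform rows */
            hadamard8_1d(m[y]);

        for (int x = 0; x < 8; x++) {        /* transform columns */
            int col[8];
            for (int y = 0; y < 8; y++) col[y] = m[y][x];
            hadamard8_1d(col);
            for (int y = 0; y < 8; y++) m[y][x] = col[y];
        }

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sum += m[y][x] < 0 ? -m[y][x] : m[y][x];
        return sum;
    }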
276
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
277
+;               int line_size, int h)
278
+
279
+%macro SUM_SQUARED_ERRORS 1
280
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
281
+%if %1 == mmsize
282
+    shr       hd, 1
283
+%endif
284
+    pxor      m0, m0         ; mm0 = 0
285
+    pxor      m7, m7         ; mm7 holds the sum
286
+
287
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
288
+    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
289
+    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
290
+%if %1 == mmsize
291
+    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
292
+    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
293
+%else  ; %1 / 2 == mmsize; mmx only
294
+    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
295
+    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
296
+%endif
297
+
298
+    ; todo: mm1-mm2, mm3-mm4
299
+    ; algo: subtract mm1 from mm2 with saturation and vice versa
300
+    ;       OR the result to get the absolute difference
301
+    mova      m5, m1
302
+    mova      m6, m3
303
+    psubusb   m1, m2
304
+    psubusb   m3, m4
305
+    psubusb   m2, m5
306
+    psubusb   m4, m6
307
+
308
+    por       m2, m1
309
+    por       m4, m3
310
+
311
+    ; now convert to 16-bit vectors so we can square them
312
+    mova      m1, m2
313
+    mova      m3, m4
314
+
315
+    punpckhbw m2, m0
316
+    punpckhbw m4, m0
317
+    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
318
+    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)
319
+
320
+    pmaddwd   m2, m2
321
+    pmaddwd   m4, m4
322
+    pmaddwd   m1, m1
323
+    pmaddwd   m3, m3
324
+
325
+    paddd     m1, m2
326
+    paddd     m3, m4
327
+    paddd     m7, m1
328
+    paddd     m7, m3
329
+
330
+%if %1 == mmsize
331
+    lea    pix1q, [pix1q + 2*lsizeq]
332
+    lea    pix2q, [pix2q + 2*lsizeq]
333
+%else
334
+    add    pix1q, lsizeq
335
+    add    pix2q, lsizeq
336
+%endif
337
+    dec       hd
338
+    jnz .next2lines
339
+
340
+    HADDD     m7, m1
341
+    movd     eax, m7         ; return value
342
+    RET
343
+%endmacro
344
+
345
+INIT_MMX mmx
346
+SUM_SQUARED_ERRORS 8
347
+
348
+INIT_MMX mmx
349
+SUM_SQUARED_ERRORS 16
350
+
351
+INIT_XMM sse2
352
+SUM_SQUARED_ERRORS 16
353
+
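The "algo" comment above is the key trick: before psadbw/SSE2 there is no packed absolute-difference instruction, so |a - b| is built as (a saturating-minus b) OR (b saturating-minus a), then widened and squared with pmaddwd. In scalar terms the routines return a plain sum of squared errors (hypothetical reference helper, assuming line_size is the row stride in bytes):

    /* Scalar sum of squared errors over a w x h block (w = 8 or 16). */
    static int sse_ref(const uint8_t *pix1, const uint8_t *pix2,
                       int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x]; /* asm: |d| = (a -sat b) | (b -sat a) */
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }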
354
+;-----------------------------------------------
355
+;int ff_sum_abs_dctelem(int16_t *block)
356
+;-----------------------------------------------
357
+; %1 = number of xmm registers used
358
+; %2 = number of inline loops
359
+
360
+%macro SUM_ABS_DCTELEM 2
361
+cglobal sum_abs_dctelem, 1, 1, %1, block
362
+    pxor    m0, m0
363
+    pxor    m1, m1
364
+%assign %%i 0
365
+%rep %2
366
+    mova      m2, [blockq+mmsize*(0+%%i)]
367
+    mova      m3, [blockq+mmsize*(1+%%i)]
368
+    mova      m4, [blockq+mmsize*(2+%%i)]
369
+    mova      m5, [blockq+mmsize*(3+%%i)]
370
+    ABS1_SUM  m2, m6, m0
371
+    ABS1_SUM  m3, m6, m1
372
+    ABS1_SUM  m4, m6, m0
373
+    ABS1_SUM  m5, m6, m1
374
+%assign %%i %%i+4
375
+%endrep
376
+    paddusw m0, m1
377
+    HSUM    m0, m1, eax
378
+    and     eax, 0xFFFF
379
+    RET
380
+%endmacro
381
+
382
+INIT_MMX mmx
383
+SUM_ABS_DCTELEM 0, 4
384
+INIT_MMX mmxext
385
+SUM_ABS_DCTELEM 0, 4
386
+INIT_XMM sse2
387
+SUM_ABS_DCTELEM 7, 2
388
+INIT_XMM ssse3
389
+SUM_ABS_DCTELEM 6, 2
390
+
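sum_abs_dctelem accumulates the absolute values of all 64 coefficients of a transformed block; since the SIMD versions add with unsigned saturation and mask with 0xFFFF, the result is effectively clamped to 16 bits. A scalar sketch (the helper name is illustrative only):

    /* Sum of |coefficient| over one 64-element block of DCT coefficients. */
    static int sum_abs_dctelem_ref(const int16_t *block)
    {
        int sum = 0;
        for (int i = 0; i < 64; i++)
            sum += block[i] < 0 ? -block[i] : block[i];
        return sum;   /* the SIMD versions clamp this at 0xFFFF */
    }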
391
+;------------------------------------------------------------------------------
392
+; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
393
+;------------------------------------------------------------------------------
394
+; %1 = 8/16. %2-5=m#
395
+%macro HF_NOISE_PART1 5
396
+    mova      m%2, [pix1q]
397
+%if %1 == 8
398
+    mova      m%3, m%2
399
+    psllq     m%2, 8
400
+    psrlq     m%3, 8
401
+    psrlq     m%2, 8
402
+%else
403
+    mova      m%3, [pix1q+1]
404
+%endif
405
+    mova      m%4, m%2
406
+    mova      m%5, m%3
407
+    punpcklbw m%2, m7
408
+    punpcklbw m%3, m7
409
+    punpckhbw m%4, m7
410
+    punpckhbw m%5, m7
411
+    psubw     m%2, m%3
412
+    psubw     m%4, m%5
413
+%endmacro
414
+
415
+; %1-2 = m#
416
+%macro HF_NOISE_PART2 4
417
+    psubw     m%1, m%3
418
+    psubw     m%2, m%4
419
+    pxor       m3, m3
420
+    pxor       m1, m1
421
+    pcmpgtw    m3, m%1
422
+    pcmpgtw    m1, m%2
423
+    pxor      m%1, m3
424
+    pxor      m%2, m1
425
+    psubw     m%1, m3
426
+    psubw     m%2, m1
427
+    paddw     m%2, m%1
428
+    paddw      m6, m%2
429
+%endmacro
430
+
431
+; %1 = 8/16
432
+%macro HF_NOISE 1
433
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
434
+    movsxdifnidn lsizeq, lsized
435
+    sub        hd, 2
436
+    pxor       m7, m7
437
+    pxor       m6, m6
438
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
439
+    add     pix1q, lsizeq
440
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
441
+    HF_NOISE_PART2     0, 2, 4, 5
442
+    add     pix1q, lsizeq
443
+.loop:
444
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
445
+    HF_NOISE_PART2     4, 5, 0, 2
446
+    add     pix1q, lsizeq
447
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
448
+    HF_NOISE_PART2     0, 2, 4, 5
449
+    add     pix1q, lsizeq
450
+    sub        hd, 2
451
+        jne .loop
452
+
453
+    mova       m0, m6
454
+    punpcklwd  m0, m7
455
+    punpckhwd  m6, m7
456
+    paddd      m6, m0
457
+    mova       m0, m6
458
+    psrlq      m6, 32
459
+    paddd      m0, m6
460
+    movd      eax, m0   ; eax = result of hf_noise8;
461
+    REP_RET                 ; return eax;
462
+%endmacro
463
+
464
+INIT_MMX mmx
465
+HF_NOISE 8
466
+HF_NOISE 16
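hf_noise estimates high-frequency content: HF_NOISE_PART1 takes horizontal neighbour differences within a row, HF_NOISE_PART2 subtracts the previous row's differences and accumulates the absolute result, so the total is a sum of absolute second-order differences. A scalar sketch of the 8-wide case, based on my reading of the asm (treat the details, such as the 7-column range, as an assumption):

    /* Scalar sketch of ff_hf_noise8: sum over adjacent row pairs of the
     * absolute difference of horizontal pixel differences. */
    static int hf_noise8_ref(const uint8_t *pix, int lsize, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++)
            for (int x = 0; x < 7; x++) {
                int d0 = pix[y * lsize + x]       - pix[y * lsize + x + 1];
                int d1 = pix[(y + 1) * lsize + x] - pix[(y + 1) * lsize + x + 1];
                int d  = d0 - d1;
                sum += d < 0 ? -d : d;
            }
        return sum;
    }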
0 467
new file mode 100644
... ...
@@ -0,0 +1,845 @@
0
+/*
1
+ * SIMD-optimized motion estimation
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of FFmpeg.
8
+ *
9
+ * FFmpeg is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * FFmpeg is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with FFmpeg; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/x86/asm.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/me_cmp.h"
29
+#include "libavcodec/mpegvideo.h"
30
+
31
+int ff_sum_abs_dctelem_mmx(int16_t *block);
32
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
33
+int ff_sum_abs_dctelem_sse2(int16_t *block);
34
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
35
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
36
+                int line_size, int h);
37
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38
+                 int line_size, int h);
39
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
40
+                  int line_size, int h);
41
+int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
42
+int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
43
+
44
+#define hadamard_func(cpu)                                              \
45
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
46
+                                  uint8_t *src2, int stride, int h);    \
47
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
48
+                                    uint8_t *src2, int stride, int h);
49
+
50
+hadamard_func(mmx)
51
+hadamard_func(mmxext)
52
+hadamard_func(sse2)
53
+hadamard_func(ssse3)
54
+
55
+#if HAVE_YASM
56
+static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
57
+                      int line_size, int h)
58
+{
59
+    int score1, score2;
60
+
61
+    if (c)
62
+        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
63
+    else
64
+        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
65
+    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
66
+           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
67
+
68
+    if (c)
69
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
70
+    else
71
+        return score1 + FFABS(score2) * 8;
72
+}
73
+
74
+static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
75
+                     int line_size, int h)
76
+{
77
+    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
78
+    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
79
+                 ff_hf_noise8_mmx(pix2, line_size, h);
80
+
81
+    if (c)
82
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
83
+    else
84
+        return score1 + FFABS(score2) * 8;
85
+}
86
+
87
+#endif /* HAVE_YASM */
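These wrappers implement the NSSE ("noise preserving sum of squared differences") metric: plain SSE plus a penalty proportional to how much high-frequency detail differs between source and reconstruction, i.e. score = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|, with a default weight of 8 when no context is available. As a hedged usage sketch, an encoder user would select it through the public AVCodecContext fields:

    #include <libavcodec/avcodec.h>

    /* Illustrative only: pick NSSE for macroblock comparison and tune the
     * weight of the high-frequency noise term. */
    static void use_nsse(AVCodecContext *avctx)
    {
        avctx->mb_cmp      = FF_CMP_NSSE;
        avctx->nsse_weight = 8;
    }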
88
+
89
+#if HAVE_INLINE_ASM
90
+
91
+static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
92
+                            int line_size, int h)
93
+{
94
+    int tmp;
95
+
96
+    av_assert2((((int) pix) & 7) == 0);
97
+    av_assert2((line_size & 7) == 0);
98
+
99
+#define SUM(in0, in1, out0, out1)               \
100
+    "movq (%0), %%mm2\n"                        \
101
+    "movq 8(%0), %%mm3\n"                       \
102
+    "add %2,%0\n"                               \
103
+    "movq %%mm2, " #out0 "\n"                   \
104
+    "movq %%mm3, " #out1 "\n"                   \
105
+    "psubusb " #in0 ", %%mm2\n"                 \
106
+    "psubusb " #in1 ", %%mm3\n"                 \
107
+    "psubusb " #out0 ", " #in0 "\n"             \
108
+    "psubusb " #out1 ", " #in1 "\n"             \
109
+    "por %%mm2, " #in0 "\n"                     \
110
+    "por %%mm3, " #in1 "\n"                     \
111
+    "movq " #in0 ", %%mm2\n"                    \
112
+    "movq " #in1 ", %%mm3\n"                    \
113
+    "punpcklbw %%mm7, " #in0 "\n"               \
114
+    "punpcklbw %%mm7, " #in1 "\n"               \
115
+    "punpckhbw %%mm7, %%mm2\n"                  \
116
+    "punpckhbw %%mm7, %%mm3\n"                  \
117
+    "paddw " #in1 ", " #in0 "\n"                \
118
+    "paddw %%mm3, %%mm2\n"                      \
119
+    "paddw %%mm2, " #in0 "\n"                   \
120
+    "paddw " #in0 ", %%mm6\n"
121
+
122
+
123
+    __asm__ volatile (
124
+        "movl    %3, %%ecx\n"
125
+        "pxor %%mm6, %%mm6\n"
126
+        "pxor %%mm7, %%mm7\n"
127
+        "movq  (%0), %%mm0\n"
128
+        "movq 8(%0), %%mm1\n"
129
+        "add %2, %0\n"
130
+        "jmp 2f\n"
131
+        "1:\n"
132
+
133
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
134
+        "2:\n"
135
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
136
+
137
+        "subl $2, %%ecx\n"
138
+        "jnz 1b\n"
139
+
140
+        "movq  %%mm6, %%mm0\n"
141
+        "psrlq $32,   %%mm6\n"
142
+        "paddw %%mm6, %%mm0\n"
143
+        "movq  %%mm0, %%mm6\n"
144
+        "psrlq $16,   %%mm0\n"
145
+        "paddw %%mm6, %%mm0\n"
146
+        "movd  %%mm0, %1\n"
147
+        : "+r" (pix), "=r" (tmp)
148
+        : "r" ((x86_reg) line_size), "m" (h)
149
+        : "%ecx");
150
+
151
+    return tmp & 0xFFFF;
152
+}
153
+#undef SUM
154
+
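vsad_intra16 is a "vertical SAD" of a single block against itself shifted down one row, a cheap measure of vertical activity used by the encoder's comparison framework. Scalar sketch (hypothetical helper; line_size is the row stride):

    /* Sum of |pix[y][x] - pix[y+1][x]| over a 16-pixel-wide block. */
    static int vsad_intra16_ref(const uint8_t *pix, int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++)
            for (int x = 0; x < 16; x++) {
                int d = pix[y * line_size + x] - pix[(y + 1) * line_size + x];
                sum += d < 0 ? -d : d;
            }
        return sum & 0xFFFF;
    }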
155
+static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
156
+                               int line_size, int h)
157
+{
158
+    int tmp;
159
+
160
+    av_assert2((((int) pix) & 7) == 0);
161
+    av_assert2((line_size & 7) == 0);
162
+
163
+#define SUM(in0, in1, out0, out1)               \
164
+    "movq (%0), " #out0 "\n"                    \
165
+    "movq 8(%0), " #out1 "\n"                   \
166
+    "add %2, %0\n"                              \
167
+    "psadbw " #out0 ", " #in0 "\n"              \
168
+    "psadbw " #out1 ", " #in1 "\n"              \
169
+    "paddw " #in1 ", " #in0 "\n"                \
170
+    "paddw " #in0 ", %%mm6\n"
171
+
172
+    __asm__ volatile (
173
+        "movl %3, %%ecx\n"
174
+        "pxor %%mm6, %%mm6\n"
175
+        "pxor %%mm7, %%mm7\n"
176
+        "movq (%0), %%mm0\n"
177
+        "movq 8(%0), %%mm1\n"
178
+        "add %2, %0\n"
179
+        "jmp 2f\n"
180
+        "1:\n"
181
+
182
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
183
+        "2:\n"
184
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
185
+
186
+        "subl $2, %%ecx\n"
187
+        "jnz 1b\n"
188
+
189
+        "movd %%mm6, %1\n"
190
+        : "+r" (pix), "=r" (tmp)
191
+        : "r" ((x86_reg) line_size), "m" (h)
192
+        : "%ecx");
193
+
194
+    return tmp;
195
+}
196
+#undef SUM
197
+
198
+static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
199
+                      int line_size, int h)
200
+{
201
+    int tmp;
202
+
203
+    av_assert2((((int) pix1) & 7) == 0);
204
+    av_assert2((((int) pix2) & 7) == 0);
205
+    av_assert2((line_size & 7) == 0);
206
+
207
+#define SUM(in0, in1, out0, out1)       \
208
+    "movq (%0), %%mm2\n"                \
209
+    "movq (%1), " #out0 "\n"            \
210
+    "movq 8(%0), %%mm3\n"               \
211
+    "movq 8(%1), " #out1 "\n"           \
212
+    "add %3, %0\n"                      \
213
+    "add %3, %1\n"                      \
214
+    "psubb " #out0 ", %%mm2\n"          \
215
+    "psubb " #out1 ", %%mm3\n"          \
216
+    "pxor %%mm7, %%mm2\n"               \
217
+    "pxor %%mm7, %%mm3\n"               \
218
+    "movq %%mm2, " #out0 "\n"           \
219
+    "movq %%mm3, " #out1 "\n"           \
220
+    "psubusb " #in0 ", %%mm2\n"         \
221
+    "psubusb " #in1 ", %%mm3\n"         \
222
+    "psubusb " #out0 ", " #in0 "\n"     \
223
+    "psubusb " #out1 ", " #in1 "\n"     \
224
+    "por %%mm2, " #in0 "\n"             \
225
+    "por %%mm3, " #in1 "\n"             \
226
+    "movq " #in0 ", %%mm2\n"            \
227
+    "movq " #in1 ", %%mm3\n"            \
228
+    "punpcklbw %%mm7, " #in0 "\n"       \
229
+    "punpcklbw %%mm7, " #in1 "\n"       \
230
+    "punpckhbw %%mm7, %%mm2\n"          \
231
+    "punpckhbw %%mm7, %%mm3\n"          \
232
+    "paddw " #in1 ", " #in0 "\n"        \
233
+    "paddw %%mm3, %%mm2\n"              \
234
+    "paddw %%mm2, " #in0 "\n"           \
235
+    "paddw " #in0 ", %%mm6\n"
236
+
237
+
238
+    __asm__ volatile (
239
+        "movl %4, %%ecx\n"
240
+        "pxor %%mm6, %%mm6\n"
241
+        "pcmpeqw %%mm7, %%mm7\n"
242
+        "psllw $15, %%mm7\n"
243
+        "packsswb %%mm7, %%mm7\n"
244
+        "movq (%0), %%mm0\n"
245
+        "movq (%1), %%mm2\n"
246
+        "movq 8(%0), %%mm1\n"
247
+        "movq 8(%1), %%mm3\n"
248
+        "add %3, %0\n"
249
+        "add %3, %1\n"
250
+        "psubb %%mm2, %%mm0\n"
251
+        "psubb %%mm3, %%mm1\n"
252
+        "pxor %%mm7, %%mm0\n"
253
+        "pxor %%mm7, %%mm1\n"
254
+        "jmp 2f\n"
255
+        "1:\n"
256
+
257
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
258
+        "2:\n"
259
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
260
+
261
+        "subl $2, %%ecx\n"
262
+        "jnz 1b\n"
263
+
264
+        "movq %%mm6, %%mm0\n"
265
+        "psrlq $32, %%mm6\n"
266
+        "paddw %%mm6, %%mm0\n"
267
+        "movq %%mm0, %%mm6\n"
268
+        "psrlq $16, %%mm0\n"
269
+        "paddw %%mm6, %%mm0\n"
270
+        "movd %%mm0, %2\n"
271
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
272
+        : "r" ((x86_reg) line_size), "m" (h)
273
+        : "%ecx");
274
+
275
+    return tmp & 0x7FFF;
276
+}
277
+#undef SUM
278
+
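The inter variant measures the vertical activity of the residual rather than of the source: it forms the byte-wise difference pix1 - pix2 (the pcmpeqw/psllw/packsswb sequence builds a 0x80 bias so signed differences can be handled with unsigned saturating ops) and then sums absolute differences between consecutive rows of that residual. Scalar sketch, ignoring byte wrap-around on extreme inputs (the helper name is mine):

    /* Sum of |d[y][x] - d[y+1][x]| with d = pix1 - pix2, 16 columns wide. */
    static int vsad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                          int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++)
            for (int x = 0; x < 16; x++) {
                int d0 = pix1[y * line_size + x]       - pix2[y * line_size + x];
                int d1 = pix1[(y + 1) * line_size + x] - pix2[(y + 1) * line_size + x];
                int d  = d0 - d1;
                sum += d < 0 ? -d : d;
            }
        return sum & 0x7FFF;
    }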
279
+static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
280
+                         int line_size, int h)
281
+{
282
+    int tmp;
283
+
284
+    av_assert2((((int) pix1) & 7) == 0);
285
+    av_assert2((((int) pix2) & 7) == 0);
286
+    av_assert2((line_size & 7) == 0);
287
+
288
+#define SUM(in0, in1, out0, out1)               \
289
+    "movq (%0), " #out0 "\n"                    \
290
+    "movq (%1), %%mm2\n"                        \
291
+    "movq 8(%0), " #out1 "\n"                   \
292
+    "movq 8(%1), %%mm3\n"                       \
293
+    "add %3, %0\n"                              \
294
+    "add %3, %1\n"                              \
295
+    "psubb %%mm2, " #out0 "\n"                  \
296
+    "psubb %%mm3, " #out1 "\n"                  \
297
+    "pxor %%mm7, " #out0 "\n"                   \
298
+    "pxor %%mm7, " #out1 "\n"                   \
299
+    "psadbw " #out0 ", " #in0 "\n"              \
300
+    "psadbw " #out1 ", " #in1 "\n"              \
301
+    "paddw " #in1 ", " #in0 "\n"                \
302
+    "paddw " #in0 ", %%mm6\n    "
303
+
304
+    __asm__ volatile (
305
+        "movl %4, %%ecx\n"
306
+        "pxor %%mm6, %%mm6\n"
307
+        "pcmpeqw %%mm7, %%mm7\n"
308
+        "psllw $15, %%mm7\n"
309
+        "packsswb %%mm7, %%mm7\n"
310
+        "movq (%0), %%mm0\n"
311
+        "movq (%1), %%mm2\n"
312
+        "movq 8(%0), %%mm1\n"
313
+        "movq 8(%1), %%mm3\n"
314
+        "add %3, %0\n"
315
+        "add %3, %1\n"
316
+        "psubb %%mm2, %%mm0\n"
317
+        "psubb %%mm3, %%mm1\n"
318
+        "pxor %%mm7, %%mm0\n"
319
+        "pxor %%mm7, %%mm1\n"
320
+        "jmp 2f\n"
321
+        "1:\n"
322
+
323
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
324
+        "2:\n"
325
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
326
+
327
+        "subl $2, %%ecx\n"
328
+        "jnz 1b\n"
329
+
330
+        "movd %%mm6, %2\n"
331
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
332
+        : "r" ((x86_reg) line_size), "m" (h)
333
+        : "%ecx");
334
+
335
+    return tmp;
336
+}
337
+#undef SUM
338
+
339
+
340
+
341
+DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
342
+    0x0000000000000000ULL,
343
+    0x0001000100010001ULL,
344
+    0x0002000200020002ULL,
345
+};
346
+
347
+DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
348
+
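round_tab and bone are the rounding constants for the half-pel SAD variants further below: the _x2/_y2 kernels compare against (a + b + 1) >> 1 averages of horizontally or vertically adjacent reference pixels, and the _4/_xy2 kernels against (a + b + c + d + 2) >> 2 diagonal averages, which is where the per-word 1 and 2 values come from. A scalar sketch of the horizontal half-pel case (hypothetical helper; the argument order follows the inner helpers, so blk1 is the interpolated reference):

    /* SAD against the horizontally averaged reference, 8 columns wide. */
    static int sad8_x2_ref(const uint8_t *blk1, const uint8_t *blk2,
                           int stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < 8; x++) {
                int avg = (blk1[y * stride + x] + blk1[y * stride + x + 1] + 1) >> 1;
                int d   = avg - blk2[y * stride + x];
                sum += d < 0 ? -d : d;
            }
        return sum;
    }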
349
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
350
+{
351
+    x86_reg len = -(x86_reg)stride * h;
352
+    __asm__ volatile (
353
+        ".p2align 4                     \n\t"
354
+        "1:                             \n\t"
355
+        "movq (%1, %%"REG_a"), %%mm0    \n\t"
356
+        "movq (%2, %%"REG_a"), %%mm2    \n\t"
357
+        "movq (%2, %%"REG_a"), %%mm4    \n\t"
358
+        "add %3, %%"REG_a"              \n\t"
359
+        "psubusb %%mm0, %%mm2           \n\t"
360
+        "psubusb %%mm4, %%mm0           \n\t"
361
+        "movq (%1, %%"REG_a"), %%mm1    \n\t"
362
+        "movq (%2, %%"REG_a"), %%mm3    \n\t"
363
+        "movq (%2, %%"REG_a"), %%mm5    \n\t"
364
+        "psubusb %%mm1, %%mm3           \n\t"
365
+        "psubusb %%mm5, %%mm1           \n\t"
366
+        "por %%mm2, %%mm0               \n\t"
367
+        "por %%mm1, %%mm3               \n\t"
368
+        "movq %%mm0, %%mm1              \n\t"
369
+        "movq %%mm3, %%mm2              \n\t"
370
+        "punpcklbw %%mm7, %%mm0         \n\t"
371
+        "punpckhbw %%mm7, %%mm1         \n\t"
372
+        "punpcklbw %%mm7, %%mm3         \n\t"
373
+        "punpckhbw %%mm7, %%mm2         \n\t"
374
+        "paddw %%mm1, %%mm0             \n\t"
375
+        "paddw %%mm3, %%mm2             \n\t"
376
+        "paddw %%mm2, %%mm0             \n\t"
377
+        "paddw %%mm0, %%mm6             \n\t"
378
+        "add %3, %%"REG_a"              \n\t"
379
+        " js 1b                         \n\t"
380
+        : "+a" (len)
381
+        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
382
+}
383
+
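The plain SAD kernels in this group all compute the same quantity and differ only in block width, in how rows are walked, and in whether psadbw is available (mmxext/SSE2) or the absolute difference has to be built from saturating subtractions as in the MMX routine above. Scalar reference (a sketch; the helper name is mine):

    /* Plain sum of absolute differences over a w x h block (w = 8 or 16). */
    static int sad_ref(const uint8_t *blk1, const uint8_t *blk2,
                       int stride, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int d = blk1[y * stride + x] - blk2[y * stride + x];
                sum += d < 0 ? -d : d;
            }
        return sum;
    }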
384
+static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
385
+                                 int stride, int h)
386
+{
387
+    __asm__ volatile (
388
+        ".p2align 4                     \n\t"
389
+        "1:                             \n\t"
390
+        "movq (%1), %%mm0               \n\t"
391
+        "movq (%1, %3), %%mm1           \n\t"
392
+        "psadbw (%2), %%mm0             \n\t"
393
+        "psadbw (%2, %3), %%mm1         \n\t"
394
+        "paddw %%mm0, %%mm6             \n\t"
395
+        "paddw %%mm1, %%mm6             \n\t"
396
+        "lea (%1,%3,2), %1              \n\t"
397
+        "lea (%2,%3,2), %2              \n\t"
398
+        "sub $2, %0                     \n\t"
399
+        " jg 1b                         \n\t"
400
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
401
+        : "r" ((x86_reg) stride));
402
+}
403
+
404
+static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
405
+                      int stride, int h)
406
+{
407
+    int ret;
408
+    __asm__ volatile (
409
+        "pxor %%xmm2, %%xmm2            \n\t"
410
+        ".p2align 4                     \n\t"
411
+        "1:                             \n\t"
412
+        "movdqu (%1), %%xmm0            \n\t"
413
+        "movdqu (%1, %4), %%xmm1        \n\t"
414
+        "psadbw (%2), %%xmm0            \n\t"
415
+        "psadbw (%2, %4), %%xmm1        \n\t"
416
+        "paddw %%xmm0, %%xmm2           \n\t"
417
+        "paddw %%xmm1, %%xmm2           \n\t"
418
+        "lea (%1,%4,2), %1              \n\t"
419
+        "lea (%2,%4,2), %2              \n\t"
420
+        "sub $2, %0                     \n\t"
421
+        " jg 1b                         \n\t"
422
+        "movhlps %%xmm2, %%xmm0         \n\t"
423
+        "paddw   %%xmm0, %%xmm2         \n\t"
424
+        "movd    %%xmm2, %3             \n\t"
425
+        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
426
+        : "r" ((x86_reg) stride));
427
+    return ret;
428
+}
429
+
430
+static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
431
+                                   int stride, int h)
432
+{
433
+    __asm__ volatile (
434
+        ".p2align 4                     \n\t"
435
+        "1:                             \n\t"
436
+        "movq (%1), %%mm0               \n\t"
437
+        "movq (%1, %3), %%mm1           \n\t"
438
+        "pavgb 1(%1), %%mm0             \n\t"
439
+        "pavgb 1(%1, %3), %%mm1         \n\t"
440
+        "psadbw (%2), %%mm0             \n\t"
441
+        "psadbw (%2, %3), %%mm1         \n\t"
442
+        "paddw %%mm0, %%mm6             \n\t"
443
+        "paddw %%mm1, %%mm6             \n\t"
444
+        "lea (%1,%3,2), %1              \n\t"
445
+        "lea (%2,%3,2), %2              \n\t"
446
+        "sub $2, %0                     \n\t"
447
+        " jg 1b                         \n\t"
448
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
449
+        : "r" ((x86_reg) stride));
450
+}
451
+
452
+static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
453
+                                   int stride, int h)
454
+{
455
+    __asm__ volatile (
456
+        "movq (%1), %%mm0               \n\t"
457
+        "add %3, %1                     \n\t"
458
+        ".p2align 4                     \n\t"
459
+        "1:                             \n\t"
460
+        "movq (%1), %%mm1               \n\t"
461
+        "movq (%1, %3), %%mm2           \n\t"
462
+        "pavgb %%mm1, %%mm0             \n\t"
463
+        "pavgb %%mm2, %%mm1             \n\t"
464
+        "psadbw (%2), %%mm0             \n\t"
465
+        "psadbw (%2, %3), %%mm1         \n\t"
466
+        "paddw %%mm0, %%mm6             \n\t"
467
+        "paddw %%mm1, %%mm6             \n\t"
468
+        "movq %%mm2, %%mm0              \n\t"
469
+        "lea (%1,%3,2), %1              \n\t"
470
+        "lea (%2,%3,2), %2              \n\t"
471
+        "sub $2, %0                     \n\t"
472
+        " jg 1b                         \n\t"
473
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
474
+        : "r" ((x86_reg) stride));
475
+}
476
+
477
+static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
478
+                                 int stride, int h)
479
+{
480
+    __asm__ volatile (
481
+        "movq "MANGLE(bone)", %%mm5     \n\t"
482
+        "movq (%1), %%mm0               \n\t"
483
+        "pavgb 1(%1), %%mm0             \n\t"
484
+        "add %3, %1                     \n\t"
485
+        ".p2align 4                     \n\t"
486
+        "1:                             \n\t"
487
+        "movq (%1), %%mm1               \n\t"
488
+        "movq (%1,%3), %%mm2            \n\t"
489
+        "pavgb 1(%1), %%mm1             \n\t"
490
+        "pavgb 1(%1,%3), %%mm2          \n\t"
491
+        "psubusb %%mm5, %%mm1           \n\t"
492
+        "pavgb %%mm1, %%mm0             \n\t"
493
+        "pavgb %%mm2, %%mm1             \n\t"
494
+        "psadbw (%2), %%mm0             \n\t"
495
+        "psadbw (%2,%3), %%mm1          \n\t"
496
+        "paddw %%mm0, %%mm6             \n\t"
497
+        "paddw %%mm1, %%mm6             \n\t"
498
+        "movq %%mm2, %%mm0              \n\t"
499
+        "lea (%1,%3,2), %1              \n\t"
500
+        "lea (%2,%3,2), %2              \n\t"
501
+        "sub $2, %0                     \n\t"
502
+        " jg 1b                         \n\t"
503
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
504
+        : "r" ((x86_reg) stride)
505
+          NAMED_CONSTRAINTS_ADD(bone));
506
+}
507
+
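sad8_4 covers the diagonal (xy2) half-pel position. The exact interpolation is (a + b + c + d + 2) >> 2; the mmxext version above approximates it with two chained pavgb operations and a single psubusb of the bone constant (0x01 in every byte) to keep the accumulated rounding close to the exact formula, while the MMX version below computes it with word arithmetic and the round_tab constant. The value it targets, as a scalar sketch (hypothetical helper):

    /* SAD against the exact diagonal half-pel average, 8 columns wide. */
    static int sad8_xy2_ref(const uint8_t *blk1, const uint8_t *blk2,
                            int stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < 8; x++) {
                int a = blk1[y * stride + x];
                int b = blk1[y * stride + x + 1];
                int c = blk1[(y + 1) * stride + x];
                int d = blk1[(y + 1) * stride + x + 1];
                int avg  = (a + b + c + d + 2) >> 2;
                int diff = avg - blk2[y * stride + x];
                sum += diff < 0 ? -diff : diff;
            }
        return sum;
    }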
508
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
509
+                              int stride, int h)
510
+{
511
+    x86_reg len = -(x86_reg)stride * h;
512
+    __asm__ volatile (
513
+        ".p2align 4                     \n\t"
514
+        "1:                             \n\t"
515
+        "movq (%1, %%"REG_a"), %%mm0    \n\t"
516
+        "movq (%2, %%"REG_a"), %%mm1    \n\t"
517
+        "movq (%1, %%"REG_a"), %%mm2    \n\t"
518
+        "movq (%2, %%"REG_a"), %%mm3    \n\t"
519
+        "punpcklbw %%mm7, %%mm0         \n\t"
520
+        "punpcklbw %%mm7, %%mm1         \n\t"
521
+        "punpckhbw %%mm7, %%mm2         \n\t"
522
+        "punpckhbw %%mm7, %%mm3         \n\t"
523
+        "paddw %%mm0, %%mm1             \n\t"
524
+        "paddw %%mm2, %%mm3             \n\t"
525
+        "movq (%3, %%"REG_a"), %%mm4    \n\t"
526
+        "movq (%3, %%"REG_a"), %%mm2    \n\t"
527
+        "paddw %%mm5, %%mm1             \n\t"
528
+        "paddw %%mm5, %%mm3             \n\t"
529
+        "psrlw $1, %%mm1                \n\t"
530
+        "psrlw $1, %%mm3                \n\t"
531
+        "packuswb %%mm3, %%mm1          \n\t"
532
+        "psubusb %%mm1, %%mm4           \n\t"
533
+        "psubusb %%mm2, %%mm1           \n\t"
534
+        "por %%mm4, %%mm1               \n\t"
535
+        "movq %%mm1, %%mm0              \n\t"
536
+        "punpcklbw %%mm7, %%mm0         \n\t"
537
+        "punpckhbw %%mm7, %%mm1         \n\t"
538
+        "paddw %%mm1, %%mm0             \n\t"
539
+        "paddw %%mm0, %%mm6             \n\t"
540
+        "add %4, %%"REG_a"              \n\t"
541
+        " js 1b                         \n\t"
542
+        : "+a" (len)
543
+        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
544
+          "r" ((x86_reg) stride));
545
+}
546
+
547
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
548
+{
549
+    x86_reg len = -(x86_reg)stride * h;
550
+    __asm__ volatile (
551
+        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
552
+        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
553
+        "movq %%mm0, %%mm1              \n\t"
554
+        "movq %%mm2, %%mm3              \n\t"
555
+        "punpcklbw %%mm7, %%mm0         \n\t"
556
+        "punpckhbw %%mm7, %%mm1         \n\t"
557
+        "punpcklbw %%mm7, %%mm2         \n\t"
558
+        "punpckhbw %%mm7, %%mm3         \n\t"
559
+        "paddw %%mm2, %%mm0             \n\t"
560
+        "paddw %%mm3, %%mm1             \n\t"
561
+        ".p2align 4                     \n\t"
562
+        "1:                             \n\t"
563
+        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
564
+        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
565
+        "movq %%mm2, %%mm3              \n\t"
566
+        "movq %%mm4, %%mm5              \n\t"
567
+        "punpcklbw %%mm7, %%mm2         \n\t"
568
+        "punpckhbw %%mm7, %%mm3         \n\t"
569
+        "punpcklbw %%mm7, %%mm4         \n\t"
570
+        "punpckhbw %%mm7, %%mm5         \n\t"
571
+        "paddw %%mm4, %%mm2             \n\t"
572
+        "paddw %%mm5, %%mm3             \n\t"
573
+        "movq %5, %%mm5                 \n\t"
574
+        "paddw %%mm2, %%mm0             \n\t"
575
+        "paddw %%mm3, %%mm1             \n\t"
576
+        "paddw %%mm5, %%mm0             \n\t"
577
+        "paddw %%mm5, %%mm1             \n\t"
578
+        "movq (%3, %%"REG_a"), %%mm4    \n\t"
579
+        "movq (%3, %%"REG_a"), %%mm5    \n\t"
580
+        "psrlw $2, %%mm0                \n\t"
581
+        "psrlw $2, %%mm1                \n\t"
582
+        "packuswb %%mm1, %%mm0          \n\t"
583
+        "psubusb %%mm0, %%mm4           \n\t"
584
+        "psubusb %%mm5, %%mm0           \n\t"
585
+        "por %%mm4, %%mm0               \n\t"
586
+        "movq %%mm0, %%mm4              \n\t"
587
+        "punpcklbw %%mm7, %%mm0         \n\t"
588
+        "punpckhbw %%mm7, %%mm4         \n\t"
589
+        "paddw %%mm0, %%mm6             \n\t"
590
+        "paddw %%mm4, %%mm6             \n\t"
591
+        "movq  %%mm2, %%mm0             \n\t"
592
+        "movq  %%mm3, %%mm1             \n\t"
593
+        "add %4, %%"REG_a"              \n\t"
594
+        " js 1b                         \n\t"
595
+        : "+a" (len)
596
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
597
+          "r" ((x86_reg) stride), "m" (round_tab[2]));
598
+}
599
+
600
+static inline int sum_mmx(void)
601
+{
602
+    int ret;
603
+    __asm__ volatile (
604
+        "movq %%mm6, %%mm0              \n\t"
605
+        "psrlq $32, %%mm6               \n\t"
606
+        "paddw %%mm0, %%mm6             \n\t"
607
+        "movq %%mm6, %%mm0              \n\t"
608
+        "psrlq $16, %%mm6               \n\t"
609
+        "paddw %%mm0, %%mm6             \n\t"
610
+        "movd %%mm6, %0                 \n\t"
611
+        : "=r" (ret));
612
+    return ret & 0xFFFF;
613
+}
614
+
615
+static inline int sum_mmxext(void)
616
+{
617
+    int ret;
618
+    __asm__ volatile (
619
+        "movd %%mm6, %0                 \n\t"
620
+        : "=r" (ret));
621
+    return ret;
622
+}
623
+
624
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
625
+{
626
+    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
627
+}
628
+
629
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
630
+{
631
+    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
632
+}
633
+
634
+#define PIX_SAD(suf)                                                    \
635
+static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
636
+                        uint8_t *blk1, int stride, int h)               \
637
+{                                                                       \
638
+    av_assert2(h == 8);                                                     \
639
+    __asm__ volatile (                                                  \
640
+        "pxor %%mm7, %%mm7     \n\t"                                    \
641
+        "pxor %%mm6, %%mm6     \n\t"                                    \
642
+        :);                                                             \
643
+                                                                        \
644
+    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
645
+                                                                        \
646
+    return sum_ ## suf();                                               \
647
+}                                                                       \
648
+                                                                        \
649
+static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
650
+                           uint8_t *blk1, int stride, int h)            \
651
+{                                                                       \
652
+    av_assert2(h == 8);                                                     \
653
+    __asm__ volatile (                                                  \
654
+        "pxor %%mm7, %%mm7     \n\t"                                    \
655
+        "pxor %%mm6, %%mm6     \n\t"                                    \
656
+        "movq %0, %%mm5        \n\t"                                    \
657
+        :: "m" (round_tab[1]));                                         \
658
+                                                                        \
659
+    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
660
+                                                                        \
661
+    return sum_ ## suf();                                               \
662
+}                                                                       \
663
+                                                                        \
664
+static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
665
+                           uint8_t *blk1, int stride, int h)            \
666
+{                                                                       \
667
+    av_assert2(h == 8);                                                     \
668
+    __asm__ volatile (                                                  \
669
+        "pxor %%mm7, %%mm7     \n\t"                                    \
670
+        "pxor %%mm6, %%mm6     \n\t"                                    \
671
+        "movq %0, %%mm5        \n\t"                                    \
672
+        :: "m" (round_tab[1]));                                         \
673
+                                                                        \
674
+    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
675
+                                                                        \
676
+    return sum_ ## suf();                                               \
677
+}                                                                       \
678
+                                                                        \
679
+static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
680
+                            uint8_t *blk1, int stride, int h)           \
681
+{                                                                       \
682
+    av_assert2(h == 8);                                                     \
683
+    __asm__ volatile (                                                  \
684
+        "pxor %%mm7, %%mm7     \n\t"                                    \
685
+        "pxor %%mm6, %%mm6     \n\t"                                    \
686
+        ::);                                                            \
687
+                                                                        \
688
+    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
689
+                                                                        \
690
+    return sum_ ## suf();                                               \
691
+}                                                                       \
692
+                                                                        \
693
+static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
694
+                         uint8_t *blk1, int stride, int h)              \
695
+{                                                                       \
696
+    __asm__ volatile (                                                  \
697
+        "pxor %%mm7, %%mm7     \n\t"                                    \
698
+        "pxor %%mm6, %%mm6     \n\t"                                    \
699
+        :);                                                             \
700
+                                                                        \
701
+    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
702
+    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
703
+                                                                        \
704
+    return sum_ ## suf();                                               \
705
+}                                                                       \
706
+                                                                        \
707
+static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
708
+                            uint8_t *blk1, int stride, int h)           \
709
+{                                                                       \
710
+    __asm__ volatile (                                                  \
711
+        "pxor %%mm7, %%mm7     \n\t"                                    \
712
+        "pxor %%mm6, %%mm6     \n\t"                                    \
713
+        "movq %0, %%mm5        \n\t"                                    \
714
+        :: "m" (round_tab[1]));                                         \
715
+                                                                        \
716
+    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
717
+    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
718
+                                                                        \
719
+    return sum_ ## suf();                                               \
720
+}                                                                       \
721
+                                                                        \
722
+static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
723
+                            uint8_t *blk1, int stride, int h)           \
724
+{                                                                       \
725
+    __asm__ volatile (                                                  \
726
+        "pxor %%mm7, %%mm7     \n\t"                                    \
727
+        "pxor %%mm6, %%mm6     \n\t"                                    \
728
+        "movq %0, %%mm5        \n\t"                                    \
729
+        :: "m" (round_tab[1]));                                         \
730
+                                                                        \
731
+    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
732
+    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
733
+                                                                        \
734
+    return sum_ ## suf();                                               \
735
+}                                                                       \
736
+                                                                        \
737
+static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
738
+                             uint8_t *blk1, int stride, int h)          \
739
+{                                                                       \
740
+    __asm__ volatile (                                                  \
741
+        "pxor %%mm7, %%mm7     \n\t"                                    \
742
+        "pxor %%mm6, %%mm6     \n\t"                                    \
743
+        ::);                                                            \
744
+                                                                        \
745
+    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
746
+    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
747
+                                                                        \
748
+    return sum_ ## suf();                                               \
749
+}                                                                       \
750
+
751
+PIX_SAD(mmx)
752
+PIX_SAD(mmxext)
753
+
754
+#endif /* HAVE_INLINE_ASM */
755
+
756
+av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
757
+{
758
+    int cpu_flags = av_get_cpu_flags();
759
+
760
+#if HAVE_INLINE_ASM
761
+    if (INLINE_MMX(cpu_flags)) {
762
+        c->pix_abs[0][0] = sad16_mmx;
763
+        c->pix_abs[0][1] = sad16_x2_mmx;
764
+        c->pix_abs[0][2] = sad16_y2_mmx;
765
+        c->pix_abs[0][3] = sad16_xy2_mmx;
766
+        c->pix_abs[1][0] = sad8_mmx;
767
+        c->pix_abs[1][1] = sad8_x2_mmx;
768
+        c->pix_abs[1][2] = sad8_y2_mmx;
769
+        c->pix_abs[1][3] = sad8_xy2_mmx;
770
+
771
+        c->sad[0] = sad16_mmx;
772
+        c->sad[1] = sad8_mmx;
773
+
774
+        c->vsad[4] = vsad_intra16_mmx;
775
+
776
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
777
+            c->vsad[0] = vsad16_mmx;
778
+        }
779
+    }
780
+
781
+    if (INLINE_MMXEXT(cpu_flags)) {
782
+        c->vsad[4] = vsad_intra16_mmxext;
783
+
784
+        c->pix_abs[0][0] = sad16_mmxext;
785
+        c->pix_abs[1][0] = sad8_mmxext;
786
+
787
+        c->sad[0] = sad16_mmxext;
788
+        c->sad[1] = sad8_mmxext;
789
+
790
+        c->pix_abs[0][1] = sad16_x2_mmxext;
791
+        c->pix_abs[0][2] = sad16_y2_mmxext;
792
+        c->pix_abs[1][1] = sad8_x2_mmxext;
793
+        c->pix_abs[1][2] = sad8_y2_mmxext;
794
+
795
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
796
+            c->pix_abs[0][3] = sad16_xy2_mmxext;
797
+            c->pix_abs[1][3] = sad8_xy2_mmxext;
798
+
799
+            c->vsad[0] = vsad16_mmxext;
800
+        }
801
+    }
802
+
803
+    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
804
+        c->sad[0] = sad16_sse2;
805
+    }
806
+
807
+#endif /* HAVE_INLINE_ASM */
808
+
809
+    if (EXTERNAL_MMX(cpu_flags)) {
810
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
811
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
812
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
813
+        c->sse[0]            = ff_sse16_mmx;
814
+        c->sse[1]            = ff_sse8_mmx;
815
+#if HAVE_YASM
816
+        c->nsse[0]           = nsse16_mmx;
817
+        c->nsse[1]           = nsse8_mmx;
818
+#endif
819
+    }
820
+
821
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
822
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
823
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
824
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
825
+    }
826
+
827
+    if (EXTERNAL_SSE2(cpu_flags)) {
828
+        c->sse[0] = ff_sse16_sse2;
829
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
830
+
831
+#if HAVE_ALIGNED_STACK
832
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
833
+        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
834
+#endif
835
+    }
836
+
837
+    if (EXTERNAL_SSSE3(cpu_flags)) {
838
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
839
+#if HAVE_ALIGNED_STACK
840
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
841
+        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
842
+#endif
843
+    }
844
+}
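
The initialization above only installs function pointers; the motion-estimation code then selects one of the four pix_abs entries per block size according to the half-pel offset being tested. A minimal sketch of that selection, inferred from the _x2/_y2/_xy2 suffixes and the table indices used above (the helper name and its parameters are hypothetical, not part of this patch):

#include "libavcodec/me_cmp.h"

/* Hypothetical helper, for illustration only.
 * pix_abs[size][hp]: size 0 = 16x16, 1 = 8x8;
 * hp 0 = full-pel, 1 = horizontal half-pel (x2),
 *    2 = vertical half-pel (y2), 3 = both (xy2). */
static inline me_cmp_func pick_pix_abs(const MECmpContext *c, int is_8x8,
                                       int mx_half, int my_half)
{
    return c->pix_abs[is_8x8][(mx_half ? 1 : 0) + (my_half ? 2 : 0)];
}

Note also which assignments are gated on CODEC_FLAG_BITEXACT: the pavgb-based xy2 half-pel SADs and the vsad16 routines are skipped for bit-exact encodes, presumably because they can round differently from the C reference.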
... ...
@@ -24,7 +24,7 @@
24 24
 
25 25
 #include "config.h"
26 26
 #include "avfilter.h"
27
-#include "libavcodec/dsputil.h"
27
+#include "libavcodec/me_cmp.h"
28 28
 #include "transform.h"
29 29
 #if CONFIG_OPENCL
30 30
 #include "libavutil/opencl.h"
... ...
@@ -81,7 +81,7 @@ typedef struct {
81 81
     int contrast;              ///< Contrast threshold
82 82
     int search;                ///< Motion search method
83 83
     AVCodecContext *avctx;
84
-    DSPContext c;              ///< Context providing optimized SAD methods
84
+    MECmpContext c;            ///< Context providing optimized SAD methods
85 85
     Transform last;            ///< Transform from last frame
86 86
     int refcount;              ///< Number of reference frames (defines averaging window)
87 87
     FILE *fp;
... ...
@@ -35,7 +35,7 @@
35 35
 #include "video.h"
36 36
 
37 37
 #if CONFIG_AVCODEC
38
-#include "libavcodec/dsputil.h"
38
+#include "libavcodec/me_cmp.h"
39 39
 #endif
40 40
 
41 41
 static const char *const var_names[] = {
... ...
@@ -146,7 +146,7 @@ typedef struct SelectContext {
146 146
     int do_scene_detect;            ///< 1 if the expression requires scene detection variables, 0 otherwise
147 147
 #if CONFIG_AVCODEC
148 148
     AVCodecContext *avctx;          ///< codec context required for the DSPContext (scene detect only)
149
-    DSPContext c;                   ///< context providing optimized SAD methods   (scene detect only)
149
+    MECmpContext c;                 ///< context providing optimized SAD methods   (scene detect only)
150 150
     double prev_mafd;               ///< previous MAFD                             (scene detect only)
151 151
 #endif
152 152
     AVFrame *prev_picref; ///< previous frame                            (scene detect only)
... ...
@@ -245,7 +245,7 @@ static int config_input(AVFilterLink *inlink)
245 245
         select->avctx = avcodec_alloc_context3(NULL);
246 246
         if (!select->avctx)
247 247
             return AVERROR(ENOMEM);
248
-        avpriv_dsputil_init(&select->c, select->avctx);
248
+        ff_me_cmp_init(&select->c, select->avctx);
249 249
     }
250 250
 #endif
251 251
     return 0;
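
For context, the MECmpContext initialized here only serves the optional scene-detection score. A simplified illustration of that kind of per-pixel mean-absolute-difference measure, using the 8x8 SAD routine selected above — the function name, the 8x8 granularity, and the single shared linesize are illustrative assumptions, not the filter's actual loop:

#include <stdint.h>
#include "libavcodec/me_cmp.h"

/* Hypothetical sketch: mean absolute luma difference between two planes
 * that share a linesize. The MpegEncContext argument of the SAD routines
 * is unused by the plain SAD implementations, hence NULL. */
static double plane_mafd(MECmpContext *c, uint8_t *prev, uint8_t *cur,
                         int linesize, int w, int h)
{
    uint64_t sad = 0;
    int x, y;

    for (y = 0; y <= h - 8; y += 8)
        for (x = 0; x <= w - 8; x += 8)
            sad += c->sad[1](NULL, prev + y * linesize + x,
                                   cur  + y * linesize + x, linesize, 8);

    return (w >= 8 && h >= 8) ? (double)sad / ((double)(w & ~7) * (h & ~7)) : 0.0;
}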
... ...
@@ -53,7 +53,6 @@
53 53
 #include "libavutil/mem.h"
54 54
 #include "libavutil/x86/asm.h"
55 55
 #include "libavcodec/avcodec.h"
56
-#include "libavcodec/dsputil.h"
57 56
 
58 57
 #undef free
59 58
 #undef malloc
... ...
@@ -57,7 +57,7 @@
57 57
 #include "libavutil/mem.h"
58 58
 #include "libavutil/opt.h"
59 59
 #include "libavutil/pixdesc.h"
60
-#include "libavcodec/dsputil.h"
60
+#include "libavcodec/me_cmp.h"
61 61
 
62 62
 #include "deshake.h"
63 63
 #include "deshake_opencl.h"
... ...
@@ -414,7 +414,7 @@ static int config_props(AVFilterLink *link)
414 414
     deshake->last.zoom = 0;
415 415
 
416 416
     deshake->avctx = avcodec_alloc_context3(NULL);
417
-    avpriv_dsputil_init(&deshake->c, deshake->avctx);
417
+    ff_me_cmp_init(&deshake->c, deshake->avctx);
418 418
 
419 419
     return 0;
420 420
 }
... ...
@@ -27,7 +27,7 @@
27 27
 #include "libavutil/opt.h"
28 28
 #include "libavutil/pixdesc.h"
29 29
 #include "libavutil/timestamp.h"
30
-#include "libavcodec/dsputil.h"
30
+#include "libavcodec/me_cmp.h"
31 31
 #include "libavcodec/pixblockdsp.h"
32 32
 #include "avfilter.h"
33 33
 #include "internal.h"
... ...
@@ -49,7 +49,7 @@ typedef struct {
49 49
 
50 50
     int hsub, vsub;                ///< chroma subsampling values
51 51
     AVFrame *ref;                  ///< reference picture
52
-    DSPContext dspctx;             ///< context providing optimized diff routines
52
+    MECmpContext mecc;             ///< context providing optimized diff routines
53 53
     PixblockDSPContext pdsp;
54 54
     AVCodecContext *avctx;         ///< codec context required for the DSPContext
55 55
 } DecimateContext;
... ...
@@ -76,7 +76,7 @@ static int diff_planes(AVFilterContext *ctx,
76 76
                        int w, int h)
77 77
 {
78 78
     DecimateContext *decimate = ctx->priv;
79
-    DSPContext *dspctx = &decimate->dspctx;
79
+    MECmpContext *mecc = &decimate->mecc;
80 80
     PixblockDSPContext *pdsp = &decimate->pdsp;
81 81
 
82 82
     int x, y;
... ...
@@ -90,7 +90,7 @@ static int diff_planes(AVFilterContext *ctx,
90 90
             pdsp->diff_pixels(block,
91 91
                                 cur+x+y*linesize,
92 92
                                 ref+x+y*linesize, linesize);
93
-            d = dspctx->sum_abs_dctelem(block);
93
+            d = mecc->sum_abs_dctelem(block);
94 94
             if (d > decimate->hi)
95 95
                 return 1;
96 96
             if (d > decimate->lo) {
... ...
@@ -143,7 +143,7 @@ static av_cold int init(AVFilterContext *ctx)
143 143
     decimate->avctx = avcodec_alloc_context3(NULL);
144 144
     if (!decimate->avctx)
145 145
         return AVERROR(ENOMEM);
146
-    avpriv_dsputil_init(&decimate->dspctx, decimate->avctx);
146
+    ff_me_cmp_init(&decimate->mecc, decimate->avctx);
147 147
     ff_pixblockdsp_init(&decimate->pdsp, decimate->avctx);
148 148
 
149 149
     return 0;
... ...
@@ -31,7 +31,6 @@
31 31
  * ported by Clément Bœsch for FFmpeg.
32 32
  */
33 33
 
34
-#include "libavcodec/dsputil.h"
35 34
 #include "libavutil/avassert.h"
36 35
 #include "libavutil/imgutils.h"
37 36
 #include "libavutil/opt.h"