
dsputil: Split motion estimation compare bits off into their own context

Diego Biurrun authored on 2014/02/08 10:59:58
38 changed files
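Note for readers skimming the diff below: the substance of the change is that the motion-estimation compare functions (sad, sse, hadamard8_diff, pix_abs, ...) move out of DSPContext into a new MECmpContext filled in by ff_me_cmp_init(). A minimal before/after sketch of a caller, using only names visible in this diff; the surrounding struct and functions are hypothetical, not part of the commit:

    #include <stdint.h>
    #include "avcodec.h"
    #include "me_cmp.h"

    typedef struct MyEncContext {
        MECmpContext mecc;      /* previously: DSPContext dsp; */
    } MyEncContext;

    static void my_enc_init(MyEncContext *s, AVCodecContext *avctx)
    {
        /* previously: ff_dsputil_init(&s->dsp, avctx); */
        ff_me_cmp_init(&s->mecc, avctx);
    }

    static int my_block_sad(MyEncContext *s, uint8_t *cur, uint8_t *prev,
                            int stride)
    {
        /* previously: s->dsp.sad[0](...); same signature, new home */
        return s->mecc.sad[0](NULL, cur, prev, stride, 16);
    }

The real callers updated this way in the commit are AC3EncodeContext (ac3enc.c/h below) and the MpegEncContext-based encoders.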
... ...
@@ -1533,7 +1533,6 @@ CONFIG_EXTRA="
     blockdsp
     bswapdsp
     cabac
-    dsputil
     dvprofile
     fdctdsp
     gcrypt
... ...
@@ -1552,6 +1551,7 @@ CONFIG_EXTRA="
     intrax8
     lgplv3
     lpc
+    me_cmp
     mpeg_er
     mpegaudio
     mpegaudiodsp
... ...
@@ -1707,24 +1707,24 @@ threads_if_any="$THREADS_LIST"
 
 # subsystems
 dct_select="rdft"
-dsputil_select="fdctdsp idctdsp pixblockdsp"
-error_resilience_select="dsputil"
+error_resilience_select="me_cmp"
 intrax8_select="error_resilience"
 mdct_select="fft"
 rdft_select="fft"
+me_cmp_select="fdctdsp idctdsp pixblockdsp"
 mpeg_er_select="error_resilience"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
-mpegvideo_select="blockdsp dsputil hpeldsp idctdsp videodsp"
-mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp"
+mpegvideo_select="blockdsp hpeldsp idctdsp me_cmp videodsp"
+mpegvideoenc_select="me_cmp mpegvideo pixblockdsp qpeldsp"
 
 # decoders / encoders
 aac_decoder_select="mdct sinewin"
 aac_encoder_select="audio_frame_queue mdct sinewin"
 aac_latm_decoder_select="aac_decoder aac_latm_parser"
 ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct"
-ac3_encoder_select="ac3dsp audiodsp dsputil mdct"
-ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct"
+ac3_encoder_select="ac3dsp audiodsp mdct me_cmp"
+ac3_fixed_encoder_select="ac3dsp audiodsp mdct me_cmp"
 aic_decoder_select="golomb idctdsp"
 alac_encoder_select="lpc"
 als_decoder_select="bswapdsp"
... ...
@@ -1752,7 +1752,7 @@ dca_decoder_select="mdct"
 dnxhd_decoder_select="blockdsp idctdsp"
 dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp"
 dvvideo_decoder_select="dvprofile idctdsp"
-dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp"
+dvvideo_encoder_select="dvprofile fdctdsp me_cmp pixblockdsp"
 dxa_decoder_deps="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
... ...
@@ -1856,7 +1856,7 @@ shorten_decoder_select="golomb"
 sipr_decoder_select="lsp"
 sp5x_decoder_select="mjpeg_decoder"
 svq1_decoder_select="hpeldsp"
-svq1_encoder_select="aandcttables dsputil hpeldsp mpegvideoenc"
+svq1_encoder_select="aandcttables hpeldsp me_cmp mpegvideoenc"
 svq3_decoder_select="h264_decoder hpeldsp tpeldsp"
 svq3_decoder_suggest="zlib"
 tak_decoder_select="audiodsp"
... ...
@@ -35,7 +35,6 @@ OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
-OBJS-$(CONFIG_DSPUTIL)                 += dsputil.o
 OBJS-$(CONFIG_DXVA2)                   += dxva2.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
 OBJS-$(CONFIG_FDCTDSP)                 += fdctdsp.o faandct.o           \
... ...
@@ -60,6 +59,7 @@ OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o mpegaudiodata.o   \
                                           mpegaudiodecheader.o
... ...
@@ -36,6 +36,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
... ...
@@ -379,7 +380,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
                 exp_strategy[blk] = EXP_NEW;
                 continue;
             }
-            exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
+            exp_diff = s->mecc.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
             exp_strategy[blk] = EXP_REUSE;
             if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS))
                 exp_strategy[blk] = EXP_NEW;
... ...
@@ -2482,7 +2483,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
         goto init_fail;
 
     ff_audiodsp_init(&s->adsp);
-    ff_dsputil_init(&s->dsp, avctx);
+    ff_me_cmp_init(&s->mecc, avctx);
     ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     dprint_options(s);
... ...
@@ -35,9 +35,9 @@
 #include "ac3.h"
 #include "ac3dsp.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "fft.h"
 #include "mathops.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 
... ...
@@ -162,9 +162,9 @@ typedef struct AC3EncodeContext {
     AC3EncOptions options;                  ///< encoding options
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
-    DSPContext dsp;
     AudioDSPContext adsp;
     AVFloatDSPContext fdsp;
+    MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
     const SampleType *mdct_window;          ///< MDCT window function array
... ...
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                           arm/ac3dsp_arm.o
 OBJS-$(CONFIG_AUDIODSP)                += arm/audiodsp_init_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
-OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o
 OBJS-$(CONFIG_FFT)                     += arm/fft_init_arm.o            \
                                           arm/fft_fixed_init_arm.o
 OBJS-$(CONFIG_H264CHROMA)              += arm/h264chroma_init_arm.o
... ...
@@ -19,6 +18,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/idctdsp_arm.o             \
                                           arm/jrevdct_arm.o             \
                                           arm/simple_idct_arm.o
+OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
... ...
@@ -53,13 +53,13 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP)        += arm/videodsp_init_armv5te.o   \
 ARMV5TE-OBJS-$(CONFIG_MLP_DECODER)     += arm/mlpdsp_armv5te.o
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_armv6.o
 ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                           arm/hpeldsp_armv6.o
 ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                           arm/idctdsp_armv6.o           \
                                           arm/simple_idct_armv6.o
+ARMV6-OBJS-$(CONFIG_ME_CMP)            += arm/me_cmp_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)       += arm/pixblockdsp_armv6.o
deleted file mode 100644
... ...
@@ -1,244 +0,0 @@
1
-/*
2
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3
- *
4
- * This file is part of Libav.
5
- *
6
- * Libav is free software; you can redistribute it and/or
7
- * modify it under the terms of the GNU Lesser General Public
8
- * License as published by the Free Software Foundation; either
9
- * version 2.1 of the License, or (at your option) any later version.
10
- *
11
- * Libav is distributed in the hope that it will be useful,
12
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
- * Lesser General Public License for more details.
15
- *
16
- * You should have received a copy of the GNU Lesser General Public
17
- * License along with Libav; if not, write to the Free Software
18
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
- */
20
-
21
-#include "libavutil/arm/asm.S"
22
-
23
-function ff_pix_abs16_armv6, export=1
24
-        ldr             r0,  [sp]
25
-        push            {r4-r9, lr}
26
-        mov             r12, #0
27
-        mov             lr,  #0
28
-        ldm             r1,  {r4-r7}
29
-        ldr             r8,  [r2]
30
-1:
31
-        ldr             r9,  [r2, #4]
32
-        pld             [r1, r3]
33
-        usada8          r12, r4,  r8,  r12
34
-        ldr             r8,  [r2, #8]
35
-        pld             [r2, r3]
36
-        usada8          lr,  r5,  r9,  lr
37
-        ldr             r9,  [r2, #12]
38
-        usada8          r12, r6,  r8,  r12
39
-        subs            r0,  r0,  #1
40
-        usada8          lr,  r7,  r9,  lr
41
-        beq             2f
42
-        add             r1,  r1,  r3
43
-        ldm             r1,  {r4-r7}
44
-        add             r2,  r2,  r3
45
-        ldr             r8,  [r2]
46
-        b               1b
47
-2:
48
-        add             r0,  r12, lr
49
-        pop             {r4-r9, pc}
50
-endfunc
51
-
52
-function ff_pix_abs16_x2_armv6, export=1
53
-        ldr             r12, [sp]
54
-        push            {r4-r11, lr}
55
-        mov             r0,  #0
56
-        mov             lr,  #1
57
-        orr             lr,  lr,  lr,  lsl #8
58
-        orr             lr,  lr,  lr,  lsl #16
59
-1:
60
-        ldr             r8,  [r2]
61
-        ldr             r9,  [r2, #4]
62
-        lsr             r10, r8,  #8
63
-        ldr             r4,  [r1]
64
-        lsr             r6,  r9,  #8
65
-        orr             r10, r10, r9,  lsl #24
66
-        ldr             r5,  [r2, #8]
67
-        eor             r11, r8,  r10
68
-        uhadd8          r7,  r8,  r10
69
-        orr             r6,  r6,  r5,  lsl #24
70
-        and             r11, r11, lr
71
-        uadd8           r7,  r7,  r11
72
-        ldr             r8,  [r1, #4]
73
-        usada8          r0,  r4,  r7,  r0
74
-        eor             r7,  r9,  r6
75
-        lsr             r10, r5,  #8
76
-        and             r7,  r7,  lr
77
-        uhadd8          r4,  r9,  r6
78
-        ldr             r6,  [r2, #12]
79
-        uadd8           r4,  r4,  r7
80
-        pld             [r1, r3]
81
-        orr             r10, r10, r6,  lsl #24
82
-        usada8          r0,  r8,  r4,  r0
83
-        ldr             r4,  [r1, #8]
84
-        eor             r11, r5,  r10
85
-        ldrb            r7,  [r2, #16]
86
-        and             r11, r11, lr
87
-        uhadd8          r8,  r5,  r10
88
-        ldr             r5,  [r1, #12]
89
-        uadd8           r8,  r8,  r11
90
-        pld             [r2, r3]
91
-        lsr             r10, r6,  #8
92
-        usada8          r0,  r4,  r8,  r0
93
-        orr             r10, r10, r7,  lsl #24
94
-        subs            r12,  r12,  #1
95
-        eor             r11, r6,  r10
96
-        add             r1,  r1,  r3
97
-        uhadd8          r9,  r6,  r10
98
-        and             r11, r11, lr
99
-        uadd8           r9,  r9,  r11
100
-        add             r2,  r2,  r3
101
-        usada8          r0,  r5,  r9,  r0
102
-        bgt             1b
103
-
104
-        pop             {r4-r11, pc}
105
-endfunc
106
-
107
-.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
108
-        ldr             \n0, [r2]
109
-        eor             \n1, \p0, \n0
110
-        uhadd8          \p0, \p0, \n0
111
-        and             \n1, \n1, lr
112
-        ldr             \n2, [r1]
113
-        uadd8           \p0, \p0, \n1
114
-        ldr             \n1, [r2, #4]
115
-        usada8          r0,  \p0, \n2, r0
116
-        pld             [r1,  r3]
117
-        eor             \n3, \p1, \n1
118
-        uhadd8          \p1, \p1, \n1
119
-        and             \n3, \n3, lr
120
-        ldr             \p0, [r1, #4]
121
-        uadd8           \p1, \p1, \n3
122
-        ldr             \n2, [r2, #8]
123
-        usada8          r0,  \p1, \p0, r0
124
-        pld             [r2,  r3]
125
-        eor             \p0, \p2, \n2
126
-        uhadd8          \p2, \p2, \n2
127
-        and             \p0, \p0, lr
128
-        ldr             \p1, [r1, #8]
129
-        uadd8           \p2, \p2, \p0
130
-        ldr             \n3, [r2, #12]
131
-        usada8          r0,  \p2, \p1, r0
132
-        eor             \p1, \p3, \n3
133
-        uhadd8          \p3, \p3, \n3
134
-        and             \p1, \p1, lr
135
-        ldr             \p0,  [r1, #12]
136
-        uadd8           \p3, \p3, \p1
137
-        add             r1,  r1,  r3
138
-        usada8          r0,  \p3, \p0,  r0
139
-        add             r2,  r2,  r3
140
-.endm
141
-
142
-function ff_pix_abs16_y2_armv6, export=1
143
-        pld             [r1]
144
-        pld             [r2]
145
-        ldr             r12, [sp]
146
-        push            {r4-r11, lr}
147
-        mov             r0,  #0
148
-        mov             lr,  #1
149
-        orr             lr,  lr,  lr,  lsl #8
150
-        orr             lr,  lr,  lr,  lsl #16
151
-        ldr             r4,  [r2]
152
-        ldr             r5,  [r2, #4]
153
-        ldr             r6,  [r2, #8]
154
-        ldr             r7,  [r2, #12]
155
-        add             r2,  r2,  r3
156
-1:
157
-        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
158
-        subs            r12, r12, #2
159
-        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
160
-        bgt             1b
161
-
162
-        pop             {r4-r11, pc}
163
-endfunc
164
-
165
-function ff_pix_abs8_armv6, export=1
166
-        pld             [r2, r3]
167
-        ldr             r12, [sp]
168
-        push            {r4-r9, lr}
169
-        mov             r0,  #0
170
-        mov             lr,  #0
171
-        ldrd_post       r4,  r5,  r1,  r3
172
-1:
173
-        subs            r12, r12, #2
174
-        ldr             r7,  [r2, #4]
175
-        ldr_post        r6,  r2,  r3
176
-        ldrd_post       r8,  r9,  r1,  r3
177
-        usada8          r0,  r4,  r6,  r0
178
-        pld             [r2, r3]
179
-        usada8          lr,  r5,  r7,  lr
180
-        ldr             r7,  [r2, #4]
181
-        ldr_post        r6,  r2,  r3
182
-        beq             2f
183
-        ldrd_post       r4,  r5,  r1,  r3
184
-        usada8          r0,  r8,  r6,  r0
185
-        pld             [r2, r3]
186
-        usada8          lr,  r9,  r7,  lr
187
-        b               1b
188
-2:
189
-        usada8          r0,  r8,  r6,  r0
190
-        usada8          lr,  r9,  r7,  lr
191
-        add             r0,  r0,  lr
192
-        pop             {r4-r9, pc}
193
-endfunc
194
-
195
-function ff_sse16_armv6, export=1
196
-        ldr             r12, [sp]
197
-        push            {r4-r9, lr}
198
-        mov             r0,  #0
199
-1:
200
-        ldrd            r4,  r5,  [r1]
201
-        ldr             r8,  [r2]
202
-        uxtb16          lr,  r4
203
-        uxtb16          r4,  r4,  ror #8
204
-        uxtb16          r9,  r8
205
-        uxtb16          r8,  r8,  ror #8
206
-        ldr             r7,  [r2, #4]
207
-        usub16          lr,  lr,  r9
208
-        usub16          r4,  r4,  r8
209
-        smlad           r0,  lr,  lr,  r0
210
-        uxtb16          r6,  r5
211
-        uxtb16          lr,  r5,  ror #8
212
-        uxtb16          r8,  r7
213
-        uxtb16          r9,  r7,  ror #8
214
-        smlad           r0,  r4,  r4,  r0
215
-        ldrd            r4,  r5,  [r1, #8]
216
-        usub16          r6,  r6,  r8
217
-        usub16          r8,  lr,  r9
218
-        ldr             r7,  [r2, #8]
219
-        smlad           r0,  r6,  r6,  r0
220
-        uxtb16          lr,  r4
221
-        uxtb16          r4,  r4,  ror #8
222
-        uxtb16          r9,  r7
223
-        uxtb16          r7,  r7, ror #8
224
-        smlad           r0,  r8,  r8,  r0
225
-        ldr             r8,  [r2, #12]
226
-        usub16          lr,  lr,  r9
227
-        usub16          r4,  r4,  r7
228
-        smlad           r0,  lr,  lr,  r0
229
-        uxtb16          r6,  r5
230
-        uxtb16          r5,  r5,  ror #8
231
-        uxtb16          r9,  r8
232
-        uxtb16          r8,  r8,  ror #8
233
-        smlad           r0,  r4,  r4,  r0
234
-        usub16          r6,  r6,  r9
235
-        usub16          r5,  r5,  r8
236
-        smlad           r0,  r6,  r6,  r0
237
-        add             r1,  r1,  r3
238
-        add             r2,  r2,  r3
239
-        subs            r12, r12, #1
240
-        smlad           r0,  r5,  r5,  r0
241
-        bgt             1b
242
-
243
-        pop             {r4-r9, pc}
244
-endfunc
245 1
deleted file mode 100644
... ...
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-
-int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                       int line_size, int h);
-int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                          int line_size, int h);
-
-int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                      int line_size, int h);
-
-int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
-                   int line_size, int h);
-
-
-av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_armv6(cpu_flags)) {
-        c->pix_abs[0][0] = ff_pix_abs16_armv6;
-        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
-        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
-
-        c->pix_abs[1][0] = ff_pix_abs8_armv6;
-
-        c->sad[0] = ff_pix_abs16_armv6;
-        c->sad[1] = ff_pix_abs8_armv6;
-
-        c->sse[0] = ff_sse16_armv6;
-    }
-}
new file mode 100644
... ...
@@ -0,0 +1,244 @@
0
+/*
1
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavutil/arm/asm.S"
21
+
22
+function ff_pix_abs16_armv6, export=1
23
+        ldr             r0,  [sp]
24
+        push            {r4-r9, lr}
25
+        mov             r12, #0
26
+        mov             lr,  #0
27
+        ldm             r1,  {r4-r7}
28
+        ldr             r8,  [r2]
29
+1:
30
+        ldr             r9,  [r2, #4]
31
+        pld             [r1, r3]
32
+        usada8          r12, r4,  r8,  r12
33
+        ldr             r8,  [r2, #8]
34
+        pld             [r2, r3]
35
+        usada8          lr,  r5,  r9,  lr
36
+        ldr             r9,  [r2, #12]
37
+        usada8          r12, r6,  r8,  r12
38
+        subs            r0,  r0,  #1
39
+        usada8          lr,  r7,  r9,  lr
40
+        beq             2f
41
+        add             r1,  r1,  r3
42
+        ldm             r1,  {r4-r7}
43
+        add             r2,  r2,  r3
44
+        ldr             r8,  [r2]
45
+        b               1b
46
+2:
47
+        add             r0,  r12, lr
48
+        pop             {r4-r9, pc}
49
+endfunc
50
+
51
+function ff_pix_abs16_x2_armv6, export=1
52
+        ldr             r12, [sp]
53
+        push            {r4-r11, lr}
54
+        mov             r0,  #0
55
+        mov             lr,  #1
56
+        orr             lr,  lr,  lr,  lsl #8
57
+        orr             lr,  lr,  lr,  lsl #16
58
+1:
59
+        ldr             r8,  [r2]
60
+        ldr             r9,  [r2, #4]
61
+        lsr             r10, r8,  #8
62
+        ldr             r4,  [r1]
63
+        lsr             r6,  r9,  #8
64
+        orr             r10, r10, r9,  lsl #24
65
+        ldr             r5,  [r2, #8]
66
+        eor             r11, r8,  r10
67
+        uhadd8          r7,  r8,  r10
68
+        orr             r6,  r6,  r5,  lsl #24
69
+        and             r11, r11, lr
70
+        uadd8           r7,  r7,  r11
71
+        ldr             r8,  [r1, #4]
72
+        usada8          r0,  r4,  r7,  r0
73
+        eor             r7,  r9,  r6
74
+        lsr             r10, r5,  #8
75
+        and             r7,  r7,  lr
76
+        uhadd8          r4,  r9,  r6
77
+        ldr             r6,  [r2, #12]
78
+        uadd8           r4,  r4,  r7
79
+        pld             [r1, r3]
80
+        orr             r10, r10, r6,  lsl #24
81
+        usada8          r0,  r8,  r4,  r0
82
+        ldr             r4,  [r1, #8]
83
+        eor             r11, r5,  r10
84
+        ldrb            r7,  [r2, #16]
85
+        and             r11, r11, lr
86
+        uhadd8          r8,  r5,  r10
87
+        ldr             r5,  [r1, #12]
88
+        uadd8           r8,  r8,  r11
89
+        pld             [r2, r3]
90
+        lsr             r10, r6,  #8
91
+        usada8          r0,  r4,  r8,  r0
92
+        orr             r10, r10, r7,  lsl #24
93
+        subs            r12,  r12,  #1
94
+        eor             r11, r6,  r10
95
+        add             r1,  r1,  r3
96
+        uhadd8          r9,  r6,  r10
97
+        and             r11, r11, lr
98
+        uadd8           r9,  r9,  r11
99
+        add             r2,  r2,  r3
100
+        usada8          r0,  r5,  r9,  r0
101
+        bgt             1b
102
+
103
+        pop             {r4-r11, pc}
104
+endfunc
105
+
106
+.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
107
+        ldr             \n0, [r2]
108
+        eor             \n1, \p0, \n0
109
+        uhadd8          \p0, \p0, \n0
110
+        and             \n1, \n1, lr
111
+        ldr             \n2, [r1]
112
+        uadd8           \p0, \p0, \n1
113
+        ldr             \n1, [r2, #4]
114
+        usada8          r0,  \p0, \n2, r0
115
+        pld             [r1,  r3]
116
+        eor             \n3, \p1, \n1
117
+        uhadd8          \p1, \p1, \n1
118
+        and             \n3, \n3, lr
119
+        ldr             \p0, [r1, #4]
120
+        uadd8           \p1, \p1, \n3
121
+        ldr             \n2, [r2, #8]
122
+        usada8          r0,  \p1, \p0, r0
123
+        pld             [r2,  r3]
124
+        eor             \p0, \p2, \n2
125
+        uhadd8          \p2, \p2, \n2
126
+        and             \p0, \p0, lr
127
+        ldr             \p1, [r1, #8]
128
+        uadd8           \p2, \p2, \p0
129
+        ldr             \n3, [r2, #12]
130
+        usada8          r0,  \p2, \p1, r0
131
+        eor             \p1, \p3, \n3
132
+        uhadd8          \p3, \p3, \n3
133
+        and             \p1, \p1, lr
134
+        ldr             \p0,  [r1, #12]
135
+        uadd8           \p3, \p3, \p1
136
+        add             r1,  r1,  r3
137
+        usada8          r0,  \p3, \p0,  r0
138
+        add             r2,  r2,  r3
139
+.endm
140
+
141
+function ff_pix_abs16_y2_armv6, export=1
142
+        pld             [r1]
143
+        pld             [r2]
144
+        ldr             r12, [sp]
145
+        push            {r4-r11, lr}
146
+        mov             r0,  #0
147
+        mov             lr,  #1
148
+        orr             lr,  lr,  lr,  lsl #8
149
+        orr             lr,  lr,  lr,  lsl #16
150
+        ldr             r4,  [r2]
151
+        ldr             r5,  [r2, #4]
152
+        ldr             r6,  [r2, #8]
153
+        ldr             r7,  [r2, #12]
154
+        add             r2,  r2,  r3
155
+1:
156
+        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
157
+        subs            r12, r12, #2
158
+        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
159
+        bgt             1b
160
+
161
+        pop             {r4-r11, pc}
162
+endfunc
163
+
164
+function ff_pix_abs8_armv6, export=1
165
+        pld             [r2, r3]
166
+        ldr             r12, [sp]
167
+        push            {r4-r9, lr}
168
+        mov             r0,  #0
169
+        mov             lr,  #0
170
+        ldrd_post       r4,  r5,  r1,  r3
171
+1:
172
+        subs            r12, r12, #2
173
+        ldr             r7,  [r2, #4]
174
+        ldr_post        r6,  r2,  r3
175
+        ldrd_post       r8,  r9,  r1,  r3
176
+        usada8          r0,  r4,  r6,  r0
177
+        pld             [r2, r3]
178
+        usada8          lr,  r5,  r7,  lr
179
+        ldr             r7,  [r2, #4]
180
+        ldr_post        r6,  r2,  r3
181
+        beq             2f
182
+        ldrd_post       r4,  r5,  r1,  r3
183
+        usada8          r0,  r8,  r6,  r0
184
+        pld             [r2, r3]
185
+        usada8          lr,  r9,  r7,  lr
186
+        b               1b
187
+2:
188
+        usada8          r0,  r8,  r6,  r0
189
+        usada8          lr,  r9,  r7,  lr
190
+        add             r0,  r0,  lr
191
+        pop             {r4-r9, pc}
192
+endfunc
193
+
194
+function ff_sse16_armv6, export=1
195
+        ldr             r12, [sp]
196
+        push            {r4-r9, lr}
197
+        mov             r0,  #0
198
+1:
199
+        ldrd            r4,  r5,  [r1]
200
+        ldr             r8,  [r2]
201
+        uxtb16          lr,  r4
202
+        uxtb16          r4,  r4,  ror #8
203
+        uxtb16          r9,  r8
204
+        uxtb16          r8,  r8,  ror #8
205
+        ldr             r7,  [r2, #4]
206
+        usub16          lr,  lr,  r9
207
+        usub16          r4,  r4,  r8
208
+        smlad           r0,  lr,  lr,  r0
209
+        uxtb16          r6,  r5
210
+        uxtb16          lr,  r5,  ror #8
211
+        uxtb16          r8,  r7
212
+        uxtb16          r9,  r7,  ror #8
213
+        smlad           r0,  r4,  r4,  r0
214
+        ldrd            r4,  r5,  [r1, #8]
215
+        usub16          r6,  r6,  r8
216
+        usub16          r8,  lr,  r9
217
+        ldr             r7,  [r2, #8]
218
+        smlad           r0,  r6,  r6,  r0
219
+        uxtb16          lr,  r4
220
+        uxtb16          r4,  r4,  ror #8
221
+        uxtb16          r9,  r7
222
+        uxtb16          r7,  r7, ror #8
223
+        smlad           r0,  r8,  r8,  r0
224
+        ldr             r8,  [r2, #12]
225
+        usub16          lr,  lr,  r9
226
+        usub16          r4,  r4,  r7
227
+        smlad           r0,  lr,  lr,  r0
228
+        uxtb16          r6,  r5
229
+        uxtb16          r5,  r5,  ror #8
230
+        uxtb16          r9,  r8
231
+        uxtb16          r8,  r8,  ror #8
232
+        smlad           r0,  r4,  r4,  r0
233
+        usub16          r6,  r6,  r9
234
+        usub16          r5,  r5,  r8
235
+        smlad           r0,  r6,  r6,  r0
236
+        add             r1,  r1,  r3
237
+        add             r2,  r2,  r3
238
+        subs            r12, r12, #1
239
+        smlad           r0,  r5,  r5,  r0
240
+        bgt             1b
241
+
242
+        pop             {r4-r9, pc}
243
+endfunc
0 244
new file mode 100644
... ...
@@ -0,0 +1,57 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/me_cmp.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                       int line_size, int h);
+int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                          int line_size, int h);
+
+int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      int line_size, int h);
+
+int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                   int line_size, int h);
+
+av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv6(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_armv6;
+        c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
+        c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
+
+        c->pix_abs[1][0] = ff_pix_abs8_armv6;
+
+        c->sad[0] = ff_pix_abs16_armv6;
+        c->sad[1] = ff_pix_abs8_armv6;
+
+        c->sse[0] = ff_sse16_armv6;
+    }
+}
deleted file mode 100644
... ...
@@ -1,952 +0,0 @@
1
-/*
2
- * DSP utils
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * This file is part of Libav.
7
- *
8
- * Libav is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * Libav is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with Libav; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-/**
24
- * @file
25
- * DSP utils
26
- */
27
-
28
-#include "libavutil/attributes.h"
29
-#include "avcodec.h"
30
-#include "copy_block.h"
31
-#include "dsputil.h"
32
-#include "simple_idct.h"
33
-#include "mpegvideo.h"
34
-#include "config.h"
35
-
36
-uint32_t ff_square_tab[512] = { 0, };
37
-
38
-static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39
-                  int line_size, int h)
40
-{
41
-    int s = 0, i;
42
-    uint32_t *sq = ff_square_tab + 256;
43
-
44
-    for (i = 0; i < h; i++) {
45
-        s    += sq[pix1[0] - pix2[0]];
46
-        s    += sq[pix1[1] - pix2[1]];
47
-        s    += sq[pix1[2] - pix2[2]];
48
-        s    += sq[pix1[3] - pix2[3]];
49
-        pix1 += line_size;
50
-        pix2 += line_size;
51
-    }
52
-    return s;
53
-}
54
-
55
-static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
56
-                  int line_size, int h)
57
-{
58
-    int s = 0, i;
59
-    uint32_t *sq = ff_square_tab + 256;
60
-
61
-    for (i = 0; i < h; i++) {
62
-        s    += sq[pix1[0] - pix2[0]];
63
-        s    += sq[pix1[1] - pix2[1]];
64
-        s    += sq[pix1[2] - pix2[2]];
65
-        s    += sq[pix1[3] - pix2[3]];
66
-        s    += sq[pix1[4] - pix2[4]];
67
-        s    += sq[pix1[5] - pix2[5]];
68
-        s    += sq[pix1[6] - pix2[6]];
69
-        s    += sq[pix1[7] - pix2[7]];
70
-        pix1 += line_size;
71
-        pix2 += line_size;
72
-    }
73
-    return s;
74
-}
75
-
76
-static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
77
-                   int line_size, int h)
78
-{
79
-    int s = 0, i;
80
-    uint32_t *sq = ff_square_tab + 256;
81
-
82
-    for (i = 0; i < h; i++) {
83
-        s += sq[pix1[0]  - pix2[0]];
84
-        s += sq[pix1[1]  - pix2[1]];
85
-        s += sq[pix1[2]  - pix2[2]];
86
-        s += sq[pix1[3]  - pix2[3]];
87
-        s += sq[pix1[4]  - pix2[4]];
88
-        s += sq[pix1[5]  - pix2[5]];
89
-        s += sq[pix1[6]  - pix2[6]];
90
-        s += sq[pix1[7]  - pix2[7]];
91
-        s += sq[pix1[8]  - pix2[8]];
92
-        s += sq[pix1[9]  - pix2[9]];
93
-        s += sq[pix1[10] - pix2[10]];
94
-        s += sq[pix1[11] - pix2[11]];
95
-        s += sq[pix1[12] - pix2[12]];
96
-        s += sq[pix1[13] - pix2[13]];
97
-        s += sq[pix1[14] - pix2[14]];
98
-        s += sq[pix1[15] - pix2[15]];
99
-
100
-        pix1 += line_size;
101
-        pix2 += line_size;
102
-    }
103
-    return s;
104
-}
105
-
106
-static int sum_abs_dctelem_c(int16_t *block)
107
-{
108
-    int sum = 0, i;
109
-
110
-    for (i = 0; i < 64; i++)
111
-        sum += FFABS(block[i]);
112
-    return sum;
113
-}
114
-
115
-#define avg2(a, b) ((a + b + 1) >> 1)
116
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
117
-
118
-static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
119
-                              int line_size, int h)
120
-{
121
-    int s = 0, i;
122
-
123
-    for (i = 0; i < h; i++) {
124
-        s    += abs(pix1[0]  - pix2[0]);
125
-        s    += abs(pix1[1]  - pix2[1]);
126
-        s    += abs(pix1[2]  - pix2[2]);
127
-        s    += abs(pix1[3]  - pix2[3]);
128
-        s    += abs(pix1[4]  - pix2[4]);
129
-        s    += abs(pix1[5]  - pix2[5]);
130
-        s    += abs(pix1[6]  - pix2[6]);
131
-        s    += abs(pix1[7]  - pix2[7]);
132
-        s    += abs(pix1[8]  - pix2[8]);
133
-        s    += abs(pix1[9]  - pix2[9]);
134
-        s    += abs(pix1[10] - pix2[10]);
135
-        s    += abs(pix1[11] - pix2[11]);
136
-        s    += abs(pix1[12] - pix2[12]);
137
-        s    += abs(pix1[13] - pix2[13]);
138
-        s    += abs(pix1[14] - pix2[14]);
139
-        s    += abs(pix1[15] - pix2[15]);
140
-        pix1 += line_size;
141
-        pix2 += line_size;
142
-    }
143
-    return s;
144
-}
145
-
146
-static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
147
-                          int line_size, int h)
148
-{
149
-    int s = 0, i;
150
-
151
-    for (i = 0; i < h; i++) {
152
-        s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
153
-        s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
154
-        s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
155
-        s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
156
-        s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
157
-        s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
158
-        s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
159
-        s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
160
-        s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
161
-        s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
162
-        s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
163
-        s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
164
-        s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
165
-        s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
166
-        s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
167
-        s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
168
-        pix1 += line_size;
169
-        pix2 += line_size;
170
-    }
171
-    return s;
172
-}
173
-
174
-static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
175
-                          int line_size, int h)
176
-{
177
-    int s = 0, i;
178
-    uint8_t *pix3 = pix2 + line_size;
179
-
180
-    for (i = 0; i < h; i++) {
181
-        s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
182
-        s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
183
-        s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
184
-        s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
185
-        s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
186
-        s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
187
-        s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
188
-        s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
189
-        s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
190
-        s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
191
-        s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
192
-        s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
193
-        s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
194
-        s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
195
-        s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
196
-        s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
197
-        pix1 += line_size;
198
-        pix2 += line_size;
199
-        pix3 += line_size;
200
-    }
201
-    return s;
202
-}
203
-
204
-static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
205
-                           int line_size, int h)
206
-{
207
-    int s = 0, i;
208
-    uint8_t *pix3 = pix2 + line_size;
209
-
210
-    for (i = 0; i < h; i++) {
211
-        s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
212
-        s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
213
-        s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
214
-        s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
215
-        s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
216
-        s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
217
-        s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
218
-        s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
219
-        s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
220
-        s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
221
-        s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
222
-        s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
223
-        s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
224
-        s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
225
-        s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
226
-        s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
227
-        pix1 += line_size;
228
-        pix2 += line_size;
229
-        pix3 += line_size;
230
-    }
231
-    return s;
232
-}
233
-
234
-static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
235
-                             int line_size, int h)
236
-{
237
-    int s = 0, i;
238
-
239
-    for (i = 0; i < h; i++) {
240
-        s    += abs(pix1[0] - pix2[0]);
241
-        s    += abs(pix1[1] - pix2[1]);
242
-        s    += abs(pix1[2] - pix2[2]);
243
-        s    += abs(pix1[3] - pix2[3]);
244
-        s    += abs(pix1[4] - pix2[4]);
245
-        s    += abs(pix1[5] - pix2[5]);
246
-        s    += abs(pix1[6] - pix2[6]);
247
-        s    += abs(pix1[7] - pix2[7]);
248
-        pix1 += line_size;
249
-        pix2 += line_size;
250
-    }
251
-    return s;
252
-}
253
-
254
-static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
255
-                         int line_size, int h)
256
-{
257
-    int s = 0, i;
258
-
259
-    for (i = 0; i < h; i++) {
260
-        s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
261
-        s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
262
-        s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
263
-        s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
264
-        s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
265
-        s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
266
-        s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
267
-        s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
268
-        pix1 += line_size;
269
-        pix2 += line_size;
270
-    }
271
-    return s;
272
-}
273
-
274
-static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
275
-                         int line_size, int h)
276
-{
277
-    int s = 0, i;
278
-    uint8_t *pix3 = pix2 + line_size;
279
-
280
-    for (i = 0; i < h; i++) {
281
-        s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
282
-        s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
283
-        s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
284
-        s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
285
-        s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
286
-        s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
287
-        s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
288
-        s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
289
-        pix1 += line_size;
290
-        pix2 += line_size;
291
-        pix3 += line_size;
292
-    }
293
-    return s;
294
-}
295
-
296
-static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297
-                          int line_size, int h)
298
-{
299
-    int s = 0, i;
300
-    uint8_t *pix3 = pix2 + line_size;
301
-
302
-    for (i = 0; i < h; i++) {
303
-        s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
304
-        s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
305
-        s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
306
-        s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
307
-        s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
308
-        s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
309
-        s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
310
-        s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
311
-        pix1 += line_size;
312
-        pix2 += line_size;
313
-        pix3 += line_size;
314
-    }
315
-    return s;
316
-}
317
-
318
-static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
319
-{
320
-    int score1 = 0, score2 = 0, x, y;
321
-
322
-    for (y = 0; y < h; y++) {
323
-        for (x = 0; x < 16; x++)
324
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
325
-        if (y + 1 < h) {
326
-            for (x = 0; x < 15; x++)
327
-                score2 += FFABS(s1[x]     - s1[x + stride] -
328
-                                s1[x + 1] + s1[x + stride + 1]) -
329
-                          FFABS(s2[x]     - s2[x + stride] -
330
-                                s2[x + 1] + s2[x + stride + 1]);
331
-        }
332
-        s1 += stride;
333
-        s2 += stride;
334
-    }
335
-
336
-    if (c)
337
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
338
-    else
339
-        return score1 + FFABS(score2) * 8;
340
-}
341
-
342
-static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
343
-{
344
-    int score1 = 0, score2 = 0, x, y;
345
-
346
-    for (y = 0; y < h; y++) {
347
-        for (x = 0; x < 8; x++)
348
-            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
349
-        if (y + 1 < h) {
350
-            for (x = 0; x < 7; x++)
351
-                score2 += FFABS(s1[x]     - s1[x + stride] -
352
-                                s1[x + 1] + s1[x + stride + 1]) -
353
-                          FFABS(s2[x]     - s2[x + stride] -
354
-                                s2[x + 1] + s2[x + stride + 1]);
355
-        }
356
-        s1 += stride;
357
-        s2 += stride;
358
-    }
359
-
360
-    if (c)
361
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
362
-    else
363
-        return score1 + FFABS(score2) * 8;
364
-}
365
-
366
-static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
367
-                    int stride, int h)
368
-{
369
-    return 0;
370
-}
371
-
372
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
373
-{
374
-    int i;
375
-
376
-    memset(cmp, 0, sizeof(void *) * 6);
377
-
378
-    for (i = 0; i < 6; i++) {
379
-        switch (type & 0xFF) {
380
-        case FF_CMP_SAD:
381
-            cmp[i] = c->sad[i];
382
-            break;
383
-        case FF_CMP_SATD:
384
-            cmp[i] = c->hadamard8_diff[i];
385
-            break;
386
-        case FF_CMP_SSE:
387
-            cmp[i] = c->sse[i];
388
-            break;
389
-        case FF_CMP_DCT:
390
-            cmp[i] = c->dct_sad[i];
391
-            break;
392
-        case FF_CMP_DCT264:
393
-            cmp[i] = c->dct264_sad[i];
394
-            break;
395
-        case FF_CMP_DCTMAX:
396
-            cmp[i] = c->dct_max[i];
397
-            break;
398
-        case FF_CMP_PSNR:
399
-            cmp[i] = c->quant_psnr[i];
400
-            break;
401
-        case FF_CMP_BIT:
402
-            cmp[i] = c->bit[i];
403
-            break;
404
-        case FF_CMP_RD:
405
-            cmp[i] = c->rd[i];
406
-            break;
407
-        case FF_CMP_VSAD:
408
-            cmp[i] = c->vsad[i];
409
-            break;
410
-        case FF_CMP_VSSE:
411
-            cmp[i] = c->vsse[i];
412
-            break;
413
-        case FF_CMP_ZERO:
414
-            cmp[i] = zero_cmp;
415
-            break;
416
-        case FF_CMP_NSSE:
417
-            cmp[i] = c->nsse[i];
418
-            break;
419
-        default:
420
-            av_log(NULL, AV_LOG_ERROR,
421
-                   "internal error in cmp function selection\n");
422
-        }
423
-    }
424
-}
425
-
426
-#define BUTTERFLY2(o1, o2, i1, i2)              \
427
-    o1 = (i1) + (i2);                           \
428
-    o2 = (i1) - (i2);
429
-
430
-#define BUTTERFLY1(x, y)                        \
431
-    {                                           \
432
-        int a, b;                               \
433
-        a = x;                                  \
434
-        b = y;                                  \
435
-        x = a + b;                              \
436
-        y = a - b;                              \
437
-    }
438
-
439
-#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
440
-
441
-static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
442
-                               uint8_t *src, int stride, int h)
443
-{
444
-    int i, temp[64], sum = 0;
445
-
446
-    assert(h == 8);
447
-
448
-    for (i = 0; i < 8; i++) {
449
-        // FIXME: try pointer walks
450
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
451
-                   src[stride * i + 0] - dst[stride * i + 0],
452
-                   src[stride * i + 1] - dst[stride * i + 1]);
453
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
454
-                   src[stride * i + 2] - dst[stride * i + 2],
455
-                   src[stride * i + 3] - dst[stride * i + 3]);
456
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
457
-                   src[stride * i + 4] - dst[stride * i + 4],
458
-                   src[stride * i + 5] - dst[stride * i + 5]);
459
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
460
-                   src[stride * i + 6] - dst[stride * i + 6],
461
-                   src[stride * i + 7] - dst[stride * i + 7]);
462
-
463
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
464
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
465
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
466
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
467
-
468
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
469
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
470
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
471
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
472
-    }
473
-
474
-    for (i = 0; i < 8; i++) {
475
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
476
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
477
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
478
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
479
-
480
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
481
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
482
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
483
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
484
-
485
-        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
486
-               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
487
-               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
488
-               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
489
-    }
490
-    return sum;
491
-}
492
-
493
-static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
494
-                                uint8_t *dummy, int stride, int h)
495
-{
496
-    int i, temp[64], sum = 0;
497
-
498
-    assert(h == 8);
499
-
500
-    for (i = 0; i < 8; i++) {
501
-        // FIXME: try pointer walks
502
-        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
503
-                   src[stride * i + 0], src[stride * i + 1]);
504
-        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
505
-                   src[stride * i + 2], src[stride * i + 3]);
506
-        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
507
-                   src[stride * i + 4], src[stride * i + 5]);
508
-        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
509
-                   src[stride * i + 6], src[stride * i + 7]);
510
-
511
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
512
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
513
-        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
514
-        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
515
-
516
-        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
517
-        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
518
-        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
519
-        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
520
-    }
521
-
522
-    for (i = 0; i < 8; i++) {
523
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
524
-        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
525
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
526
-        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
527
-
528
-        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
529
-        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
530
-        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
531
-        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
532
-
533
-        sum +=
534
-            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
535
-            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
536
-            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
537
-            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
538
-    }
539
-
540
-    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
541
-
542
-    return sum;
543
-}
544
-
545
-static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
546
-                        uint8_t *src2, int stride, int h)
547
-{
548
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
549
-
550
-    assert(h == 8);
551
-
552
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
553
-    s->fdsp.fdct(temp);
554
-    return s->dsp.sum_abs_dctelem(temp);
555
-}
556
-
557
-#if CONFIG_GPL
558
-#define DCT8_1D                                         \
559
-    {                                                   \
560
-        const int s07 = SRC(0) + SRC(7);                \
561
-        const int s16 = SRC(1) + SRC(6);                \
562
-        const int s25 = SRC(2) + SRC(5);                \
563
-        const int s34 = SRC(3) + SRC(4);                \
564
-        const int a0  = s07 + s34;                      \
565
-        const int a1  = s16 + s25;                      \
566
-        const int a2  = s07 - s34;                      \
567
-        const int a3  = s16 - s25;                      \
568
-        const int d07 = SRC(0) - SRC(7);                \
569
-        const int d16 = SRC(1) - SRC(6);                \
570
-        const int d25 = SRC(2) - SRC(5);                \
571
-        const int d34 = SRC(3) - SRC(4);                \
572
-        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
573
-        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
574
-        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
575
-        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
576
-        DST(0, a0 + a1);                                \
577
-        DST(1, a4 + (a7 >> 2));                         \
578
-        DST(2, a2 + (a3 >> 1));                         \
579
-        DST(3, a5 + (a6 >> 2));                         \
580
-        DST(4, a0 - a1);                                \
581
-        DST(5, a6 - (a5 >> 2));                         \
582
-        DST(6, (a2 >> 1) - a3);                         \
583
-        DST(7, (a4 >> 2) - a7);                         \
584
-    }
585
-
586
-static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
587
-                           uint8_t *src2, int stride, int h)
588
-{
589
-    int16_t dct[8][8];
590
-    int i, sum = 0;
591
-
592
-    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
593
-
594
-#define SRC(x) dct[i][x]
595
-#define DST(x, v) dct[i][x] = v
596
-    for (i = 0; i < 8; i++)
597
-        DCT8_1D
598
-#undef SRC
599
-#undef DST
600
-
601
-#define SRC(x) dct[x][i]
602
-#define DST(x, v) sum += FFABS(v)
603
-        for (i = 0; i < 8; i++)
604
-            DCT8_1D
605
-#undef SRC
606
-#undef DST
607
-            return sum;
608
-}
609
-#endif
610
-
611
-static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
612
-                        uint8_t *src2, int stride, int h)
613
-{
614
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
615
-    int sum = 0, i;
616
-
617
-    assert(h == 8);
618
-
619
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
620
-    s->fdsp.fdct(temp);
621
-
622
-    for (i = 0; i < 64; i++)
623
-        sum = FFMAX(sum, FFABS(temp[i]));
624
-
625
-    return sum;
626
-}
627
-
628
-static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
629
-                           uint8_t *src2, int stride, int h)
630
-{
631
-    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
632
-    int16_t *const bak = temp + 64;
633
-    int sum = 0, i;
634
-
635
-    assert(h == 8);
636
-    s->mb_intra = 0;
637
-
638
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
639
-
640
-    memcpy(bak, temp, 64 * sizeof(int16_t));
641
-
642
-    s->block_last_index[0 /* FIXME */] =
643
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
644
-    s->dct_unquantize_inter(s, temp, 0, s->qscale);
645
-    ff_simple_idct_8(temp); // FIXME
646
-
647
-    for (i = 0; i < 64; i++)
648
-        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
649
-
650
-    return sum;
651
-}
652
-
653
-static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
654
-                   int stride, int h)
655
-{
656
-    const uint8_t *scantable = s->intra_scantable.permutated;
657
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
658
-    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
659
-    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
660
-    int i, last, run, bits, level, distortion, start_i;
661
-    const int esc_length = s->ac_esc_length;
662
-    uint8_t *length, *last_length;
663
-
664
-    assert(h == 8);
665
-
666
-    copy_block8(lsrc1, src1, 8, stride, 8);
667
-    copy_block8(lsrc2, src2, 8, stride, 8);
668
-
669
-    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
670
-
671
-    s->block_last_index[0 /* FIXME */] =
672
-    last                               =
673
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
674
-
675
-    bits = 0;
676
-
677
-    if (s->mb_intra) {
678
-        start_i     = 1;
679
-        length      = s->intra_ac_vlc_length;
680
-        last_length = s->intra_ac_vlc_last_length;
681
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
682
-    } else {
683
-        start_i     = 0;
684
-        length      = s->inter_ac_vlc_length;
685
-        last_length = s->inter_ac_vlc_last_length;
686
-    }
687
-
688
-    if (last >= start_i) {
689
-        run = 0;
690
-        for (i = start_i; i < last; i++) {
691
-            int j = scantable[i];
692
-            level = temp[j];
693
-
694
-            if (level) {
695
-                level += 64;
696
-                if ((level & (~127)) == 0)
697
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
698
-                else
699
-                    bits += esc_length;
700
-                run = 0;
701
-            } else
702
-                run++;
703
-        }
704
-        i = scantable[last];
705
-
706
-        level = temp[i] + 64;
707
-
708
-        assert(level - 64);
709
-
710
-        if ((level & (~127)) == 0) {
711
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
712
-        } else
713
-            bits += esc_length;
714
-    }
715
-
716
-    if (last >= 0) {
717
-        if (s->mb_intra)
718
-            s->dct_unquantize_intra(s, temp, 0, s->qscale);
719
-        else
720
-            s->dct_unquantize_inter(s, temp, 0, s->qscale);
721
-    }
722
-
723
-    s->idsp.idct_add(lsrc2, 8, temp);
724
-
725
-    distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
726
-
727
-    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
728
-}
729
-
730
-static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
731
-                    int stride, int h)
732
-{
733
-    const uint8_t *scantable = s->intra_scantable.permutated;
734
-    LOCAL_ALIGNED_16(int16_t, temp, [64]);
735
-    int i, last, run, bits, level, start_i;
736
-    const int esc_length = s->ac_esc_length;
737
-    uint8_t *length, *last_length;
738
-
739
-    assert(h == 8);
740
-
741
-    s->pdsp.diff_pixels(temp, src1, src2, stride);
742
-
743
-    s->block_last_index[0 /* FIXME */] =
744
-    last                               =
745
-        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
746
-
747
-    bits = 0;
748
-
749
-    if (s->mb_intra) {
750
-        start_i     = 1;
751
-        length      = s->intra_ac_vlc_length;
752
-        last_length = s->intra_ac_vlc_last_length;
753
-        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
754
-    } else {
755
-        start_i     = 0;
756
-        length      = s->inter_ac_vlc_length;
757
-        last_length = s->inter_ac_vlc_last_length;
758
-    }
759
-
760
-    if (last >= start_i) {
761
-        run = 0;
762
-        for (i = start_i; i < last; i++) {
763
-            int j = scantable[i];
764
-            level = temp[j];
765
-
766
-            if (level) {
767
-                level += 64;
768
-                if ((level & (~127)) == 0)
769
-                    bits += length[UNI_AC_ENC_INDEX(run, level)];
770
-                else
771
-                    bits += esc_length;
772
-                run = 0;
773
-            } else
774
-                run++;
775
-        }
776
-        i = scantable[last];
777
-
778
-        level = temp[i] + 64;
779
-
780
-        assert(level - 64);
781
-
782
-        if ((level & (~127)) == 0)
783
-            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
784
-        else
785
-            bits += esc_length;
786
-    }
787
-
788
-    return bits;
789
-}
790
-
791
-#define VSAD_INTRA(size)                                                \
792
-static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
793
-                                    uint8_t *s, uint8_t *dummy,         \
794
-                                    int stride, int h)                  \
795
-{                                                                       \
796
-    int score = 0, x, y;                                                \
797
-                                                                        \
798
-    for (y = 1; y < h; y++) {                                           \
799
-        for (x = 0; x < size; x += 4) {                                 \
800
-            score += FFABS(s[x]     - s[x + stride])     +              \
801
-                     FFABS(s[x + 1] - s[x + stride + 1]) +              \
802
-                     FFABS(s[x + 2] - s[x + 2 + stride]) +              \
803
-                     FFABS(s[x + 3] - s[x + 3 + stride]);               \
804
-        }                                                               \
805
-        s += stride;                                                    \
806
-    }                                                                   \
807
-                                                                        \
808
-    return score;                                                       \
809
-}
810
-VSAD_INTRA(8)
811
-VSAD_INTRA(16)
812
-
813
-static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
814
-                    int stride, int h)
815
-{
816
-    int score = 0, x, y;
817
-
818
-    for (y = 1; y < h; y++) {
819
-        for (x = 0; x < 16; x++)
820
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
821
-        s1 += stride;
822
-        s2 += stride;
823
-    }
824
-
825
-    return score;
826
-}
827
-
828
-#define SQ(a) ((a) * (a))
829
-#define VSSE_INTRA(size)                                                \
830
-static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
831
-                                    uint8_t *s, uint8_t *dummy,         \
832
-                                    int stride, int h)                  \
833
-{                                                                       \
834
-    int score = 0, x, y;                                                \
835
-                                                                        \
836
-    for (y = 1; y < h; y++) {                                           \
837
-        for (x = 0; x < size; x += 4) {                                 \
838
-            score += SQ(s[x]     - s[x + stride]) +                     \
839
-                     SQ(s[x + 1] - s[x + stride + 1]) +                 \
840
-                     SQ(s[x + 2] - s[x + stride + 2]) +                 \
841
-                     SQ(s[x + 3] - s[x + stride + 3]);                  \
842
-        }                                                               \
843
-        s += stride;                                                    \
844
-    }                                                                   \
845
-                                                                        \
846
-    return score;                                                       \
847
-}
848
-VSSE_INTRA(8)
849
-VSSE_INTRA(16)
850
-
851
-static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
852
-                    int stride, int h)
853
-{
854
-    int score = 0, x, y;
855
-
856
-    for (y = 1; y < h; y++) {
857
-        for (x = 0; x < 16; x++)
858
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
859
-        s1 += stride;
860
-        s2 += stride;
861
-    }
862
-
863
-    return score;
864
-}
865
-
866
-#define WRAPPER8_16_SQ(name8, name16)                                   \
867
-static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
868
-                  int stride, int h)                                    \
869
-{                                                                       \
870
-    int score = 0;                                                      \
871
-                                                                        \
872
-    score += name8(s, dst, src, stride, 8);                             \
873
-    score += name8(s, dst + 8, src + 8, stride, 8);                     \
874
-    if (h == 16) {                                                      \
875
-        dst   += 8 * stride;                                            \
876
-        src   += 8 * stride;                                            \
877
-        score += name8(s, dst, src, stride, 8);                         \
878
-        score += name8(s, dst + 8, src + 8, stride, 8);                 \
879
-    }                                                                   \
880
-    return score;                                                       \
881
-}
882
-
883
-WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
884
-WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
885
-WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
886
-#if CONFIG_GPL
887
-WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
888
-#endif
889
-WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
890
-WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
891
-WRAPPER8_16_SQ(rd8x8_c, rd16_c)
892
-WRAPPER8_16_SQ(bit8x8_c, bit16_c)
893
-
894
-/* init static data */
895
-av_cold void ff_dsputil_static_init(void)
896
-{
897
-    int i;
898
-
899
-    for (i = 0; i < 512; i++)
900
-        ff_square_tab[i] = (i - 256) * (i - 256);
901
-}
902
-
903
-av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
904
-{
905
-    c->sum_abs_dctelem = sum_abs_dctelem_c;
906
-
907
-    /* TODO [0] 16  [1] 8 */
908
-    c->pix_abs[0][0] = pix_abs16_c;
909
-    c->pix_abs[0][1] = pix_abs16_x2_c;
910
-    c->pix_abs[0][2] = pix_abs16_y2_c;
911
-    c->pix_abs[0][3] = pix_abs16_xy2_c;
912
-    c->pix_abs[1][0] = pix_abs8_c;
913
-    c->pix_abs[1][1] = pix_abs8_x2_c;
914
-    c->pix_abs[1][2] = pix_abs8_y2_c;
915
-    c->pix_abs[1][3] = pix_abs8_xy2_c;
916
-
917
-#define SET_CMP_FUNC(name)                      \
918
-    c->name[0] = name ## 16_c;                  \
919
-    c->name[1] = name ## 8x8_c;
920
-
921
-    SET_CMP_FUNC(hadamard8_diff)
922
-    c->hadamard8_diff[4] = hadamard8_intra16_c;
923
-    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
924
-    SET_CMP_FUNC(dct_sad)
925
-    SET_CMP_FUNC(dct_max)
926
-#if CONFIG_GPL
927
-    SET_CMP_FUNC(dct264_sad)
928
-#endif
929
-    c->sad[0] = pix_abs16_c;
930
-    c->sad[1] = pix_abs8_c;
931
-    c->sse[0] = sse16_c;
932
-    c->sse[1] = sse8_c;
933
-    c->sse[2] = sse4_c;
934
-    SET_CMP_FUNC(quant_psnr)
935
-    SET_CMP_FUNC(rd)
936
-    SET_CMP_FUNC(bit)
937
-    c->vsad[0] = vsad16_c;
938
-    c->vsad[4] = vsad_intra16_c;
939
-    c->vsad[5] = vsad_intra8_c;
940
-    c->vsse[0] = vsse16_c;
941
-    c->vsse[4] = vsse_intra16_c;
942
-    c->vsse[5] = vsse_intra8_c;
943
-    c->nsse[0] = nsse16_c;
944
-    c->nsse[1] = nsse8_c;
945
-
946
-    if (ARCH_ARM)
947
-        ff_dsputil_init_arm(c, avctx);
948
-    if (ARCH_PPC)
949
-        ff_dsputil_init_ppc(c, avctx);
950
-    if (ARCH_X86)
951
-        ff_dsputil_init_x86(c, avctx);
952
-}
953 1
deleted file mode 100644
... ...
@@ -1,85 +0,0 @@
1
-/*
2
- * DSP utils
3
- * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * This file is part of Libav.
7
- *
8
- * Libav is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * Libav is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with Libav; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-/**
24
- * @file
25
- * DSP utils.
26
- * Note, many functions in here may use MMX which trashes the FPU state, it is
27
- * absolutely necessary to call emms_c() between DSP & float/double code.
28
- */
29
-
30
-#ifndef AVCODEC_DSPUTIL_H
31
-#define AVCODEC_DSPUTIL_H
32
-
33
-#include "avcodec.h"
34
-
35
-extern uint32_t ff_square_tab[512];
36
-
37
-struct MpegEncContext;
38
-/* Motion estimation:
39
- * h is limited to { width / 2, width, 2 * width },
40
- * but never larger than 16 and never smaller than 2.
41
- * Although currently h < 4 is not used as functions with
42
- * width < 8 are neither used nor implemented. */
43
-typedef int (*me_cmp_func)(struct MpegEncContext *c,
44
-                           uint8_t *blk1 /* align width (8 or 16) */,
45
-                           uint8_t *blk2 /* align 1 */, int line_size, int h);
46
-
47
-/**
48
- * DSPContext.
49
- */
50
-typedef struct DSPContext {
51
-    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
52
-
53
-    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
54
-    me_cmp_func sse[6];
55
-    me_cmp_func hadamard8_diff[6];
56
-    me_cmp_func dct_sad[6];
57
-    me_cmp_func quant_psnr[6];
58
-    me_cmp_func bit[6];
59
-    me_cmp_func rd[6];
60
-    me_cmp_func vsad[6];
61
-    me_cmp_func vsse[6];
62
-    me_cmp_func nsse[6];
63
-    me_cmp_func dct_max[6];
64
-    me_cmp_func dct264_sad[6];
65
-
66
-    me_cmp_func me_pre_cmp[6];
67
-    me_cmp_func me_cmp[6];
68
-    me_cmp_func me_sub_cmp[6];
69
-    me_cmp_func mb_cmp[6];
70
-    me_cmp_func ildct_cmp[6]; // only width 16 used
71
-    me_cmp_func frame_skip_cmp[6]; // only width 8 used
72
-
73
-    me_cmp_func pix_abs[2][4];
74
-} DSPContext;
75
-
76
-void ff_dsputil_static_init(void);
77
-void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx);
78
-
79
-void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type);
80
-
81
-void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx);
82
-void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx);
83
-void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx);
84
-
85
-#endif /* AVCODEC_DSPUTIL_H */
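The me_cmp_func typedef above is carried over unchanged into the new me_cmp.h: blk1 must be aligned to the block width (8 or 16), blk2 may be unaligned, line_size is the byte stride of both buffers and h the number of rows. A minimal sketch of a caller under that convention; the function name and buffer parameters are illustrative, only the sad[0] slot and the signature come from this commit.

    #include <stdint.h>
    #include "me_cmp.h"

    /* Hypothetical caller: SAD cost of one 16x16 candidate position.
     * "mecc" is assumed to be an initialized MECmpContext and "s" the
     * owning MpegEncContext (NULL is accepted by the plain SAD functions,
     * as the error_resilience.c hunk below shows). */
    static int candidate_cost(MECmpContext *mecc, struct MpegEncContext *s,
                              uint8_t *cur_mb,  /* align to block width */
                              uint8_t *ref,     /* may be unaligned     */
                              int line_size)
    {
        /* slot [0] is the 16x16 variant, slot [1] the 8x8 one; h = 16 rows */
        return mecc->sad[0](s, cur_mb, ref, line_size, 16);
    }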
... ...
@@ -28,7 +28,7 @@
28 28
 #define AVCODEC_DV_H
29 29
 
30 30
 #include "avcodec.h"
31
-#include "dsputil.h"
31
+#include "me_cmp.h"
32 32
 #include "get_bits.h"
33 33
 #include "dv_profile.h"
34 34
 
... ...
@@ -28,9 +28,9 @@
28 28
 #include "libavutil/pixdesc.h"
29 29
 #include "config.h"
30 30
 #include "avcodec.h"
31
-#include "dsputil.h"
32 31
 #include "fdctdsp.h"
33 32
 #include "internal.h"
33
+#include "me_cmp.h"
34 34
 #include "pixblockdsp.h"
35 35
 #include "put_bits.h"
36 36
 #include "dv.h"
... ...
@@ -40,8 +40,8 @@
40 40
 static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
41 41
 {
42 42
     DVVideoContext *s = avctx->priv_data;
43
-    DSPContext dsp;
44 43
     FDCTDSPContext fdsp;
44
+    MECmpContext mecc;
45 45
     PixblockDSPContext pdsp;
46 46
     int ret;
47 47
 
... ...
@@ -65,13 +65,13 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
65 65
 
66 66
     dv_vlc_map_tableinit();
67 67
 
68
-    ff_dsputil_init(&dsp, avctx);
69 68
     ff_fdctdsp_init(&fdsp, avctx);
69
+    ff_me_cmp_init(&mecc, avctx);
70 70
     ff_pixblockdsp_init(&pdsp, avctx);
71
-    ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp);
71
+    ff_set_cmp(&mecc, mecc.ildct_cmp, avctx->ildct_cmp);
72 72
 
73 73
     s->get_pixels = pdsp.get_pixels;
74
-    s->ildct_cmp  = dsp.ildct_cmp[5];
74
+    s->ildct_cmp  = mecc.ildct_cmp[5];
75 75
 
76 76
     s->fdct[0]    = fdsp.fdct;
77 77
     s->fdct[1]    = fdsp.fdct248;
... ...
@@ -715,11 +715,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
715 715
                 } else {
716 716
                     ff_thread_await_progress(s->last_pic.tf, mb_y, 0);
717 717
                 }
718
-                is_intra_likely += s->dsp->sad[0](NULL, last_mb_ptr, mb_ptr,
719
-                                                 linesize[0], 16);
720
-                is_intra_likely -= s->dsp->sad[0](NULL, last_mb_ptr,
721
-                                                 last_mb_ptr + linesize[0] * 16,
722
-                                                 linesize[0], 16);
718
+                is_intra_likely += s->mecc->sad[0](NULL, last_mb_ptr, mb_ptr,
719
+                                                   linesize[0], 16);
720
+                is_intra_likely -= s->mecc->sad[0](NULL, last_mb_ptr,
721
+                                                   last_mb_ptr + linesize[0] * 16,
722
+                                                   linesize[0], 16);
723 723
             } else {
724 724
                 if (IS_INTRA(s->cur_pic.mb_type[mb_xy]))
725 725
                    is_intra_likely++;
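The two SAD calls above implement the intra-likelihood vote: the first measures how much the macroblock changed relative to the co-located block in the previous picture, the second how much the previous picture varies over the same 16-line span, and the difference is added to is_intra_likely. A standalone restatement with hypothetical helper names (sad16, intra_vote) that are not part of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Plain C stand-in for mecc->sad[0]: 16x16 sum of absolute differences. */
    static int sad16(const uint8_t *a, const uint8_t *b, int stride)
    {
        int x, y, sum = 0;
        for (y = 0; y < 16; y++, a += stride, b += stride)
            for (x = 0; x < 16; x++)
                sum += abs(a[x] - b[x]);
        return sum;
    }

    static int intra_vote(const uint8_t *mb, const uint8_t *last_mb, int stride)
    {
        int temporal = sad16(last_mb, mb, stride);                    /* change vs. previous frame      */
        int spatial  = sad16(last_mb, last_mb + 16 * stride, stride); /* activity within previous frame */
        return temporal - spatial;  /* > 0 favours intra concealment */
    }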
... ...
@@ -23,7 +23,7 @@
23 23
 #include <stdint.h>
24 24
 
25 25
 #include "avcodec.h"
26
-#include "dsputil.h"
26
+#include "me_cmp.h"
27 27
 #include "thread.h"
28 28
 
29 29
 ///< current MB is the first after a resync marker
... ...
@@ -52,7 +52,7 @@ typedef struct ERPicture {
52 52
 
53 53
 typedef struct ERContext {
54 54
     AVCodecContext *avctx;
55
-    DSPContext *dsp;
55
+    MECmpContext *mecc;
56 56
 
57 57
     int *mb_index2xy;
58 58
     int mb_num;
... ...
@@ -33,7 +33,6 @@
33 33
 #include "internal.h"
34 34
 #include "cabac.h"
35 35
 #include "cabac_functions.h"
36
-#include "dsputil.h"
37 36
 #include "error_resilience.h"
38 37
 #include "avcodec.h"
39 38
 #include "h264.h"
... ...
@@ -42,6 +41,7 @@
42 42
 #include "h264_mvpred.h"
43 43
 #include "golomb.h"
44 44
 #include "mathops.h"
45
+#include "me_cmp.h"
45 46
 #include "mpegutils.h"
46 47
 #include "rectangle.h"
47 48
 #include "svq3.h"
... ...
@@ -490,7 +490,7 @@ int ff_h264_context_init(H264Context *h)
490 490
     if (CONFIG_ERROR_RESILIENCE) {
491 491
         /* init ER */
492 492
         er->avctx          = h->avctx;
493
-        er->dsp            = &h->dsp;
493
+        er->mecc           = &h->mecc;
494 494
         er->decode_mb      = h264_er_decode_mb;
495 495
         er->opaque         = h;
496 496
         er->quarter_sample = 1;
... ...
@@ -620,7 +620,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
620 620
 
621 621
     /* needed so that IDCT permutation is known early */
622 622
     if (CONFIG_ERROR_RESILIENCE)
623
-        ff_dsputil_init(&h->dsp, h->avctx);
623
+        ff_me_cmp_init(&h->mecc, h->avctx);
624 624
     ff_videodsp_init(&h->vdsp, 8);
625 625
 
626 626
     memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t));
... ...
@@ -1234,7 +1234,7 @@ int ff_h264_set_parameter_from_sps(H264Context *h)
1234 1234
             ff_h264_pred_init(&h->hpc, h->avctx->codec_id, h->sps.bit_depth_luma,
1235 1235
                               h->sps.chroma_format_idc);
1236 1236
             if (CONFIG_ERROR_RESILIENCE)
1237
-                ff_dsputil_init(&h->dsp, h->avctx);
1237
+                ff_me_cmp_init(&h->mecc, h->avctx);
1238 1238
             ff_videodsp_init(&h->vdsp, h->sps.bit_depth_luma);
1239 1239
         } else {
1240 1240
             av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n",
... ...
@@ -30,13 +30,13 @@
30 30
 
31 31
 #include "libavutil/intreadwrite.h"
32 32
 #include "cabac.h"
33
-#include "dsputil.h"
34 33
 #include "error_resilience.h"
35 34
 #include "get_bits.h"
36 35
 #include "h264chroma.h"
37 36
 #include "h264dsp.h"
38 37
 #include "h264pred.h"
39 38
 #include "h264qpel.h"
39
+#include "me_cmp.h"
40 40
 #include "mpegutils.h"
41 41
 #include "parser.h"
42 42
 #include "qpeldsp.h"
... ...
@@ -302,7 +302,7 @@ typedef struct H264Picture {
302 302
  */
303 303
 typedef struct H264Context {
304 304
     AVCodecContext *avctx;
305
-    DSPContext       dsp;
305
+    MECmpContext mecc;
306 306
     VideoDSPContext vdsp;
307 307
     H264DSPContext h264dsp;
308 308
     H264ChromaContext h264chroma;
... ...
@@ -31,7 +31,6 @@
31 31
 #include "internal.h"
32 32
 #include "cabac.h"
33 33
 #include "cabac_functions.h"
34
-#include "dsputil.h"
35 34
 #include "error_resilience.h"
36 35
 #include "avcodec.h"
37 36
 #include "h264.h"
... ...
@@ -1119,7 +1118,7 @@ static int h264_slice_header_init(H264Context *h, int reinit)
1119 1119
             if (!c)
1120 1120
                 return AVERROR(ENOMEM);
1121 1121
             c->avctx             = h->avctx;
1122
-            c->dsp               = h->dsp;
1122
+            c->mecc              = h->mecc;
1123 1123
             c->vdsp              = h->vdsp;
1124 1124
             c->h264dsp           = h->h264dsp;
1125 1125
             c->h264qpel          = h->h264qpel;
1126 1126
new file mode 100644
... ...
@@ -0,0 +1,942 @@
0
+/*
1
+ * This file is part of Libav.
2
+ *
3
+ * Libav is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * Libav is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with Libav; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include "libavutil/attributes.h"
19
+#include "avcodec.h"
20
+#include "copy_block.h"
21
+#include "simple_idct.h"
22
+#include "me_cmp.h"
23
+#include "mpegvideo.h"
24
+#include "config.h"
25
+
26
+uint32_t ff_square_tab[512] = { 0, };
27
+
28
+static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
29
+                  int line_size, int h)
30
+{
31
+    int s = 0, i;
32
+    uint32_t *sq = ff_square_tab + 256;
33
+
34
+    for (i = 0; i < h; i++) {
35
+        s    += sq[pix1[0] - pix2[0]];
36
+        s    += sq[pix1[1] - pix2[1]];
37
+        s    += sq[pix1[2] - pix2[2]];
38
+        s    += sq[pix1[3] - pix2[3]];
39
+        pix1 += line_size;
40
+        pix2 += line_size;
41
+    }
42
+    return s;
43
+}
44
+
45
+static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
46
+                  int line_size, int h)
47
+{
48
+    int s = 0, i;
49
+    uint32_t *sq = ff_square_tab + 256;
50
+
51
+    for (i = 0; i < h; i++) {
52
+        s    += sq[pix1[0] - pix2[0]];
53
+        s    += sq[pix1[1] - pix2[1]];
54
+        s    += sq[pix1[2] - pix2[2]];
55
+        s    += sq[pix1[3] - pix2[3]];
56
+        s    += sq[pix1[4] - pix2[4]];
57
+        s    += sq[pix1[5] - pix2[5]];
58
+        s    += sq[pix1[6] - pix2[6]];
59
+        s    += sq[pix1[7] - pix2[7]];
60
+        pix1 += line_size;
61
+        pix2 += line_size;
62
+    }
63
+    return s;
64
+}
65
+
66
+static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
67
+                   int line_size, int h)
68
+{
69
+    int s = 0, i;
70
+    uint32_t *sq = ff_square_tab + 256;
71
+
72
+    for (i = 0; i < h; i++) {
73
+        s += sq[pix1[0]  - pix2[0]];
74
+        s += sq[pix1[1]  - pix2[1]];
75
+        s += sq[pix1[2]  - pix2[2]];
76
+        s += sq[pix1[3]  - pix2[3]];
77
+        s += sq[pix1[4]  - pix2[4]];
78
+        s += sq[pix1[5]  - pix2[5]];
79
+        s += sq[pix1[6]  - pix2[6]];
80
+        s += sq[pix1[7]  - pix2[7]];
81
+        s += sq[pix1[8]  - pix2[8]];
82
+        s += sq[pix1[9]  - pix2[9]];
83
+        s += sq[pix1[10] - pix2[10]];
84
+        s += sq[pix1[11] - pix2[11]];
85
+        s += sq[pix1[12] - pix2[12]];
86
+        s += sq[pix1[13] - pix2[13]];
87
+        s += sq[pix1[14] - pix2[14]];
88
+        s += sq[pix1[15] - pix2[15]];
89
+
90
+        pix1 += line_size;
91
+        pix2 += line_size;
92
+    }
93
+    return s;
94
+}
95
+
96
+static int sum_abs_dctelem_c(int16_t *block)
97
+{
98
+    int sum = 0, i;
99
+
100
+    for (i = 0; i < 64; i++)
101
+        sum += FFABS(block[i]);
102
+    return sum;
103
+}
104
+
105
+#define avg2(a, b) ((a + b + 1) >> 1)
106
+#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
107
+
108
+static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
109
+                              int line_size, int h)
110
+{
111
+    int s = 0, i;
112
+
113
+    for (i = 0; i < h; i++) {
114
+        s    += abs(pix1[0]  - pix2[0]);
115
+        s    += abs(pix1[1]  - pix2[1]);
116
+        s    += abs(pix1[2]  - pix2[2]);
117
+        s    += abs(pix1[3]  - pix2[3]);
118
+        s    += abs(pix1[4]  - pix2[4]);
119
+        s    += abs(pix1[5]  - pix2[5]);
120
+        s    += abs(pix1[6]  - pix2[6]);
121
+        s    += abs(pix1[7]  - pix2[7]);
122
+        s    += abs(pix1[8]  - pix2[8]);
123
+        s    += abs(pix1[9]  - pix2[9]);
124
+        s    += abs(pix1[10] - pix2[10]);
125
+        s    += abs(pix1[11] - pix2[11]);
126
+        s    += abs(pix1[12] - pix2[12]);
127
+        s    += abs(pix1[13] - pix2[13]);
128
+        s    += abs(pix1[14] - pix2[14]);
129
+        s    += abs(pix1[15] - pix2[15]);
130
+        pix1 += line_size;
131
+        pix2 += line_size;
132
+    }
133
+    return s;
134
+}
135
+
136
+static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
137
+                          int line_size, int h)
138
+{
139
+    int s = 0, i;
140
+
141
+    for (i = 0; i < h; i++) {
142
+        s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
143
+        s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
144
+        s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
145
+        s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
146
+        s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
147
+        s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
148
+        s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
149
+        s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
150
+        s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
151
+        s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
152
+        s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
153
+        s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
154
+        s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
155
+        s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
156
+        s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
157
+        s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
158
+        pix1 += line_size;
159
+        pix2 += line_size;
160
+    }
161
+    return s;
162
+}
163
+
164
+static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
165
+                          int line_size, int h)
166
+{
167
+    int s = 0, i;
168
+    uint8_t *pix3 = pix2 + line_size;
169
+
170
+    for (i = 0; i < h; i++) {
171
+        s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
172
+        s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
173
+        s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
174
+        s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
175
+        s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
176
+        s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
177
+        s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
178
+        s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
179
+        s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
180
+        s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
181
+        s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
182
+        s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
183
+        s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
184
+        s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
185
+        s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
186
+        s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
187
+        pix1 += line_size;
188
+        pix2 += line_size;
189
+        pix3 += line_size;
190
+    }
191
+    return s;
192
+}
193
+
194
+static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
195
+                           int line_size, int h)
196
+{
197
+    int s = 0, i;
198
+    uint8_t *pix3 = pix2 + line_size;
199
+
200
+    for (i = 0; i < h; i++) {
201
+        s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
202
+        s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
203
+        s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
204
+        s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
205
+        s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
206
+        s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
207
+        s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
208
+        s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
209
+        s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
210
+        s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
211
+        s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
212
+        s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
213
+        s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
214
+        s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
215
+        s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
216
+        s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
217
+        pix1 += line_size;
218
+        pix2 += line_size;
219
+        pix3 += line_size;
220
+    }
221
+    return s;
222
+}
223
+
224
+static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
225
+                             int line_size, int h)
226
+{
227
+    int s = 0, i;
228
+
229
+    for (i = 0; i < h; i++) {
230
+        s    += abs(pix1[0] - pix2[0]);
231
+        s    += abs(pix1[1] - pix2[1]);
232
+        s    += abs(pix1[2] - pix2[2]);
233
+        s    += abs(pix1[3] - pix2[3]);
234
+        s    += abs(pix1[4] - pix2[4]);
235
+        s    += abs(pix1[5] - pix2[5]);
236
+        s    += abs(pix1[6] - pix2[6]);
237
+        s    += abs(pix1[7] - pix2[7]);
238
+        pix1 += line_size;
239
+        pix2 += line_size;
240
+    }
241
+    return s;
242
+}
243
+
244
+static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
245
+                         int line_size, int h)
246
+{
247
+    int s = 0, i;
248
+
249
+    for (i = 0; i < h; i++) {
250
+        s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
251
+        s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
252
+        s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
253
+        s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
254
+        s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
255
+        s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
256
+        s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
257
+        s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
258
+        pix1 += line_size;
259
+        pix2 += line_size;
260
+    }
261
+    return s;
262
+}
263
+
264
+static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
265
+                         int line_size, int h)
266
+{
267
+    int s = 0, i;
268
+    uint8_t *pix3 = pix2 + line_size;
269
+
270
+    for (i = 0; i < h; i++) {
271
+        s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
272
+        s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
273
+        s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
274
+        s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
275
+        s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
276
+        s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
277
+        s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
278
+        s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
279
+        pix1 += line_size;
280
+        pix2 += line_size;
281
+        pix3 += line_size;
282
+    }
283
+    return s;
284
+}
285
+
286
+static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
287
+                          int line_size, int h)
288
+{
289
+    int s = 0, i;
290
+    uint8_t *pix3 = pix2 + line_size;
291
+
292
+    for (i = 0; i < h; i++) {
293
+        s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
294
+        s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
295
+        s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
296
+        s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
297
+        s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
298
+        s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
299
+        s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
300
+        s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
301
+        pix1 += line_size;
302
+        pix2 += line_size;
303
+        pix3 += line_size;
304
+    }
305
+    return s;
306
+}
307
+
308
+static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
309
+{
310
+    int score1 = 0, score2 = 0, x, y;
311
+
312
+    for (y = 0; y < h; y++) {
313
+        for (x = 0; x < 16; x++)
314
+            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
315
+        if (y + 1 < h) {
316
+            for (x = 0; x < 15; x++)
317
+                score2 += FFABS(s1[x]     - s1[x + stride] -
318
+                                s1[x + 1] + s1[x + stride + 1]) -
319
+                          FFABS(s2[x]     - s2[x + stride] -
320
+                                s2[x + 1] + s2[x + stride + 1]);
321
+        }
322
+        s1 += stride;
323
+        s2 += stride;
324
+    }
325
+
326
+    if (c)
327
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
328
+    else
329
+        return score1 + FFABS(score2) * 8;
330
+}
331
+
332
+static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
333
+{
334
+    int score1 = 0, score2 = 0, x, y;
335
+
336
+    for (y = 0; y < h; y++) {
337
+        for (x = 0; x < 8; x++)
338
+            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
339
+        if (y + 1 < h) {
340
+            for (x = 0; x < 7; x++)
341
+                score2 += FFABS(s1[x]     - s1[x + stride] -
342
+                                s1[x + 1] + s1[x + stride + 1]) -
343
+                          FFABS(s2[x]     - s2[x + stride] -
344
+                                s2[x + 1] + s2[x + stride + 1]);
345
+        }
346
+        s1 += stride;
347
+        s2 += stride;
348
+    }
349
+
350
+    if (c)
351
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
352
+    else
353
+        return score1 + FFABS(score2) * 8;
354
+}
355
+
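Reading the loops above directly, the NSSE ("noise preserving SSE") score combines the plain squared error with the change in local texture over each 2x2 neighbourhood, weighted by avctx->nsse_weight (8 when no context is passed):

    \mathrm{NSSE}(s_1, s_2) = \sum_{x,y} \bigl(s_1[x,y] - s_2[x,y]\bigr)^2
        + w \cdot \Bigl| \sum_{x,y} \bigl( |D s_1[x,y]| - |D s_2[x,y]| \bigr) \Bigr|,
    \qquad D s[x,y] = s[x,y] - s[x+1,y] - s[x,y+1] + s[x+1,y+1]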
356
+static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
357
+                    int stride, int h)
358
+{
359
+    return 0;
360
+}
361
+
362
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
363
+{
364
+    int i;
365
+
366
+    memset(cmp, 0, sizeof(void *) * 6);
367
+
368
+    for (i = 0; i < 6; i++) {
369
+        switch (type & 0xFF) {
370
+        case FF_CMP_SAD:
371
+            cmp[i] = c->sad[i];
372
+            break;
373
+        case FF_CMP_SATD:
374
+            cmp[i] = c->hadamard8_diff[i];
375
+            break;
376
+        case FF_CMP_SSE:
377
+            cmp[i] = c->sse[i];
378
+            break;
379
+        case FF_CMP_DCT:
380
+            cmp[i] = c->dct_sad[i];
381
+            break;
382
+        case FF_CMP_DCT264:
383
+            cmp[i] = c->dct264_sad[i];
384
+            break;
385
+        case FF_CMP_DCTMAX:
386
+            cmp[i] = c->dct_max[i];
387
+            break;
388
+        case FF_CMP_PSNR:
389
+            cmp[i] = c->quant_psnr[i];
390
+            break;
391
+        case FF_CMP_BIT:
392
+            cmp[i] = c->bit[i];
393
+            break;
394
+        case FF_CMP_RD:
395
+            cmp[i] = c->rd[i];
396
+            break;
397
+        case FF_CMP_VSAD:
398
+            cmp[i] = c->vsad[i];
399
+            break;
400
+        case FF_CMP_VSSE:
401
+            cmp[i] = c->vsse[i];
402
+            break;
403
+        case FF_CMP_ZERO:
404
+            cmp[i] = zero_cmp;
405
+            break;
406
+        case FF_CMP_NSSE:
407
+            cmp[i] = c->nsse[i];
408
+            break;
409
+        default:
410
+            av_log(NULL, AV_LOG_ERROR,
411
+                   "internal error in cmp function selection\n");
412
+        }
413
+    }
414
+}
415
+
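ff_set_cmp maps one of the FF_CMP_* constants from AVCodecContext onto a table of six me_cmp_func pointers (slot 0 is the 16x16 variant, slot 1 the 8x8 one; the remaining slots are only filled for some metrics). A hedged sketch of the expected encoder-side use, mirroring the dvenc.c hunk earlier in this commit; the wrapper name and parameter layout are illustrative:

    #include "avcodec.h"
    #include "me_cmp.h"

    /* Sketch: resolve the user-selected metrics into callable tables. */
    static void init_cmp_tables(MECmpContext *mecc, AVCodecContext *avctx,
                                me_cmp_func me_cmp[6], me_cmp_func mb_cmp[6])
    {
        ff_me_cmp_init(mecc, avctx);              /* C tables plus per-arch overrides */
        ff_set_cmp(mecc, me_cmp, avctx->me_cmp);  /* metric for the motion search     */
        ff_set_cmp(mecc, mb_cmp, avctx->mb_cmp);  /* metric for the final MB decision */
    }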
416
+#define BUTTERFLY2(o1, o2, i1, i2)              \
417
+    o1 = (i1) + (i2);                           \
418
+    o2 = (i1) - (i2);
419
+
420
+#define BUTTERFLY1(x, y)                        \
421
+    {                                           \
422
+        int a, b;                               \
423
+        a = x;                                  \
424
+        b = y;                                  \
425
+        x = a + b;                              \
426
+        y = a - b;                              \
427
+    }
428
+
429
+#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
430
+
431
+static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
432
+                               uint8_t *src, int stride, int h)
433
+{
434
+    int i, temp[64], sum = 0;
435
+
436
+    assert(h == 8);
437
+
438
+    for (i = 0; i < 8; i++) {
439
+        // FIXME: try pointer walks
440
+        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
441
+                   src[stride * i + 0] - dst[stride * i + 0],
442
+                   src[stride * i + 1] - dst[stride * i + 1]);
443
+        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
444
+                   src[stride * i + 2] - dst[stride * i + 2],
445
+                   src[stride * i + 3] - dst[stride * i + 3]);
446
+        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
447
+                   src[stride * i + 4] - dst[stride * i + 4],
448
+                   src[stride * i + 5] - dst[stride * i + 5]);
449
+        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
450
+                   src[stride * i + 6] - dst[stride * i + 6],
451
+                   src[stride * i + 7] - dst[stride * i + 7]);
452
+
453
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
454
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
455
+        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
456
+        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
457
+
458
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
459
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
460
+        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
461
+        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
462
+    }
463
+
464
+    for (i = 0; i < 8; i++) {
465
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
466
+        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
467
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
468
+        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
469
+
470
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
471
+        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
472
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
473
+        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
474
+
475
+        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
476
+               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
477
+               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
478
+               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
479
+    }
480
+    return sum;
481
+}
482
+
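hadamard8_diff8x8_c is the SATD metric: the BUTTERFLY passes apply an 8x8 Hadamard transform to the residual, first along rows and then along columns, and the absolute transform coefficients are summed. The 1-D building block is nothing more than repeated sum/difference pairs; a 4-point illustration, not code from the patch:

    /* Illustrative 4-point, 1-D Hadamard butterfly; the 8-point transform in
     * the macros above applies the same pairing one more time. */
    static void hadamard4_1d(int v[4])
    {
        int a0 = v[0] + v[1], a1 = v[0] - v[1];
        int a2 = v[2] + v[3], a3 = v[2] - v[3];
        v[0] = a0 + a2;   /* + + + + */
        v[1] = a1 + a3;   /* + - + - */
        v[2] = a0 - a2;   /* + + - - */
        v[3] = a1 - a3;   /* + - - + */
    }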
483
+static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
484
+                                uint8_t *dummy, int stride, int h)
485
+{
486
+    int i, temp[64], sum = 0;
487
+
488
+    assert(h == 8);
489
+
490
+    for (i = 0; i < 8; i++) {
491
+        // FIXME: try pointer walks
492
+        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
493
+                   src[stride * i + 0], src[stride * i + 1]);
494
+        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
495
+                   src[stride * i + 2], src[stride * i + 3]);
496
+        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
497
+                   src[stride * i + 4], src[stride * i + 5]);
498
+        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
499
+                   src[stride * i + 6], src[stride * i + 7]);
500
+
501
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
502
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
503
+        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
504
+        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
505
+
506
+        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
507
+        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
508
+        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
509
+        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
510
+    }
511
+
512
+    for (i = 0; i < 8; i++) {
513
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
514
+        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
515
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
516
+        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
517
+
518
+        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
519
+        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
520
+        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
521
+        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
522
+
523
+        sum +=
524
+            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
525
+            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
526
+            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
527
+            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
528
+    }
529
+
530
+    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
531
+
532
+    return sum;
533
+}
534
+
535
+static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
536
+                        uint8_t *src2, int stride, int h)
537
+{
538
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
539
+
540
+    assert(h == 8);
541
+
542
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
543
+    s->fdsp.fdct(temp);
544
+    return s->mecc.sum_abs_dctelem(temp);
545
+}
546
+
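dct_sad8x8_c scores a block pair by the sum of absolute DCT coefficients of their difference, a rough proxy for the bits a DCT-based codec would spend on the residual; in equation form, reading off the three calls above:

    \mathrm{DCT\_SAD}(b_1, b_2) = \sum_{k=0}^{63} \bigl|\, \mathrm{DCT}_{8\times 8}(b_1 - b_2)[k] \,\bigr|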
547
+#if CONFIG_GPL
548
+#define DCT8_1D                                         \
549
+    {                                                   \
550
+        const int s07 = SRC(0) + SRC(7);                \
551
+        const int s16 = SRC(1) + SRC(6);                \
552
+        const int s25 = SRC(2) + SRC(5);                \
553
+        const int s34 = SRC(3) + SRC(4);                \
554
+        const int a0  = s07 + s34;                      \
555
+        const int a1  = s16 + s25;                      \
556
+        const int a2  = s07 - s34;                      \
557
+        const int a3  = s16 - s25;                      \
558
+        const int d07 = SRC(0) - SRC(7);                \
559
+        const int d16 = SRC(1) - SRC(6);                \
560
+        const int d25 = SRC(2) - SRC(5);                \
561
+        const int d34 = SRC(3) - SRC(4);                \
562
+        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
563
+        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
564
+        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
565
+        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
566
+        DST(0, a0 + a1);                                \
567
+        DST(1, a4 + (a7 >> 2));                         \
568
+        DST(2, a2 + (a3 >> 1));                         \
569
+        DST(3, a5 + (a6 >> 2));                         \
570
+        DST(4, a0 - a1);                                \
571
+        DST(5, a6 - (a5 >> 2));                         \
572
+        DST(6, (a2 >> 1) - a3);                         \
573
+        DST(7, (a4 >> 2) - a7);                         \
574
+    }
575
+
576
+static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
577
+                           uint8_t *src2, int stride, int h)
578
+{
579
+    int16_t dct[8][8];
580
+    int i, sum = 0;
581
+
582
+    s->pdsp.diff_pixels(dct[0], src1, src2, stride);
583
+
584
+#define SRC(x) dct[i][x]
585
+#define DST(x, v) dct[i][x] = v
586
+    for (i = 0; i < 8; i++)
587
+        DCT8_1D
588
+#undef SRC
589
+#undef DST
590
+
591
+#define SRC(x) dct[x][i]
592
+#define DST(x, v) sum += FFABS(v)
593
+        for (i = 0; i < 8; i++)
594
+            DCT8_1D
595
+#undef SRC
596
+#undef DST
597
+            return sum;
598
+}
599
+#endif
600
+
601
+static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
602
+                        uint8_t *src2, int stride, int h)
603
+{
604
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
605
+    int sum = 0, i;
606
+
607
+    assert(h == 8);
608
+
609
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
610
+    s->fdsp.fdct(temp);
611
+
612
+    for (i = 0; i < 64; i++)
613
+        sum = FFMAX(sum, FFABS(temp[i]));
614
+
615
+    return sum;
616
+}
617
+
618
+static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
619
+                           uint8_t *src2, int stride, int h)
620
+{
621
+    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
622
+    int16_t *const bak = temp + 64;
623
+    int sum = 0, i;
624
+
625
+    assert(h == 8);
626
+    s->mb_intra = 0;
627
+
628
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
629
+
630
+    memcpy(bak, temp, 64 * sizeof(int16_t));
631
+
632
+    s->block_last_index[0 /* FIXME */] =
633
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
634
+    s->dct_unquantize_inter(s, temp, 0, s->qscale);
635
+    ff_simple_idct_8(temp); // FIXME
636
+
637
+    for (i = 0; i < 64; i++)
638
+        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
639
+
640
+    return sum;
641
+}
642
+
643
+static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
644
+                   int stride, int h)
645
+{
646
+    const uint8_t *scantable = s->intra_scantable.permutated;
647
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
648
+    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
649
+    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
650
+    int i, last, run, bits, level, distortion, start_i;
651
+    const int esc_length = s->ac_esc_length;
652
+    uint8_t *length, *last_length;
653
+
654
+    assert(h == 8);
655
+
656
+    copy_block8(lsrc1, src1, 8, stride, 8);
657
+    copy_block8(lsrc2, src2, 8, stride, 8);
658
+
659
+    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
660
+
661
+    s->block_last_index[0 /* FIXME */] =
662
+    last                               =
663
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
664
+
665
+    bits = 0;
666
+
667
+    if (s->mb_intra) {
668
+        start_i     = 1;
669
+        length      = s->intra_ac_vlc_length;
670
+        last_length = s->intra_ac_vlc_last_length;
671
+        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
672
+    } else {
673
+        start_i     = 0;
674
+        length      = s->inter_ac_vlc_length;
675
+        last_length = s->inter_ac_vlc_last_length;
676
+    }
677
+
678
+    if (last >= start_i) {
679
+        run = 0;
680
+        for (i = start_i; i < last; i++) {
681
+            int j = scantable[i];
682
+            level = temp[j];
683
+
684
+            if (level) {
685
+                level += 64;
686
+                if ((level & (~127)) == 0)
687
+                    bits += length[UNI_AC_ENC_INDEX(run, level)];
688
+                else
689
+                    bits += esc_length;
690
+                run = 0;
691
+            } else
692
+                run++;
693
+        }
694
+        i = scantable[last];
695
+
696
+        level = temp[i] + 64;
697
+
698
+        assert(level - 64);
699
+
700
+        if ((level & (~127)) == 0) {
701
+            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
702
+        } else
703
+            bits += esc_length;
704
+    }
705
+
706
+    if (last >= 0) {
707
+        if (s->mb_intra)
708
+            s->dct_unquantize_intra(s, temp, 0, s->qscale);
709
+        else
710
+            s->dct_unquantize_inter(s, temp, 0, s->qscale);
711
+    }
712
+
713
+    s->idsp.idct_add(lsrc2, 8, temp);
714
+
715
+    distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8);
716
+
717
+    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
718
+}
719
+
720
+static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
721
+                    int stride, int h)
722
+{
723
+    const uint8_t *scantable = s->intra_scantable.permutated;
724
+    LOCAL_ALIGNED_16(int16_t, temp, [64]);
725
+    int i, last, run, bits, level, start_i;
726
+    const int esc_length = s->ac_esc_length;
727
+    uint8_t *length, *last_length;
728
+
729
+    assert(h == 8);
730
+
731
+    s->pdsp.diff_pixels(temp, src1, src2, stride);
732
+
733
+    s->block_last_index[0 /* FIXME */] =
734
+    last                               =
735
+        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
736
+
737
+    bits = 0;
738
+
739
+    if (s->mb_intra) {
740
+        start_i     = 1;
741
+        length      = s->intra_ac_vlc_length;
742
+        last_length = s->intra_ac_vlc_last_length;
743
+        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
744
+    } else {
745
+        start_i     = 0;
746
+        length      = s->inter_ac_vlc_length;
747
+        last_length = s->inter_ac_vlc_last_length;
748
+    }
749
+
750
+    if (last >= start_i) {
751
+        run = 0;
752
+        for (i = start_i; i < last; i++) {
753
+            int j = scantable[i];
754
+            level = temp[j];
755
+
756
+            if (level) {
757
+                level += 64;
758
+                if ((level & (~127)) == 0)
759
+                    bits += length[UNI_AC_ENC_INDEX(run, level)];
760
+                else
761
+                    bits += esc_length;
762
+                run = 0;
763
+            } else
764
+                run++;
765
+        }
766
+        i = scantable[last];
767
+
768
+        level = temp[i] + 64;
769
+
770
+        assert(level - 64);
771
+
772
+        if ((level & (~127)) == 0)
773
+            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
774
+        else
775
+            bits += esc_length;
776
+    }
777
+
778
+    return bits;
779
+}
780
+
781
+#define VSAD_INTRA(size)                                                \
782
+static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
783
+                                    uint8_t *s, uint8_t *dummy,         \
784
+                                    int stride, int h)                  \
785
+{                                                                       \
786
+    int score = 0, x, y;                                                \
787
+                                                                        \
788
+    for (y = 1; y < h; y++) {                                           \
789
+        for (x = 0; x < size; x += 4) {                                 \
790
+            score += FFABS(s[x]     - s[x + stride])     +              \
791
+                     FFABS(s[x + 1] - s[x + stride + 1]) +              \
792
+                     FFABS(s[x + 2] - s[x + 2 + stride]) +              \
793
+                     FFABS(s[x + 3] - s[x + 3 + stride]);               \
794
+        }                                                               \
795
+        s += stride;                                                    \
796
+    }                                                                   \
797
+                                                                        \
798
+    return score;                                                       \
799
+}
800
+VSAD_INTRA(8)
801
+VSAD_INTRA(16)
802
+
803
+static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
804
+                    int stride, int h)
805
+{
806
+    int score = 0, x, y;
807
+
808
+    for (y = 1; y < h; y++) {
809
+        for (x = 0; x < 16; x++)
810
+            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
811
+        s1 += stride;
812
+        s2 += stride;
813
+    }
814
+
815
+    return score;
816
+}
817
+
818
+#define SQ(a) ((a) * (a))
819
+#define VSSE_INTRA(size)                                                \
820
+static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
821
+                                    uint8_t *s, uint8_t *dummy,         \
822
+                                    int stride, int h)                  \
823
+{                                                                       \
824
+    int score = 0, x, y;                                                \
825
+                                                                        \
826
+    for (y = 1; y < h; y++) {                                           \
827
+        for (x = 0; x < size; x += 4) {                                 \
828
+            score += SQ(s[x]     - s[x + stride]) +                     \
829
+                     SQ(s[x + 1] - s[x + stride + 1]) +                 \
830
+                     SQ(s[x + 2] - s[x + stride + 2]) +                 \
831
+                     SQ(s[x + 3] - s[x + stride + 3]);                  \
832
+        }                                                               \
833
+        s += stride;                                                    \
834
+    }                                                                   \
835
+                                                                        \
836
+    return score;                                                       \
837
+}
838
+VSSE_INTRA(8)
839
+VSSE_INTRA(16)
840
+
841
+static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
842
+                    int stride, int h)
843
+{
844
+    int score = 0, x, y;
845
+
846
+    for (y = 1; y < h; y++) {
847
+        for (x = 0; x < 16; x++)
848
+            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
849
+        s1 += stride;
850
+        s2 += stride;
851
+    }
852
+
853
+    return score;
854
+}
855
+
856
+#define WRAPPER8_16_SQ(name8, name16)                                   \
857
+static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
858
+                  int stride, int h)                                    \
859
+{                                                                       \
860
+    int score = 0;                                                      \
861
+                                                                        \
862
+    score += name8(s, dst, src, stride, 8);                             \
863
+    score += name8(s, dst + 8, src + 8, stride, 8);                     \
864
+    if (h == 16) {                                                      \
865
+        dst   += 8 * stride;                                            \
866
+        src   += 8 * stride;                                            \
867
+        score += name8(s, dst, src, stride, 8);                         \
868
+        score += name8(s, dst + 8, src + 8, stride, 8);                 \
869
+    }                                                                   \
870
+    return score;                                                       \
871
+}
872
+
873
+WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
874
+WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
875
+WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
876
+#if CONFIG_GPL
877
+WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
878
+#endif
879
+WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
880
+WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
881
+WRAPPER8_16_SQ(rd8x8_c, rd16_c)
882
+WRAPPER8_16_SQ(bit8x8_c, bit16_c)
883
+
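
The WRAPPER8_16_SQ macro composes each 16-wide comparison from the corresponding 8x8 kernel: the two horizontally adjacent 8x8 blocks are always scored, and the lower pair is added when h == 16. Expanded by hand for the Hadamard case, the generated function is equivalent to this sketch (illustration only, not part of the patch):

    /* Hand expansion of WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c),
     * shown only to make the quadrant layout explicit. */
    static int hadamard8_diff16_expanded(MpegEncContext *s, uint8_t *dst,
                                         uint8_t *src, int stride, int h)
    {
        int score = hadamard8_diff8x8_c(s, dst,     src,     stride, 8) +  /* top left  */
                    hadamard8_diff8x8_c(s, dst + 8, src + 8, stride, 8);   /* top right */

        if (h == 16) {
            dst   += 8 * stride;
            src   += 8 * stride;
            score += hadamard8_diff8x8_c(s, dst,     src,     stride, 8) + /* bottom left  */
                     hadamard8_diff8x8_c(s, dst + 8, src + 8, stride, 8);  /* bottom right */
        }
        return score;
    }
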
884
+av_cold void ff_me_cmp_init_static(void)
885
+{
886
+    int i;
887
+
888
+    for (i = 0; i < 512; i++)
889
+        ff_square_tab[i] = (i - 256) * (i - 256);
890
+}
891
+
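
ff_me_cmp_init_static() above fills ff_square_tab so that entry i holds (i - 256) * (i - 256); since a byte difference lies in [-255, 255], indexing the table at (difference + 256) yields the squared difference. A tiny lookup sketch (illustration only, not part of the patch):

    /* Illustration only: squared pixel difference via ff_square_tab.
     * diff is in [-255, 255], so diff + 256 is a valid index into the table. */
    static inline uint32_t squared_diff(int a, int b)
    {
        int diff = a - b;
        return ff_square_tab[diff + 256];   /* equals (uint32_t)(diff * diff) */
    }
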
892
+av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
893
+{
894
+    c->sum_abs_dctelem = sum_abs_dctelem_c;
895
+
896
+    /* TODO [0] 16  [1] 8 */
897
+    c->pix_abs[0][0] = pix_abs16_c;
898
+    c->pix_abs[0][1] = pix_abs16_x2_c;
899
+    c->pix_abs[0][2] = pix_abs16_y2_c;
900
+    c->pix_abs[0][3] = pix_abs16_xy2_c;
901
+    c->pix_abs[1][0] = pix_abs8_c;
902
+    c->pix_abs[1][1] = pix_abs8_x2_c;
903
+    c->pix_abs[1][2] = pix_abs8_y2_c;
904
+    c->pix_abs[1][3] = pix_abs8_xy2_c;
905
+
906
+#define SET_CMP_FUNC(name)                      \
907
+    c->name[0] = name ## 16_c;                  \
908
+    c->name[1] = name ## 8x8_c;
909
+
910
+    SET_CMP_FUNC(hadamard8_diff)
911
+    c->hadamard8_diff[4] = hadamard8_intra16_c;
912
+    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
913
+    SET_CMP_FUNC(dct_sad)
914
+    SET_CMP_FUNC(dct_max)
915
+#if CONFIG_GPL
916
+    SET_CMP_FUNC(dct264_sad)
917
+#endif
918
+    c->sad[0] = pix_abs16_c;
919
+    c->sad[1] = pix_abs8_c;
920
+    c->sse[0] = sse16_c;
921
+    c->sse[1] = sse8_c;
922
+    c->sse[2] = sse4_c;
923
+    SET_CMP_FUNC(quant_psnr)
924
+    SET_CMP_FUNC(rd)
925
+    SET_CMP_FUNC(bit)
926
+    c->vsad[0] = vsad16_c;
927
+    c->vsad[4] = vsad_intra16_c;
928
+    c->vsad[5] = vsad_intra8_c;
929
+    c->vsse[0] = vsse16_c;
930
+    c->vsse[4] = vsse_intra16_c;
931
+    c->vsse[5] = vsse_intra8_c;
932
+    c->nsse[0] = nsse16_c;
933
+    c->nsse[1] = nsse8_c;
934
+
935
+    if (ARCH_ARM)
936
+        ff_me_cmp_init_arm(c, avctx);
937
+    if (ARCH_PPC)
938
+        ff_me_cmp_init_ppc(c, avctx);
939
+    if (ARCH_X86)
940
+        ff_me_cmp_init_x86(c, avctx);
941
+}
0 942
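
As the table assignments above suggest (see the "[0] 16  [1] 8" note), slot 0 of each comparison array holds the 16-pixel-wide kernel, slot 1 the 8x8 one, and slots 4/5 the intra variants where they exist; pix_abs[w][k] additionally encodes the half-pel offset in k (0 full-pel, 1 x half-pel, 2 y half-pel, 3 xy half-pel). A hedged usage sketch, assuming cur and ref point into frames with the given stride (hypothetical helper, not part of the patch):

    static int sad_example(AVCodecContext *avctx, uint8_t *cur, uint8_t *ref,
                           int stride)
    {
        MECmpContext mecc;
        int full, half;

        ff_me_cmp_init(&mecc, avctx);

        /* slot 0 = 16-pixel-wide kernel; slot 1 would be the 8x8 kernel */
        full = mecc.sad[0](NULL, cur, ref, stride, 16);
        /* pix_abs[0][1]: 16-wide SAD against the x half-pel positions of ref */
        half = mecc.pix_abs[0][1](NULL, cur, ref, stride, 16);

        return FFMIN(full, half);
    }
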
new file mode 100644
... ...
@@ -0,0 +1,73 @@
0
+/*
1
+ * This file is part of Libav.
2
+ *
3
+ * Libav is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * Libav is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with Libav; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#ifndef AVCODEC_ME_CMP_H
19
+#define AVCODEC_ME_CMP_H
20
+
21
+#include <stdint.h>
22
+
23
+#include "avcodec.h"
24
+
25
+extern uint32_t ff_square_tab[512];
26
+
27
+struct MpegEncContext;
28
+/* Motion estimation:
29
+ * h is limited to { width / 2, width, 2 * width },
30
+ * but never larger than 16 and never smaller than 2.
31
+ * Although currently h < 4 is not used as functions with
32
+ * width < 8 are neither used nor implemented. */
33
+typedef int (*me_cmp_func)(struct MpegEncContext *c,
34
+                           uint8_t *blk1 /* align width (8 or 16) */,
35
+                           uint8_t *blk2 /* align 1 */, int line_size, int h);
36
+
37
+typedef struct MECmpContext {
38
+    int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
39
+
40
+    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
41
+    me_cmp_func sse[6];
42
+    me_cmp_func hadamard8_diff[6];
43
+    me_cmp_func dct_sad[6];
44
+    me_cmp_func quant_psnr[6];
45
+    me_cmp_func bit[6];
46
+    me_cmp_func rd[6];
47
+    me_cmp_func vsad[6];
48
+    me_cmp_func vsse[6];
49
+    me_cmp_func nsse[6];
50
+    me_cmp_func dct_max[6];
51
+    me_cmp_func dct264_sad[6];
52
+
53
+    me_cmp_func me_pre_cmp[6];
54
+    me_cmp_func me_cmp[6];
55
+    me_cmp_func me_sub_cmp[6];
56
+    me_cmp_func mb_cmp[6];
57
+    me_cmp_func ildct_cmp[6]; // only width 16 used
58
+    me_cmp_func frame_skip_cmp[6]; // only width 8 used
59
+
60
+    me_cmp_func pix_abs[2][4];
61
+} MECmpContext;
62
+
63
+void ff_me_cmp_init_static(void);
64
+
65
+void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
66
+void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
67
+void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
68
+void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
69
+
70
+void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
71
+
72
+#endif /* AVCODEC_ME_CMP_H */
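
To make the contract above concrete: a me_cmp_func receives two byte planes sharing one line_size, with blk1 aligned to the block width, blk2 possibly unaligned, and h never larger than 16. A minimal conforming kernel, mirroring the shape of the C SAD code in me_cmp.c (illustration only, width fixed at 16, FFABS from libavutil/common.h assumed):

    static int sad16_example(struct MpegEncContext *c, uint8_t *blk1,
                             uint8_t *blk2, int line_size, int h)
    {
        int score = 0, x, y;

        for (y = 0; y < h; y++) {
            for (x = 0; x < 16; x++)
                score += FFABS(blk1[x] - blk2[x]);
            blk1 += line_size;
            blk2 += line_size;
        }
        return score;
    }
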
... ...
@@ -317,10 +317,10 @@ int ff_init_me(MpegEncContext *s){
317 317
         av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n");
318 318
     }
319 319
 
320
-    ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, c->avctx->me_pre_cmp);
321
-    ff_set_cmp(&s->dsp, s->dsp.me_cmp, c->avctx->me_cmp);
322
-    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, c->avctx->me_sub_cmp);
323
-    ff_set_cmp(&s->dsp, s->dsp.mb_cmp, c->avctx->mb_cmp);
320
+    ff_set_cmp(&s->mecc, s->mecc.me_pre_cmp, c->avctx->me_pre_cmp);
321
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp,     c->avctx->me_cmp);
322
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, c->avctx->me_sub_cmp);
323
+    ff_set_cmp(&s->mecc, s->mecc.mb_cmp,     c->avctx->mb_cmp);
324 324
 
325 325
     c->flags    = get_flags(c, 0, c->avctx->me_cmp    &FF_CMP_CHROMA);
326 326
     c->sub_flags= get_flags(c, 0, c->avctx->me_sub_cmp&FF_CMP_CHROMA);
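
ff_set_cmp() (declared in me_cmp.h above) resolves the FF_CMP_* selector stored on the AVCodecContext into concrete kernels from the MECmpContext tables, so the metric used at each search stage stays a runtime option. A hedged sketch of the encoder-side configuration that feeds these calls, with codec standing in for whichever mpegvideo-family encoder is being opened (illustration only, not part of the patch):

    AVCodecContext *avctx = avcodec_alloc_context3(codec);

    avctx->me_cmp     = FF_CMP_SAD;    /* integer-pel search          */
    avctx->me_sub_cmp = FF_CMP_SATD;   /* half/quarter-pel refinement */
    avctx->mb_cmp     = FF_CMP_SSE;    /* final macroblock decision   */
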
... ...
@@ -361,12 +361,10 @@ int ff_init_me(MpegEncContext *s){
361 361
     /* 8x8 fullpel search would need a 4x4 chroma compare, which we do
362 362
      * not have yet, and even if we had, the motion estimation code
363 363
      * does not expect it. */
364
-    if((c->avctx->me_cmp&FF_CMP_CHROMA)/* && !s->dsp.me_cmp[2]*/){
365
-        s->dsp.me_cmp[2]= zero_cmp;
366
-    }
367
-    if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){
368
-        s->dsp.me_sub_cmp[2]= zero_cmp;
369
-    }
364
+    if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
365
+        s->mecc.me_cmp[2] = zero_cmp;
366
+    if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
367
+        s->mecc.me_sub_cmp[2] = zero_cmp;
370 368
     c->hpel_put[2][0]= c->hpel_put[2][1]=
371 369
     c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
372 370
 
... ...
@@ -379,7 +377,7 @@ int ff_init_me(MpegEncContext *s){
379 379
 
380 380
 #define CHECK_SAD_HALF_MV(suffix, x, y) \
381 381
 {\
382
-    d= s->dsp.pix_abs[size][(x?1:0)+(y?2:0)](NULL, pix, ptr+((x)>>1), stride, h);\
382
+    d  = s->mecc.pix_abs[size][(x ? 1 : 0) + (y ? 2 : 0)](NULL, pix, ptr + ((x) >> 1), stride, h); \
383 383
     d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\
384 384
     COPY3_IF_LT(dminh, d, dx, x, dy, y)\
385 385
 }
... ...
@@ -615,7 +613,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
615 615
 
616 616
         dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h);
617 617
 
618
-        if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
618
+        if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
619 619
             int dxy;
620 620
             const int offset= ((block&1) + (block>>1)*stride)*8;
621 621
             uint8_t *dest_y = c->scratchpad + offset;
... ...
@@ -657,8 +655,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
657 657
     if(same)
658 658
         return INT_MAX;
659 659
 
660
-    if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
661
-        dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*16*stride, c->scratchpad, stride, 16);
660
+    if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
661
+        dmin_sum += s->mecc.mb_cmp[0](s,
662
+                                      s->new_picture.f->data[0] +
663
+                                      s->mb_x * 16 + s->mb_y * 16 * stride,
664
+                                      c->scratchpad, stride, 16);
662 665
     }
663 666
 
664 667
     if(c->avctx->mb_cmp&FF_CMP_CHROMA){
... ...
@@ -680,8 +681,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
680 680
             s->hdsp.put_pixels_tab       [1][dxy](c->scratchpad + 8, s->last_picture.f->data[2] + offset, s->uvlinesize, 8);
681 681
         }
682 682
 
683
-        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad  , s->uvlinesize, 8);
684
-        dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad+8, s->uvlinesize, 8);
683
+        dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad,     s->uvlinesize, 8);
684
+        dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad + 8, s->uvlinesize, 8);
685 685
     }
686 686
 
687 687
     c->pred_x= mx;
... ...
@@ -777,7 +778,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
777 777
             mv_table[xy][0]= mx_i;
778 778
             mv_table[xy][1]= my_i;
779 779
 
780
-            if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
780
+            if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) {
781 781
                 int dxy;
782 782
 
783 783
                 //FIXME chroma ME
... ...
@@ -789,7 +790,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
789 789
                 }else{
790 790
                     s->hdsp.put_pixels_tab       [size][dxy](c->scratchpad, ref    , stride, h);
791 791
                 }
792
-                dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
792
+                dmin = s->mecc.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
793 793
                 dmin+= (mv_penalty[mx_i-c->pred_x] + mv_penalty[my_i-c->pred_y] + 1)*c->mb_penalty_factor;
794 794
             }else
795 795
                 dmin+= c->mb_penalty_factor; //field_select bits
... ...
@@ -940,7 +941,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
940 940
     /* At this point (mx,my) are full-pell and the relative displacement */
941 941
     ppix = c->ref[0][0] + (my * s->linesize) + mx;
942 942
 
943
-    vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16);
943
+    vard = s->mecc.sse[0](NULL, pix, ppix, s->linesize, 16);
944 944
 
945 945
     pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8;
946 946
     c->mc_mb_var_sum_temp += (vard+128)>>8;
... ...
@@ -1037,7 +1038,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
1037 1037
                 *(uint32_t*)(&c->scratchpad[i*s->linesize+12]) = mean;
1038 1038
             }
1039 1039
 
1040
-            intra_score= s->dsp.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16);
1040
+            intra_score= s->mecc.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16);
1041 1041
         }
1042 1042
         intra_score += c->mb_penalty_factor*16;
1043 1043
 
... ...
@@ -1237,7 +1238,7 @@ static inline int check_bidir_mv(MpegEncContext * s,
1237 1237
 
1238 1238
     fbmin = (mv_penalty_f[motion_fx-pred_fx] + mv_penalty_f[motion_fy-pred_fy])*c->mb_penalty_factor
1239 1239
            +(mv_penalty_b[motion_bx-pred_bx] + mv_penalty_b[motion_by-pred_by])*c->mb_penalty_factor
1240
-           + s->dsp.mb_cmp[size](s, src_data[0], dest_y, stride, h); //FIXME new_pic
1240
+           + s->mecc.mb_cmp[size](s, src_data[0], dest_y, stride, h); // FIXME new_pic
1241 1241
 
1242 1242
     if(c->avctx->mb_cmp&FF_CMP_CHROMA){
1243 1243
     }
... ...
@@ -63,8 +63,8 @@ static int hpel_motion_search(MpegEncContext * s,
63 63
 
64 64
  //FIXME factorize
65 65
 
66
-    cmp_sub= s->dsp.me_sub_cmp[size];
67
-    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
66
+    cmp_sub        = s->mecc.me_sub_cmp[size];
67
+    chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1];
68 68
 
69 69
     if(c->skip){ //FIXME move out of hpel?
70 70
         *mx_ptr = 0;
... ...
@@ -166,7 +166,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
166 166
                                int src_index, int ref_index, int size,
167 167
                                int h, int add_rate)
168 168
 {
169
-//    const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
170 169
     MotionEstContext * const c= &s->me;
171 170
     const int penalty_factor= c->mb_penalty_factor;
172 171
     const int flags= c->mb_flags;
... ...
@@ -179,8 +178,8 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
179 179
 
180 180
  //FIXME factorize
181 181
 
182
-    cmp_sub= s->dsp.mb_cmp[size];
183
-    chroma_cmp_sub= s->dsp.mb_cmp[size+1];
182
+    cmp_sub        = s->mecc.mb_cmp[size];
183
+    chroma_cmp_sub = s->mecc.mb_cmp[size + 1];
184 184
 
185 185
 //    assert(!c->skip);
186 186
 //    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
... ...
@@ -226,12 +225,12 @@ static int qpel_motion_search(MpegEncContext * s,
226 226
     LOAD_COMMON
227 227
     int flags= c->sub_flags;
228 228
 
229
-    cmpf= s->dsp.me_cmp[size];
230
-    chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME
229
+    cmpf        = s->mecc.me_cmp[size];
230
+    chroma_cmpf = s->mecc.me_cmp[size + 1]; // FIXME: factorize
231 231
  //FIXME factorize
232 232
 
233
-    cmp_sub= s->dsp.me_sub_cmp[size];
234
-    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
233
+    cmp_sub        = s->mecc.me_sub_cmp[size];
234
+    chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1];
235 235
 
236 236
     if(c->skip){ //FIXME somehow move up (benchmark)
237 237
         *mx_ptr = 0;
... ...
@@ -427,8 +426,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best,
427 427
     LOAD_COMMON2
428 428
     unsigned map_generation = c->map_generation;
429 429
 
430
-    cmpf= s->dsp.me_cmp[size];
431
-    chroma_cmpf= s->dsp.me_cmp[size+1];
430
+    cmpf        = s->mecc.me_cmp[size];
431
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
432 432
 
433 433
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
434 434
         const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
... ...
@@ -468,8 +467,8 @@ static int funny_diamond_search(MpegEncContext * s, int *best, int dmin,
468 468
     LOAD_COMMON2
469 469
     unsigned map_generation = c->map_generation;
470 470
 
471
-    cmpf= s->dsp.me_cmp[size];
472
-    chroma_cmpf= s->dsp.me_cmp[size+1];
471
+    cmpf        = s->mecc.me_cmp[size];
472
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
473 473
 
474 474
     for(dia_size=1; dia_size<=4; dia_size++){
475 475
         int dir;
... ...
@@ -511,8 +510,8 @@ static int hex_search(MpegEncContext * s, int *best, int dmin,
511 511
     int x,y,d;
512 512
     const int dec= dia_size & (dia_size-1);
513 513
 
514
-    cmpf= s->dsp.me_cmp[size];
515
-    chroma_cmpf= s->dsp.me_cmp[size+1];
514
+    cmpf        = s->mecc.me_cmp[size];
515
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
516 516
 
517 517
     for(;dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){
518 518
         do{
... ...
@@ -548,8 +547,8 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin,
548 548
     static const int hex[8][2]={{-2, 0}, {-1,-1}, { 0,-2}, { 1,-1},
549 549
                                 { 2, 0}, { 1, 1}, { 0, 2}, {-1, 1}};
550 550
 
551
-    cmpf= s->dsp.me_cmp[size];
552
-    chroma_cmpf= s->dsp.me_cmp[size+1];
551
+    cmpf        = s->mecc.me_cmp[size];
552
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
553 553
 
554 554
     for(; dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){
555 555
         do{
... ...
@@ -587,8 +586,8 @@ static int umh_search(MpegEncContext * s, int *best, int dmin,
587 587
                                  {-2, 3}, { 0, 4}, { 2, 3},
588 588
                                  {-2,-3}, { 0,-4}, { 2,-3},};
589 589
 
590
-    cmpf= s->dsp.me_cmp[size];
591
-    chroma_cmpf= s->dsp.me_cmp[size+1];
590
+    cmpf        = s->mecc.me_cmp[size];
591
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
592 592
 
593 593
     x= best[0];
594 594
     y= best[1];
... ...
@@ -630,8 +629,8 @@ static int full_search(MpegEncContext * s, int *best, int dmin,
630 630
     int x,y, d;
631 631
     const int dia_size= c->dia_size&0xFF;
632 632
 
633
-    cmpf= s->dsp.me_cmp[size];
634
-    chroma_cmpf= s->dsp.me_cmp[size+1];
633
+    cmpf        = s->mecc.me_cmp[size];
634
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
635 635
 
636 636
     for(y=FFMAX(-dia_size, ymin); y<=FFMIN(dia_size,ymax); y++){
637 637
         for(x=FFMAX(-dia_size, xmin); x<=FFMIN(dia_size,xmax); x++){
... ...
@@ -694,8 +693,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
694 694
     LOAD_COMMON2
695 695
     unsigned map_generation = c->map_generation;
696 696
 
697
-    cmpf= s->dsp.me_cmp[size];
698
-    chroma_cmpf= s->dsp.me_cmp[size+1];
697
+    cmpf        = s->mecc.me_cmp[size];
698
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
699 699
 
700 700
     /*Note j<MAX_SAB_SIZE is needed if MAX_SAB_SIZE < ME_MAP_SIZE as j can
701 701
       become larger due to MVs overflowing their ME_MAP_MV_BITS bits space in map
... ...
@@ -779,8 +778,8 @@ static int var_diamond_search(MpegEncContext * s, int *best, int dmin,
779 779
     LOAD_COMMON2
780 780
     unsigned map_generation = c->map_generation;
781 781
 
782
-    cmpf= s->dsp.me_cmp[size];
783
-    chroma_cmpf= s->dsp.me_cmp[size+1];
782
+    cmpf        = s->mecc.me_cmp[size];
783
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
784 784
 
785 785
     for(dia_size=1; dia_size<=c->dia_size; dia_size++){
786 786
         int dir, start, end;
... ...
@@ -880,12 +879,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
880 880
 
881 881
     if(c->pre_pass){
882 882
         penalty_factor= c->pre_penalty_factor;
883
-        cmpf= s->dsp.me_pre_cmp[size];
884
-        chroma_cmpf= s->dsp.me_pre_cmp[size+1];
883
+        cmpf           = s->mecc.me_pre_cmp[size];
884
+        chroma_cmpf    = s->mecc.me_pre_cmp[size + 1];
885 885
     }else{
886 886
         penalty_factor= c->penalty_factor;
887
-        cmpf= s->dsp.me_cmp[size];
888
-        chroma_cmpf= s->dsp.me_cmp[size+1];
887
+        cmpf           = s->mecc.me_cmp[size];
888
+        chroma_cmpf    = s->mecc.me_cmp[size + 1];
889 889
     }
890 890
 
891 891
     map_generation= update_map_generation(c);
... ...
@@ -1009,8 +1008,8 @@ static int epzs_motion_search4(MpegEncContext * s,
1009 1009
     int flags= c->flags;
1010 1010
     LOAD_COMMON2
1011 1011
 
1012
-    cmpf= s->dsp.me_cmp[size];
1013
-    chroma_cmpf= s->dsp.me_cmp[size+1];
1012
+    cmpf        = s->mecc.me_cmp[size];
1013
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
1014 1014
 
1015 1015
     map_generation= update_map_generation(c);
1016 1016
 
... ...
@@ -1068,8 +1067,8 @@ static int epzs_motion_search2(MpegEncContext * s,
1068 1068
     int flags= c->flags;
1069 1069
     LOAD_COMMON2
1070 1070
 
1071
-    cmpf= s->dsp.me_cmp[size];
1072
-    chroma_cmpf= s->dsp.me_cmp[size+1];
1071
+    cmpf        = s->mecc.me_cmp[size];
1072
+    chroma_cmpf = s->mecc.me_cmp[size + 1];
1073 1073
 
1074 1074
     map_generation= update_map_generation(c);
1075 1075
 
... ...
@@ -689,7 +689,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
689 689
                         b_pic = pic->f->data[0] + offset;
690 690
                         if (!pic->shared)
691 691
                             b_pic += INPLACE_OFFSET;
692
-                        diff = s->dsp.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
692
+                        diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
693 693
                         if (diff > s->qscale * 70) {  // FIXME check that 70 is optimal
694 694
                             s->mb_skipped = 0;
695 695
                             break;
... ...
@@ -378,9 +378,9 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
378 378
 av_cold int ff_dct_common_init(MpegEncContext *s)
379 379
 {
380 380
     ff_blockdsp_init(&s->bdsp, s->avctx);
381
-    ff_dsputil_init(&s->dsp, s->avctx);
382 381
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
383 382
     ff_idctdsp_init(&s->idsp, s->avctx);
383
+    ff_me_cmp_init(&s->mecc, s->avctx);
384 384
     ff_mpegvideodsp_init(&s->mdsp);
385 385
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
386 386
 
... ...
@@ -1051,7 +1051,7 @@ static int init_er(MpegEncContext *s)
1051 1051
     int i;
1052 1052
 
1053 1053
     er->avctx       = s->avctx;
1054
-    er->dsp         = &s->dsp;
1054
+    er->mecc        = &s->mecc;
1055 1055
 
1056 1056
     er->mb_index2xy = s->mb_index2xy;
1057 1057
     er->mb_num      = s->mb_num;
... ...
@@ -30,13 +30,13 @@
30 30
 
31 31
 #include "avcodec.h"
32 32
 #include "blockdsp.h"
33
-#include "dsputil.h"
34 33
 #include "error_resilience.h"
35 34
 #include "fdctdsp.h"
36 35
 #include "get_bits.h"
37 36
 #include "h263dsp.h"
38 37
 #include "hpeldsp.h"
39 38
 #include "idctdsp.h"
39
+#include "me_cmp.h"
40 40
 #include "mpegvideodsp.h"
41 41
 #include "mpegvideoencdsp.h"
42 42
 #include "pixblockdsp.h"
... ...
@@ -356,10 +356,10 @@ typedef struct MpegEncContext {
356 356
     int h263_long_vectors;      ///< use horrible h263v1 long vector mode
357 357
 
358 358
     BlockDSPContext bdsp;
359
-    DSPContext dsp;             ///< pointers for accelerated dsp functions
360 359
     FDCTDSPContext fdsp;
361 360
     HpelDSPContext hdsp;
362 361
     IDCTDSPContext idsp;
362
+    MECmpContext mecc;
363 363
     MpegVideoDSPContext mdsp;
364 364
     MpegvideoEncDSPContext mpvencdsp;
365 365
     PixblockDSPContext pdsp;
... ...
@@ -702,6 +702,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)
702 702
         ff_MPV_encode_init_x86(s);
703 703
 
704 704
     ff_fdctdsp_init(&s->fdsp, avctx);
705
+    ff_me_cmp_init(&s->mecc, avctx);
705 706
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
706 707
     ff_pixblockdsp_init(&s->pdsp, avctx);
707 708
     ff_qpeldsp_init(&s->qdsp);
... ...
@@ -744,8 +745,8 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)
744 744
 
745 745
     s->quant_precision = 5;
746 746
 
747
-    ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp);
748
-    ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp);
747
+    ff_set_cmp(&s->mecc, s->mecc.ildct_cmp,      s->avctx->ildct_cmp);
748
+    ff_set_cmp(&s->mecc, s->mecc.frame_skip_cmp, s->avctx->frame_skip_cmp);
749 749
 
750 750
     if (CONFIG_H261_ENCODER && s->out_format == FMT_H261)
751 751
         ff_h261_encode_init(s);
... ...
@@ -895,8 +896,8 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
895 895
     for (y = 0; y < h; y += 16) {
896 896
         for (x = 0; x < w; x += 16) {
897 897
             int offset = x + y * stride;
898
-            int sad  = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
899
-                                     16);
898
+            int sad  = s->mecc.sad[0](NULL, src + offset, ref + offset,
899
+                                      stride, 16);
900 900
             int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8;
901 901
             int sae  = get_sae(src + offset, mean, stride);
902 902
 
... ...
@@ -1053,7 +1054,7 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref)
1053 1053
                 int off = p->shared ? 0 : 16;
1054 1054
                 uint8_t *dptr = p->f->data[plane] + 8 * (x + y * stride) + off;
1055 1055
                 uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride);
1056
-                int v   = s->dsp.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
1056
+                int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
1057 1057
 
1058 1058
                 switch (s->avctx->frame_skip_exp) {
1059 1059
                 case 0: score    =  FFMAX(score, v);          break;
... ...
@@ -1923,16 +1924,15 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
1923 1923
             int progressive_score, interlaced_score;
1924 1924
 
1925 1925
             s->interlaced_dct = 0;
1926
-            progressive_score = s->dsp.ildct_cmp[4](s, ptr_y,
1927
-                                                    NULL, wrap_y, 8) +
1928
-                                s->dsp.ildct_cmp[4](s, ptr_y + wrap_y * 8,
1929
-                                                    NULL, wrap_y, 8) - 400;
1926
+            progressive_score = s->mecc.ildct_cmp[4](s, ptr_y, NULL, wrap_y, 8) +
1927
+                                s->mecc.ildct_cmp[4](s, ptr_y + wrap_y * 8,
1928
+                                                     NULL, wrap_y, 8) - 400;
1930 1929
 
1931 1930
             if (progressive_score > 0) {
1932
-                interlaced_score = s->dsp.ildct_cmp[4](s, ptr_y,
1933
-                                                       NULL, wrap_y * 2, 8) +
1934
-                                   s->dsp.ildct_cmp[4](s, ptr_y + wrap_y,
1935
-                                                       NULL, wrap_y * 2, 8);
1931
+                interlaced_score = s->mecc.ildct_cmp[4](s, ptr_y,
1932
+                                                        NULL, wrap_y * 2, 8) +
1933
+                                   s->mecc.ildct_cmp[4](s, ptr_y + wrap_y,
1934
+                                                        NULL, wrap_y * 2, 8);
1936 1935
                 if (progressive_score > interlaced_score) {
1937 1936
                     s->interlaced_dct = 1;
1938 1937
 
... ...
@@ -1996,23 +1996,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
1996 1996
             int progressive_score, interlaced_score;
1997 1997
 
1998 1998
             s->interlaced_dct = 0;
1999
-            progressive_score = s->dsp.ildct_cmp[0](s, dest_y,
2000
-                                                    ptr_y,              wrap_y,
2001
-                                                    8) +
2002
-                                s->dsp.ildct_cmp[0](s, dest_y + wrap_y * 8,
2003
-                                                    ptr_y + wrap_y * 8, wrap_y,
2004
-                                                    8) - 400;
1999
+            progressive_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, wrap_y, 8) +
2000
+                                s->mecc.ildct_cmp[0](s, dest_y + wrap_y * 8,
2001
+                                                     ptr_y + wrap_y * 8,
2002
+                                                     wrap_y, 8) - 400;
2005 2003
 
2006 2004
             if (s->avctx->ildct_cmp == FF_CMP_VSSE)
2007 2005
                 progressive_score -= 400;
2008 2006
 
2009 2007
             if (progressive_score > 0) {
2010
-                interlaced_score = s->dsp.ildct_cmp[0](s, dest_y,
2011
-                                                       ptr_y,
2012
-                                                       wrap_y * 2, 8) +
2013
-                                   s->dsp.ildct_cmp[0](s, dest_y + wrap_y,
2014
-                                                       ptr_y + wrap_y,
2015
-                                                       wrap_y * 2, 8);
2008
+                interlaced_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y,
2009
+                                                        wrap_y * 2, 8) +
2010
+                                   s->mecc.ildct_cmp[0](s, dest_y + wrap_y,
2011
+                                                        ptr_y + wrap_y,
2012
+                                                        wrap_y * 2, 8);
2016 2013
 
2017 2014
                 if (progressive_score > interlaced_score) {
2018 2015
                     s->interlaced_dct = 1;
... ...
@@ -2049,33 +2046,28 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
2049 2049
         if (s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] <
2050 2050
                 2 * s->qscale * s->qscale) {
2051 2051
             // FIXME optimize
2052
-            if (s->dsp.sad[1](NULL, ptr_y , dest_y,
2053
-                              wrap_y, 8) < 20 * s->qscale)
2052
+            if (s->mecc.sad[1](NULL, ptr_y, dest_y, wrap_y, 8) < 20 * s->qscale)
2054 2053
                 skip_dct[0] = 1;
2055
-            if (s->dsp.sad[1](NULL, ptr_y + 8,
2056
-                              dest_y + 8, wrap_y, 8) < 20 * s->qscale)
2054
+            if (s->mecc.sad[1](NULL, ptr_y + 8, dest_y + 8, wrap_y, 8) < 20 * s->qscale)
2057 2055
                 skip_dct[1] = 1;
2058
-            if (s->dsp.sad[1](NULL, ptr_y + dct_offset,
2059
-                              dest_y + dct_offset, wrap_y, 8) < 20 * s->qscale)
2056
+            if (s->mecc.sad[1](NULL, ptr_y + dct_offset, dest_y + dct_offset,
2057
+                               wrap_y, 8) < 20 * s->qscale)
2060 2058
                 skip_dct[2] = 1;
2061
-            if (s->dsp.sad[1](NULL, ptr_y + dct_offset + 8,
2062
-                              dest_y + dct_offset + 8,
2063
-                              wrap_y, 8) < 20 * s->qscale)
2059
+            if (s->mecc.sad[1](NULL, ptr_y + dct_offset + 8, dest_y + dct_offset + 8,
2060
+                               wrap_y, 8) < 20 * s->qscale)
2064 2061
                 skip_dct[3] = 1;
2065
-            if (s->dsp.sad[1](NULL, ptr_cb, dest_cb,
2066
-                              wrap_c, 8) < 20 * s->qscale)
2062
+            if (s->mecc.sad[1](NULL, ptr_cb, dest_cb, wrap_c, 8) < 20 * s->qscale)
2067 2063
                 skip_dct[4] = 1;
2068
-            if (s->dsp.sad[1](NULL, ptr_cr, dest_cr,
2069
-                              wrap_c, 8) < 20 * s->qscale)
2064
+            if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale)
2070 2065
                 skip_dct[5] = 1;
2071 2066
             if (!s->chroma_y_shift) { /* 422 */
2072
-                if (s->dsp.sad[1](NULL, ptr_cb + (dct_offset >> 1),
2073
-                                  dest_cb + (dct_offset >> 1),
2074
-                                  wrap_c, 8) < 20 * s->qscale)
2067
+                if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1),
2068
+                                   dest_cb + (dct_offset >> 1),
2069
+                                   wrap_c, 8) < 20 * s->qscale)
2075 2070
                     skip_dct[6] = 1;
2076
-                if (s->dsp.sad[1](NULL, ptr_cr + (dct_offset >> 1),
2077
-                                  dest_cr + (dct_offset >> 1),
2078
-                                  wrap_c, 8) < 20 * s->qscale)
2071
+                if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1),
2072
+                                   dest_cr + (dct_offset >> 1),
2073
+                                   wrap_c, 8) < 20 * s->qscale)
2079 2074
                     skip_dct[7] = 1;
2080 2075
             }
2081 2076
         }
... ...
@@ -2340,9 +2332,9 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in
2340 2340
     int x,y;
2341 2341
 
2342 2342
     if(w==16 && h==16)
2343
-        return s->dsp.sse[0](NULL, src1, src2, stride, 16);
2343
+        return s->mecc.sse[0](NULL, src1, src2, stride, 16);
2344 2344
     else if(w==8 && h==8)
2345
-        return s->dsp.sse[1](NULL, src1, src2, stride, 8);
2345
+        return s->mecc.sse[1](NULL, src1, src2, stride, 8);
2346 2346
 
2347 2347
     for(y=0; y<h; y++){
2348 2348
         for(x=0; x<w; x++){
... ...
@@ -2364,13 +2356,13 @@ static int sse_mb(MpegEncContext *s){
2364 2364
 
2365 2365
     if(w==16 && h==16)
2366 2366
       if(s->avctx->mb_cmp == FF_CMP_NSSE){
2367
-        return  s->dsp.nsse[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
2368
-               +s->dsp.nsse[1](s, s->new_picture.f->data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
2369
-               +s->dsp.nsse[1](s, s->new_picture.f->data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
2367
+        return s->mecc.nsse[0](s, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize   * 16, s->dest[0], s->linesize,   16) +
2368
+               s->mecc.nsse[1](s, s->new_picture.f->data[1] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[1], s->uvlinesize,  8) +
2369
+               s->mecc.nsse[1](s, s->new_picture.f->data[2] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[2], s->uvlinesize,  8);
2370 2370
       }else{
2371
-        return  s->dsp.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
2372
-               +s->dsp.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
2373
-               +s->dsp.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
2371
+        return s->mecc.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize   * 16, s->dest[0], s->linesize,   16) +
2372
+               s->mecc.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[1], s->uvlinesize,  8) +
2373
+               s->mecc.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x *  8 + s->mb_y * s->uvlinesize *  8, s->dest[2], s->uvlinesize,  8);
2374 2374
       }
2375 2375
     else
2376 2376
         return  sse(s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize)
... ...
@@ -24,8 +24,8 @@
24 24
 #include "libavutil/attributes.h"
25 25
 #include "libavutil/imgutils.h"
26 26
 #include "avcodec.h"
27
-#include "dsputil.h"
28 27
 #include "imgconvert.h"
28
+#include "me_cmp.h"
29 29
 #include "mpegvideoencdsp.h"
30 30
 
31 31
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
... ...
@@ -2,7 +2,6 @@ OBJS                                   += ppc/fmtconvert_altivec.o      \
2 2
 
3 3
 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
4 4
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
5
-OBJS-$(CONFIG_DSPUTIL)                 += ppc/dsputil_altivec.o
6 5
 OBJS-$(CONFIG_FFT)                     += ppc/fft_altivec.o
7 6
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
8 7
 OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
... ...
@@ -11,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
11 11
 OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
12 12
 OBJS-$(CONFIG_FDCTDSP)                 += ppc/fdctdsp.o
13 13
 OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
14
+OBJS-$(CONFIG_ME_CMP)                  += ppc/me_cmp.o
14 15
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
15 16
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
16 17
                                           ppc/mpegvideodsp.o
17 18
deleted file mode 100644
... ...
@@ -1,767 +0,0 @@
1
-/*
2
- * Copyright (c) 2002 Brian Foley
3
- * Copyright (c) 2002 Dieter Shirley
4
- * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5
- *
6
- * This file is part of Libav.
7
- *
8
- * Libav is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU Lesser General Public
10
- * License as published by the Free Software Foundation; either
11
- * version 2.1 of the License, or (at your option) any later version.
12
- *
13
- * Libav is distributed in the hope that it will be useful,
14
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
- * Lesser General Public License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public
19
- * License along with Libav; if not, write to the Free Software
20
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
- */
22
-
23
-#include "config.h"
24
-#if HAVE_ALTIVEC_H
25
-#include <altivec.h>
26
-#endif
27
-
28
-#include "libavutil/attributes.h"
29
-#include "libavutil/cpu.h"
30
-#include "libavutil/ppc/cpu.h"
31
-#include "libavutil/ppc/types_altivec.h"
32
-#include "libavutil/ppc/util_altivec.h"
33
-#include "libavcodec/avcodec.h"
34
-#include "libavcodec/dsputil.h"
35
-#include "libavcodec/mpegvideo.h"
36
-
37
-#if HAVE_ALTIVEC
38
-static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39
-                            int line_size, int h)
40
-{
41
-    int i, s = 0;
42
-    const vector unsigned char zero =
43
-        (const vector unsigned char) vec_splat_u8(0);
44
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
45
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
46
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
47
-    vector signed int sumdiffs;
48
-
49
-    for (i = 0; i < h; i++) {
50
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
51
-         * pix1v: pix1[0] - pix1[15]
52
-         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
53
-        vector unsigned char pix1v  = vec_ld(0,  pix1);
54
-        vector unsigned char pix2l  = vec_ld(0,  pix2);
55
-        vector unsigned char pix2r  = vec_ld(16, pix2);
56
-        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
57
-        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
58
-
59
-        /* Calculate the average vector. */
60
-        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
61
-
62
-        /* Calculate a sum of abs differences vector. */
63
-        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
64
-                                          vec_min(pix1v, avgv));
65
-
66
-        /* Add each 4 pixel group together and put 4 results into sad. */
67
-        sad = vec_sum4s(t5, sad);
68
-
69
-        pix1 += line_size;
70
-        pix2 += line_size;
71
-    }
72
-    /* Sum up the four partial sums, and put the result into s. */
73
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
74
-    sumdiffs = vec_splat(sumdiffs, 3);
75
-    vec_ste(sumdiffs, 0, &s);
76
-
77
-    return s;
78
-}
79
-
80
-static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
81
-                            int line_size, int h)
82
-{
83
-    int i, s = 0;
84
-    const vector unsigned char zero =
85
-        (const vector unsigned char) vec_splat_u8(0);
86
-    vector unsigned char perm = vec_lvsl(0, pix2);
87
-    vector unsigned char pix1v, pix3v, avgv, t5;
88
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
89
-    vector signed int sumdiffs;
90
-    uint8_t *pix3 = pix2 + line_size;
91
-
92
-    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
93
-     * iteration becomes pix2 in the next iteration. We can use this
94
-     * fact to avoid a potentially expensive unaligned read, each
95
-     * time around the loop.
96
-     * Read unaligned pixels into our vectors. The vectors are as follows:
97
-     * pix2v: pix2[0] - pix2[15]
98
-     * Split the pixel vectors into shorts. */
99
-    vector unsigned char pix2l = vec_ld(0,  pix2);
100
-    vector unsigned char pix2r = vec_ld(15, pix2);
101
-    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
102
-
103
-    for (i = 0; i < h; i++) {
104
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
105
-         * pix1v: pix1[0] - pix1[15]
106
-         * pix3v: pix3[0] - pix3[15] */
107
-        pix1v = vec_ld(0,  pix1);
108
-
109
-        pix2l = vec_ld(0,  pix3);
110
-        pix2r = vec_ld(15, pix3);
111
-        pix3v = vec_perm(pix2l, pix2r, perm);
112
-
113
-        /* Calculate the average vector. */
114
-        avgv = vec_avg(pix2v, pix3v);
115
-
116
-        /* Calculate a sum of abs differences vector. */
117
-        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
118
-
119
-        /* Add each 4 pixel group together and put 4 results into sad. */
120
-        sad = vec_sum4s(t5, sad);
121
-
122
-        pix1 += line_size;
123
-        pix2v = pix3v;
124
-        pix3 += line_size;
125
-    }
126
-
127
-    /* Sum up the four partial sums, and put the result into s. */
128
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
129
-    sumdiffs = vec_splat(sumdiffs, 3);
130
-    vec_ste(sumdiffs, 0, &s);
131
-    return s;
132
-}
133
-
134
-static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
135
-                             int line_size, int h)
136
-{
137
-    int i, s = 0;
138
-    uint8_t *pix3 = pix2 + line_size;
139
-    const vector unsigned char zero =
140
-        (const vector unsigned char) vec_splat_u8(0);
141
-    const vector unsigned short two =
142
-        (const vector unsigned short) vec_splat_u16(2);
143
-    vector unsigned char avgv, t5;
144
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
145
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
146
-    vector unsigned char pix1v, pix3v, pix3iv;
147
-    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
148
-    vector unsigned short avghv, avglv;
149
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
150
-    vector signed int sumdiffs;
151
-
152
-    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
153
-     * iteration becomes pix2 in the next iteration. We can use this
154
-     * fact to avoid a potentially expensive unaligned read, as well
155
-     * as some splitting, and vector addition each time around the loop.
156
-     * Read unaligned pixels into our vectors. The vectors are as follows:
157
-     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
158
-     * Split the pixel vectors into shorts. */
159
-    vector unsigned char pix2l  = vec_ld(0,  pix2);
160
-    vector unsigned char pix2r  = vec_ld(16, pix2);
161
-    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
162
-    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
163
-
164
-    vector unsigned short pix2hv  =
165
-        (vector unsigned short) vec_mergeh(zero, pix2v);
166
-    vector unsigned short pix2lv  =
167
-        (vector unsigned short) vec_mergel(zero, pix2v);
168
-    vector unsigned short pix2ihv =
169
-        (vector unsigned short) vec_mergeh(zero, pix2iv);
170
-    vector unsigned short pix2ilv =
171
-        (vector unsigned short) vec_mergel(zero, pix2iv);
172
-    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
173
-    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
174
-    vector unsigned short t3, t4;
175
-
176
-    for (i = 0; i < h; i++) {
177
-        /* Read unaligned pixels into our vectors. The vectors are as follows:
178
-         * pix1v: pix1[0] - pix1[15]
179
-         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
180
-        pix1v  = vec_ld(0, pix1);
181
-
182
-        pix2l  = vec_ld(0, pix3);
183
-        pix2r  = vec_ld(16, pix3);
184
-        pix3v  = vec_perm(pix2l, pix2r, perm1);
185
-        pix3iv = vec_perm(pix2l, pix2r, perm2);
186
-
187
-        /* Note that AltiVec does have vec_avg, but this works on vector pairs
188
-         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
189
-         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
190
-         * it should be 1. Instead, we have to split the pixel vectors into
191
-         * vectors of shorts and do the averaging by hand. */
192
-
193
-        /* Split the pixel vectors into shorts. */
194
-        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
195
-        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
196
-        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
197
-        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
198
-
199
-        /* Do the averaging on them. */
200
-        t3 = vec_add(pix3hv, pix3ihv);
201
-        t4 = vec_add(pix3lv, pix3ilv);
202
-
203
-        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
204
-        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
205
-
206
-        /* Pack the shorts back into a result. */
207
-        avgv = vec_pack(avghv, avglv);
208
-
209
-        /* Calculate a sum of abs differences vector. */
210
-        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
211
-
212
-        /* Add each 4 pixel group together and put 4 results into sad. */
213
-        sad = vec_sum4s(t5, sad);
214
-
215
-        pix1 += line_size;
216
-        pix3 += line_size;
217
-        /* Transfer the calculated values for pix3 into pix2. */
218
-        t1 = t3;
219
-        t2 = t4;
220
-    }
221
-    /* Sum up the four partial sums, and put the result into s. */
222
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
223
-    sumdiffs = vec_splat(sumdiffs, 3);
224
-    vec_ste(sumdiffs, 0, &s);
225
-
226
-    return s;
227
-}
228
-
229
-static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
230
-                         int line_size, int h)
231
-{
232
-    int i, s;
233
-    const vector unsigned int zero =
234
-        (const vector unsigned int) vec_splat_u32(0);
235
-    vector unsigned char perm = vec_lvsl(0, pix2);
236
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
237
-    vector signed int sumdiffs;
238
-
239
-    for (i = 0; i < h; i++) {
240
-        /* Read potentially unaligned pixels into t1 and t2. */
241
-        vector unsigned char pix2l = vec_ld(0,  pix2);
242
-        vector unsigned char pix2r = vec_ld(15, pix2);
243
-        vector unsigned char t1 = vec_ld(0, pix1);
244
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
245
-
246
-        /* Calculate a sum of abs differences vector. */
247
-        vector unsigned char t3 = vec_max(t1, t2);
248
-        vector unsigned char t4 = vec_min(t1, t2);
249
-        vector unsigned char t5 = vec_sub(t3, t4);
250
-
251
-        /* Add each 4 pixel group together and put 4 results into sad. */
252
-        sad = vec_sum4s(t5, sad);
253
-
254
-        pix1 += line_size;
255
-        pix2 += line_size;
256
-    }
257
-
258
-    /* Sum up the four partial sums, and put the result into s. */
259
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
260
-    sumdiffs = vec_splat(sumdiffs, 3);
261
-    vec_ste(sumdiffs, 0, &s);
262
-
263
-    return s;
264
-}
265
-
266
-static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
267
-                        int line_size, int h)
268
-{
269
-    int i, s;
270
-    const vector unsigned int zero =
271
-        (const vector unsigned int) vec_splat_u32(0);
272
-    const vector unsigned char permclear =
273
-        (vector unsigned char)
274
-        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
275
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
276
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
277
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
278
-    vector signed int sumdiffs;
279
-
280
-    for (i = 0; i < h; i++) {
281
-        /* Read potentially unaligned pixels into t1 and t2.
282
-         * Since we're reading 16 pixels, and actually only want 8,
283
-         * mask out the last 8 pixels. The 0s don't change the sum. */
284
-        vector unsigned char pix1l = vec_ld(0, pix1);
285
-        vector unsigned char pix1r = vec_ld(7, pix1);
286
-        vector unsigned char pix2l = vec_ld(0, pix2);
287
-        vector unsigned char pix2r = vec_ld(7, pix2);
288
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
289
-                                          permclear);
290
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
291
-                                          permclear);
292
-
293
-        /* Calculate a sum of abs differences vector. */
294
-        vector unsigned char t3 = vec_max(t1, t2);
295
-        vector unsigned char t4 = vec_min(t1, t2);
296
-        vector unsigned char t5 = vec_sub(t3, t4);
297
-
298
-        /* Add each 4 pixel group together and put 4 results into sad. */
299
-        sad = vec_sum4s(t5, sad);
300
-
301
-        pix1 += line_size;
302
-        pix2 += line_size;
303
-    }
304
-
305
-    /* Sum up the four partial sums, and put the result into s. */
306
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
307
-    sumdiffs = vec_splat(sumdiffs, 3);
308
-    vec_ste(sumdiffs, 0, &s);
309
-
310
-    return s;
311
-}
312
-
313
-/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
314
- * It's the sad8_altivec code above w/ squaring added. */
315
-static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
316
-                        int line_size, int h)
317
-{
318
-    int i, s;
319
-    const vector unsigned int zero =
320
-        (const vector unsigned int) vec_splat_u32(0);
321
-    const vector unsigned char permclear =
322
-        (vector unsigned char)
323
-        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
324
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
325
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
326
-    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
327
-    vector signed int sumsqr;
328
-
329
-    for (i = 0; i < h; i++) {
330
-        /* Read potentially unaligned pixels into t1 and t2.
331
-         * Since we're reading 16 pixels, and actually only want 8,
332
-         * mask out the last 8 pixels. The 0s don't change the sum. */
333
-        vector unsigned char pix1l = vec_ld(0, pix1);
334
-        vector unsigned char pix1r = vec_ld(7, pix1);
335
-        vector unsigned char pix2l = vec_ld(0, pix2);
336
-        vector unsigned char pix2r = vec_ld(7, pix2);
337
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
338
-                                          permclear);
339
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
340
-                                          permclear);
341
-
342
-        /* Since we want to use unsigned chars, we can take advantage
343
-         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
344
-
345
-        /* Calculate abs differences vector. */
346
-        vector unsigned char t3 = vec_max(t1, t2);
347
-        vector unsigned char t4 = vec_min(t1, t2);
348
-        vector unsigned char t5 = vec_sub(t3, t4);
349
-
350
-        /* Square the values and add them to our sum. */
351
-        sum = vec_msum(t5, t5, sum);
352
-
353
-        pix1 += line_size;
354
-        pix2 += line_size;
355
-    }
356
-
357
-    /* Sum up the four partial sums, and put the result into s. */
358
-    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
359
-    sumsqr = vec_splat(sumsqr, 3);
360
-    vec_ste(sumsqr, 0, &s);
361
-
362
-    return s;
363
-}
364
-
365
-/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
366
- * It's the sad16_altivec code above w/ squaring added. */
367
-static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
368
-                         int line_size, int h)
369
-{
370
-    int i, s;
371
-    const vector unsigned int zero =
372
-        (const vector unsigned int) vec_splat_u32(0);
373
-    vector unsigned char perm = vec_lvsl(0, pix2);
374
-    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
375
-    vector signed int sumsqr;
376
-
377
-    for (i = 0; i < h; i++) {
378
-        /* Read potentially unaligned pixels into t1 and t2. */
379
-        vector unsigned char pix2l = vec_ld(0,  pix2);
380
-        vector unsigned char pix2r = vec_ld(15, pix2);
381
-        vector unsigned char t1 = vec_ld(0, pix1);
382
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
383
-
384
-        /* Since we want to use unsigned chars, we can take advantage
385
-         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
386
-
387
-        /* Calculate abs differences vector. */
388
-        vector unsigned char t3 = vec_max(t1, t2);
389
-        vector unsigned char t4 = vec_min(t1, t2);
390
-        vector unsigned char t5 = vec_sub(t3, t4);
391
-
392
-        /* Square the values and add them to our sum. */
393
-        sum = vec_msum(t5, t5, sum);
394
-
395
-        pix1 += line_size;
396
-        pix2 += line_size;
397
-    }
398
-
399
-    /* Sum up the four partial sums, and put the result into s. */
400
-    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
401
-    sumsqr = vec_splat(sumsqr, 3);
402
-    vec_ste(sumsqr, 0, &s);
403
-
404
-    return s;
405
-}
406
-
407
-static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
408
-                                     uint8_t *src, int stride, int h)
409
-{
410
-    int sum;
411
-    register const vector unsigned char vzero =
412
-        (const vector unsigned char) vec_splat_u8(0);
413
-    register vector signed short temp0, temp1, temp2, temp3, temp4,
414
-                                 temp5, temp6, temp7;
415
-    {
416
-        register const vector signed short vprod1 =
417
-            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
418
-        register const vector signed short vprod2 =
419
-            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
420
-        register const vector signed short vprod3 =
421
-            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
422
-        register const vector unsigned char perm1 =
423
-            (const vector unsigned char)
424
-            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
425
-              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
426
-        register const vector unsigned char perm2 =
427
-            (const vector unsigned char)
428
-            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
429
-              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
430
-        register const vector unsigned char perm3 =
431
-            (const vector unsigned char)
432
-            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
433
-              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
434
-
435
-#define ONEITERBUTTERFLY(i, res)                                            \
436
-    {                                                                       \
437
-        register vector unsigned char src1 = vec_ld(stride * i, src);       \
438
-        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
439
-        register vector unsigned char srcO =                                \
440
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
441
-        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
442
-        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
443
-        register vector unsigned char dstO =                                \
444
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
445
-                                                                            \
446
-        /* Promote the unsigned chars to signed shorts. */                  \
447
-        /* We're in the 8x8 function, we only care for the first 8. */      \
448
-        register vector signed short srcV =                                 \
449
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
450
-                                             (vector signed char) srcO);    \
451
-        register vector signed short dstV =                                 \
452
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
453
-                                             (vector signed char) dstO);    \
454
-                                                                            \
455
-        /* subtractions inside the first butterfly */                       \
456
-        register vector signed short but0 = vec_sub(srcV, dstV);            \
457
-        register vector signed short op1  = vec_perm(but0, but0, perm1);    \
458
-        register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
459
-        register vector signed short op2  = vec_perm(but1, but1, perm2);    \
460
-        register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
461
-        register vector signed short op3  = vec_perm(but2, but2, perm3);    \
462
-        res  = vec_mladd(but2, vprod3, op3);                                \
463
-    }
464
-        ONEITERBUTTERFLY(0, temp0);
465
-        ONEITERBUTTERFLY(1, temp1);
466
-        ONEITERBUTTERFLY(2, temp2);
467
-        ONEITERBUTTERFLY(3, temp3);
468
-        ONEITERBUTTERFLY(4, temp4);
469
-        ONEITERBUTTERFLY(5, temp5);
470
-        ONEITERBUTTERFLY(6, temp6);
471
-        ONEITERBUTTERFLY(7, temp7);
472
-    }
473
-#undef ONEITERBUTTERFLY
474
-    {
475
-        register vector signed int vsum;
476
-        register vector signed short line0  = vec_add(temp0, temp1);
477
-        register vector signed short line1  = vec_sub(temp0, temp1);
478
-        register vector signed short line2  = vec_add(temp2, temp3);
479
-        register vector signed short line3  = vec_sub(temp2, temp3);
480
-        register vector signed short line4  = vec_add(temp4, temp5);
481
-        register vector signed short line5  = vec_sub(temp4, temp5);
482
-        register vector signed short line6  = vec_add(temp6, temp7);
483
-        register vector signed short line7  = vec_sub(temp6, temp7);
484
-
485
-        register vector signed short line0B = vec_add(line0, line2);
486
-        register vector signed short line2B = vec_sub(line0, line2);
487
-        register vector signed short line1B = vec_add(line1, line3);
488
-        register vector signed short line3B = vec_sub(line1, line3);
489
-        register vector signed short line4B = vec_add(line4, line6);
490
-        register vector signed short line6B = vec_sub(line4, line6);
491
-        register vector signed short line5B = vec_add(line5, line7);
492
-        register vector signed short line7B = vec_sub(line5, line7);
493
-
494
-        register vector signed short line0C = vec_add(line0B, line4B);
495
-        register vector signed short line4C = vec_sub(line0B, line4B);
496
-        register vector signed short line1C = vec_add(line1B, line5B);
497
-        register vector signed short line5C = vec_sub(line1B, line5B);
498
-        register vector signed short line2C = vec_add(line2B, line6B);
499
-        register vector signed short line6C = vec_sub(line2B, line6B);
500
-        register vector signed short line3C = vec_add(line3B, line7B);
501
-        register vector signed short line7C = vec_sub(line3B, line7B);
502
-
503
-        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
504
-        vsum = vec_sum4s(vec_abs(line1C), vsum);
505
-        vsum = vec_sum4s(vec_abs(line2C), vsum);
506
-        vsum = vec_sum4s(vec_abs(line3C), vsum);
507
-        vsum = vec_sum4s(vec_abs(line4C), vsum);
508
-        vsum = vec_sum4s(vec_abs(line5C), vsum);
509
-        vsum = vec_sum4s(vec_abs(line6C), vsum);
510
-        vsum = vec_sum4s(vec_abs(line7C), vsum);
511
-        vsum = vec_sums(vsum, (vector signed int) vzero);
512
-        vsum = vec_splat(vsum, 3);
513
-        vec_ste(vsum, 0, &sum);
514
-    }
515
-    return sum;
516
-}
517
-
518
-/*
519
- * 16x8 works with 16 elements; it avoids replicating loads, and
520
- * gives the compiler more room for scheduling. It's only used from
521
- * inside hadamard8_diff16_altivec.
522
- *
523
- * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
524
- * a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
525
- * registers by itself. The following code includes hand-made register
526
- * allocation. It's not clean, but on a 7450 the resulting code is much faster
527
- * (best case falls from 700+ cycles to 550).
528
- *
529
- * xlc doesn't add spill code, but it doesn't know how to schedule for the
530
- * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
531
- * 25% fewer instructions...)
532
- *
533
- * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
534
- * but xlc goes to around 660 on the regular C code...
535
- */
536
-static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
537
-                                      uint8_t *src, int stride, int h)
538
-{
539
-    int sum;
540
-    register vector signed short
541
-        temp0 __asm__ ("v0"),
542
-        temp1 __asm__ ("v1"),
543
-        temp2 __asm__ ("v2"),
544
-        temp3 __asm__ ("v3"),
545
-        temp4 __asm__ ("v4"),
546
-        temp5 __asm__ ("v5"),
547
-        temp6 __asm__ ("v6"),
548
-        temp7 __asm__ ("v7");
549
-    register vector signed short
550
-        temp0S __asm__ ("v8"),
551
-        temp1S __asm__ ("v9"),
552
-        temp2S __asm__ ("v10"),
553
-        temp3S __asm__ ("v11"),
554
-        temp4S __asm__ ("v12"),
555
-        temp5S __asm__ ("v13"),
556
-        temp6S __asm__ ("v14"),
557
-        temp7S __asm__ ("v15");
558
-    register const vector unsigned char vzero __asm__ ("v31") =
559
-        (const vector unsigned char) vec_splat_u8(0);
560
-    {
561
-        register const vector signed short vprod1 __asm__ ("v16") =
562
-            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
563
-
564
-        register const vector signed short vprod2 __asm__ ("v17") =
565
-            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
566
-
567
-        register const vector signed short vprod3 __asm__ ("v18") =
568
-            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
569
-
570
-        register const vector unsigned char perm1 __asm__ ("v19") =
571
-            (const vector unsigned char)
572
-            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
573
-              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
574
-
575
-        register const vector unsigned char perm2 __asm__ ("v20") =
576
-            (const vector unsigned char)
577
-            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
578
-              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
579
-
580
-        register const vector unsigned char perm3 __asm__ ("v21") =
581
-            (const vector unsigned char)
582
-            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
583
-              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
584
-
585
-#define ONEITERBUTTERFLY(i, res1, res2)                                     \
586
-    {                                                                       \
587
-        register vector unsigned char src1 __asm__ ("v22") =                \
588
-            vec_ld(stride * i, src);                                        \
589
-        register vector unsigned char src2 __asm__ ("v23") =                \
590
-            vec_ld(stride * i + 16, src);                                   \
591
-        register vector unsigned char srcO __asm__ ("v22") =                \
592
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
593
-        register vector unsigned char dst1 __asm__ ("v24") =                \
594
-            vec_ld(stride * i, dst);                                        \
595
-        register vector unsigned char dst2 __asm__ ("v25") =                \
596
-            vec_ld(stride * i + 16, dst);                                   \
597
-        register vector unsigned char dstO __asm__ ("v23") =                \
598
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
599
-                                                                            \
600
-        /* Promote the unsigned chars to signed shorts. */                  \
601
-        register vector signed short srcV __asm__ ("v24") =                 \
602
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
603
-                                             (vector signed char) srcO);    \
604
-        register vector signed short dstV __asm__ ("v25") =                 \
605
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
606
-                                             (vector signed char) dstO);    \
607
-        register vector signed short srcW __asm__ ("v26") =                 \
608
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
609
-                                             (vector signed char) srcO);    \
610
-        register vector signed short dstW __asm__ ("v27") =                 \
611
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
612
-                                             (vector signed char) dstO);    \
613
-                                                                            \
614
-        /* subtractions inside the first butterfly */                       \
615
-        register vector signed short but0  __asm__ ("v28") =                \
616
-            vec_sub(srcV, dstV);                                            \
617
-        register vector signed short but0S __asm__ ("v29") =                \
618
-            vec_sub(srcW, dstW);                                            \
619
-        register vector signed short op1   __asm__ ("v30") =                \
620
-            vec_perm(but0, but0, perm1);                                    \
621
-        register vector signed short but1  __asm__ ("v22") =                \
622
-            vec_mladd(but0, vprod1, op1);                                   \
623
-        register vector signed short op1S  __asm__ ("v23") =                \
624
-            vec_perm(but0S, but0S, perm1);                                  \
625
-        register vector signed short but1S __asm__ ("v24") =                \
626
-            vec_mladd(but0S, vprod1, op1S);                                 \
627
-        register vector signed short op2   __asm__ ("v25") =                \
628
-            vec_perm(but1, but1, perm2);                                    \
629
-        register vector signed short but2  __asm__ ("v26") =                \
630
-            vec_mladd(but1, vprod2, op2);                                   \
631
-        register vector signed short op2S  __asm__ ("v27") =                \
632
-            vec_perm(but1S, but1S, perm2);                                  \
633
-        register vector signed short but2S __asm__ ("v28") =                \
634
-            vec_mladd(but1S, vprod2, op2S);                                 \
635
-        register vector signed short op3   __asm__ ("v29") =                \
636
-            vec_perm(but2, but2, perm3);                                    \
637
-        register vector signed short op3S  __asm__ ("v30") =                \
638
-            vec_perm(but2S, but2S, perm3);                                  \
639
-        res1 = vec_mladd(but2, vprod3, op3);                                \
640
-        res2 = vec_mladd(but2S, vprod3, op3S);                              \
641
-    }
642
-        ONEITERBUTTERFLY(0, temp0, temp0S);
643
-        ONEITERBUTTERFLY(1, temp1, temp1S);
644
-        ONEITERBUTTERFLY(2, temp2, temp2S);
645
-        ONEITERBUTTERFLY(3, temp3, temp3S);
646
-        ONEITERBUTTERFLY(4, temp4, temp4S);
647
-        ONEITERBUTTERFLY(5, temp5, temp5S);
648
-        ONEITERBUTTERFLY(6, temp6, temp6S);
649
-        ONEITERBUTTERFLY(7, temp7, temp7S);
650
-    }
651
-#undef ONEITERBUTTERFLY
652
-    {
653
-        register vector signed int vsum;
654
-
655
-        register vector signed short line0  = vec_add(temp0, temp1);
656
-        register vector signed short line1  = vec_sub(temp0, temp1);
657
-        register vector signed short line2  = vec_add(temp2, temp3);
658
-        register vector signed short line3  = vec_sub(temp2, temp3);
659
-        register vector signed short line4  = vec_add(temp4, temp5);
660
-        register vector signed short line5  = vec_sub(temp4, temp5);
661
-        register vector signed short line6  = vec_add(temp6, temp7);
662
-        register vector signed short line7  = vec_sub(temp6, temp7);
663
-
664
-        register vector signed short line0B = vec_add(line0, line2);
665
-        register vector signed short line2B = vec_sub(line0, line2);
666
-        register vector signed short line1B = vec_add(line1, line3);
667
-        register vector signed short line3B = vec_sub(line1, line3);
668
-        register vector signed short line4B = vec_add(line4, line6);
669
-        register vector signed short line6B = vec_sub(line4, line6);
670
-        register vector signed short line5B = vec_add(line5, line7);
671
-        register vector signed short line7B = vec_sub(line5, line7);
672
-
673
-        register vector signed short line0C = vec_add(line0B, line4B);
674
-        register vector signed short line4C = vec_sub(line0B, line4B);
675
-        register vector signed short line1C = vec_add(line1B, line5B);
676
-        register vector signed short line5C = vec_sub(line1B, line5B);
677
-        register vector signed short line2C = vec_add(line2B, line6B);
678
-        register vector signed short line6C = vec_sub(line2B, line6B);
679
-        register vector signed short line3C = vec_add(line3B, line7B);
680
-        register vector signed short line7C = vec_sub(line3B, line7B);
681
-
682
-        register vector signed short line0S = vec_add(temp0S, temp1S);
683
-        register vector signed short line1S = vec_sub(temp0S, temp1S);
684
-        register vector signed short line2S = vec_add(temp2S, temp3S);
685
-        register vector signed short line3S = vec_sub(temp2S, temp3S);
686
-        register vector signed short line4S = vec_add(temp4S, temp5S);
687
-        register vector signed short line5S = vec_sub(temp4S, temp5S);
688
-        register vector signed short line6S = vec_add(temp6S, temp7S);
689
-        register vector signed short line7S = vec_sub(temp6S, temp7S);
690
-
691
-        register vector signed short line0BS = vec_add(line0S, line2S);
692
-        register vector signed short line2BS = vec_sub(line0S, line2S);
693
-        register vector signed short line1BS = vec_add(line1S, line3S);
694
-        register vector signed short line3BS = vec_sub(line1S, line3S);
695
-        register vector signed short line4BS = vec_add(line4S, line6S);
696
-        register vector signed short line6BS = vec_sub(line4S, line6S);
697
-        register vector signed short line5BS = vec_add(line5S, line7S);
698
-        register vector signed short line7BS = vec_sub(line5S, line7S);
699
-
700
-        register vector signed short line0CS = vec_add(line0BS, line4BS);
701
-        register vector signed short line4CS = vec_sub(line0BS, line4BS);
702
-        register vector signed short line1CS = vec_add(line1BS, line5BS);
703
-        register vector signed short line5CS = vec_sub(line1BS, line5BS);
704
-        register vector signed short line2CS = vec_add(line2BS, line6BS);
705
-        register vector signed short line6CS = vec_sub(line2BS, line6BS);
706
-        register vector signed short line3CS = vec_add(line3BS, line7BS);
707
-        register vector signed short line7CS = vec_sub(line3BS, line7BS);
708
-
709
-        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
710
-        vsum = vec_sum4s(vec_abs(line1C), vsum);
711
-        vsum = vec_sum4s(vec_abs(line2C), vsum);
712
-        vsum = vec_sum4s(vec_abs(line3C), vsum);
713
-        vsum = vec_sum4s(vec_abs(line4C), vsum);
714
-        vsum = vec_sum4s(vec_abs(line5C), vsum);
715
-        vsum = vec_sum4s(vec_abs(line6C), vsum);
716
-        vsum = vec_sum4s(vec_abs(line7C), vsum);
717
-
718
-        vsum = vec_sum4s(vec_abs(line0CS), vsum);
719
-        vsum = vec_sum4s(vec_abs(line1CS), vsum);
720
-        vsum = vec_sum4s(vec_abs(line2CS), vsum);
721
-        vsum = vec_sum4s(vec_abs(line3CS), vsum);
722
-        vsum = vec_sum4s(vec_abs(line4CS), vsum);
723
-        vsum = vec_sum4s(vec_abs(line5CS), vsum);
724
-        vsum = vec_sum4s(vec_abs(line6CS), vsum);
725
-        vsum = vec_sum4s(vec_abs(line7CS), vsum);
726
-        vsum = vec_sums(vsum, (vector signed int) vzero);
727
-        vsum = vec_splat(vsum, 3);
728
-        vec_ste(vsum, 0, &sum);
729
-    }
730
-    return sum;
731
-}
732
-
733
-static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
734
-                                    uint8_t *src, int stride, int h)
735
-{
736
-    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
737
-
738
-    if (h == 16) {
739
-        dst   += 8 * stride;
740
-        src   += 8 * stride;
741
-        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
742
-    }
743
-    return score;
744
-}
745
-#endif /* HAVE_ALTIVEC */
746
-
747
-av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
748
-{
749
-#if HAVE_ALTIVEC
750
-    if (!PPC_ALTIVEC(av_get_cpu_flags()))
751
-        return;
752
-
753
-    c->pix_abs[0][1] = sad16_x2_altivec;
754
-    c->pix_abs[0][2] = sad16_y2_altivec;
755
-    c->pix_abs[0][3] = sad16_xy2_altivec;
756
-    c->pix_abs[0][0] = sad16_altivec;
757
-    c->pix_abs[1][0] = sad8_altivec;
758
-
759
-    c->sad[0] = sad16_altivec;
760
-    c->sad[1] = sad8_altivec;
761
-    c->sse[0] = sse16_altivec;
762
-    c->sse[1] = sse8_altivec;
763
-
764
-    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
765
-    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
766
-#endif /* HAVE_ALTIVEC */
767
-}
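All of the AltiVec kernels in this file (both the copy deleted above and the me_cmp copy added below) rely on the same pre-VSX unaligned-load idiom: vec_ld() plus a permute derived from the address with vec_lvsl(). A minimal sketch of that idiom, using an illustrative helper name that is not part of this commit:

#include <altivec.h>
#include <stdint.h>

/* Illustrative only: vec_ld() ignores the low four bits of its effective
 * address and loads the enclosing 16-byte-aligned block, so an unaligned
 * load is built from two aligned loads plus a vec_perm(). An offset of 15
 * for the second load is enough when only p[0..15] is needed (and does not
 * touch the block after p[15] when p happens to be aligned); sad16_x2 below
 * uses an offset of 16 because it also reads p[16]. */
static inline vector unsigned char load16_unaligned(const uint8_t *p)
{
    vector unsigned char left  = vec_ld(0,  p);  /* block containing p      */
    vector unsigned char right = vec_ld(15, p);  /* block containing p + 15 */
    vector unsigned char perm  = vec_lvsl(0, p); /* permute control, p & 15 */
    return vec_perm(left, right, perm);          /* the 16 bytes at p       */
}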
768 1
new file mode 100644
... ...
@@ -0,0 +1,767 @@
0
+/*
1
+ * Copyright (c) 2002 Brian Foley
2
+ * Copyright (c) 2002 Dieter Shirley
3
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
4
+ *
5
+ * This file is part of Libav.
6
+ *
7
+ * Libav is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU Lesser General Public
9
+ * License as published by the Free Software Foundation; either
10
+ * version 2.1 of the License, or (at your option) any later version.
11
+ *
12
+ * Libav is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * Lesser General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU Lesser General Public
18
+ * License along with Libav; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ */
21
+
22
+#include "config.h"
23
+#if HAVE_ALTIVEC_H
24
+#include <altivec.h>
25
+#endif
26
+
27
+#include "libavutil/attributes.h"
28
+#include "libavutil/cpu.h"
29
+#include "libavutil/ppc/cpu.h"
30
+#include "libavutil/ppc/types_altivec.h"
31
+#include "libavutil/ppc/util_altivec.h"
32
+#include "libavcodec/avcodec.h"
33
+#include "libavcodec/mpegvideo.h"
34
+#include "libavcodec/me_cmp.h"
35
+
36
+#if HAVE_ALTIVEC
37
+static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38
+                            int line_size, int h)
39
+{
40
+    int i, s = 0;
41
+    const vector unsigned char zero =
42
+        (const vector unsigned char) vec_splat_u8(0);
43
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
44
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
45
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
46
+    vector signed int sumdiffs;
47
+
48
+    for (i = 0; i < h; i++) {
49
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
50
+         * pix1v: pix1[0] - pix1[15]
51
+         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
52
+        vector unsigned char pix1v  = vec_ld(0,  pix1);
53
+        vector unsigned char pix2l  = vec_ld(0,  pix2);
54
+        vector unsigned char pix2r  = vec_ld(16, pix2);
55
+        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
56
+        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
57
+
58
+        /* Calculate the average vector. */
59
+        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
60
+
61
+        /* Calculate a sum of abs differences vector. */
62
+        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
63
+                                          vec_min(pix1v, avgv));
64
+
65
+        /* Add each 4 pixel group together and put 4 results into sad. */
66
+        sad = vec_sum4s(t5, sad);
67
+
68
+        pix1 += line_size;
69
+        pix2 += line_size;
70
+    }
71
+    /* Sum up the four partial sums, and put the result into s. */
72
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
73
+    sumdiffs = vec_splat(sumdiffs, 3);
74
+    vec_ste(sumdiffs, 0, &s);
75
+
76
+    return s;
77
+}
78
+
79
+static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
80
+                            int line_size, int h)
81
+{
82
+    int i, s = 0;
83
+    const vector unsigned char zero =
84
+        (const vector unsigned char) vec_splat_u8(0);
85
+    vector unsigned char perm = vec_lvsl(0, pix2);
86
+    vector unsigned char pix1v, pix3v, avgv, t5;
87
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
88
+    vector signed int sumdiffs;
89
+    uint8_t *pix3 = pix2 + line_size;
90
+
91
+    /* Because pix3 = pix2 + line_size, the pix3 of one iteration
92
+     * becomes pix2 in the next iteration. We can use this fact to
93
+     * avoid a potentially expensive unaligned read each time
94
+     * around the loop.
95
+     * Read unaligned pixels into our vectors. The vectors are as follows:
96
+     * pix2v: pix2[0] - pix2[15]
97
+     */
98
+    vector unsigned char pix2l = vec_ld(0,  pix2);
99
+    vector unsigned char pix2r = vec_ld(15, pix2);
100
+    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
101
+
102
+    for (i = 0; i < h; i++) {
103
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
104
+         * pix1v: pix1[0] - pix1[15]
105
+         * pix3v: pix3[0] - pix3[15] */
106
+        pix1v = vec_ld(0,  pix1);
107
+
108
+        pix2l = vec_ld(0,  pix3);
109
+        pix2r = vec_ld(15, pix3);
110
+        pix3v = vec_perm(pix2l, pix2r, perm);
111
+
112
+        /* Calculate the average vector. */
113
+        avgv = vec_avg(pix2v, pix3v);
114
+
115
+        /* Calculate a sum of abs differences vector. */
116
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
117
+
118
+        /* Add each 4 pixel group together and put 4 results into sad. */
119
+        sad = vec_sum4s(t5, sad);
120
+
121
+        pix1 += line_size;
122
+        pix2v = pix3v;
123
+        pix3 += line_size;
124
+    }
125
+
126
+    /* Sum up the four partial sums, and put the result into s. */
127
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
128
+    sumdiffs = vec_splat(sumdiffs, 3);
129
+    vec_ste(sumdiffs, 0, &s);
130
+    return s;
131
+}
132
+
133
+static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
134
+                             int line_size, int h)
135
+{
136
+    int i, s = 0;
137
+    uint8_t *pix3 = pix2 + line_size;
138
+    const vector unsigned char zero =
139
+        (const vector unsigned char) vec_splat_u8(0);
140
+    const vector unsigned short two =
141
+        (const vector unsigned short) vec_splat_u16(2);
142
+    vector unsigned char avgv, t5;
143
+    vector unsigned char perm1 = vec_lvsl(0, pix2);
144
+    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
145
+    vector unsigned char pix1v, pix3v, pix3iv;
146
+    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
147
+    vector unsigned short avghv, avglv;
148
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
149
+    vector signed int sumdiffs;
150
+
151
+    /* Because pix3 = pix2 + line_size, the pix3 of one iteration
152
+     * becomes pix2 in the next iteration. We can use this fact to
153
+     * avoid a potentially expensive unaligned read, as well as some
154
+     * splitting and vector addition, each time around the loop.
155
+     * Read unaligned pixels into our vectors. The vectors are as follows:
156
+     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
157
+     * Split the pixel vectors into shorts. */
158
+    vector unsigned char pix2l  = vec_ld(0,  pix2);
159
+    vector unsigned char pix2r  = vec_ld(16, pix2);
160
+    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
161
+    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
162
+
163
+    vector unsigned short pix2hv  =
164
+        (vector unsigned short) vec_mergeh(zero, pix2v);
165
+    vector unsigned short pix2lv  =
166
+        (vector unsigned short) vec_mergel(zero, pix2v);
167
+    vector unsigned short pix2ihv =
168
+        (vector unsigned short) vec_mergeh(zero, pix2iv);
169
+    vector unsigned short pix2ilv =
170
+        (vector unsigned short) vec_mergel(zero, pix2iv);
171
+    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
172
+    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
173
+    vector unsigned short t3, t4;
174
+
175
+    for (i = 0; i < h; i++) {
176
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
177
+         * pix1v: pix1[0] - pix1[15]
178
+         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
179
+        pix1v  = vec_ld(0, pix1);
180
+
181
+        pix2l  = vec_ld(0, pix3);
182
+        pix2r  = vec_ld(16, pix3);
183
+        pix3v  = vec_perm(pix2l, pix2r, perm1);
184
+        pix3iv = vec_perm(pix2l, pix2r, perm2);
185
+
186
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
187
+         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
188
+         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
189
+         * it should be 1. Instead, we have to split the pixel vectors into
190
+         * vectors of shorts and do the averaging by hand. */
191
+
192
+        /* Split the pixel vectors into shorts. */
193
+        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
194
+        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
195
+        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
196
+        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
197
+
198
+        /* Do the averaging on them. */
199
+        t3 = vec_add(pix3hv, pix3ihv);
200
+        t4 = vec_add(pix3lv, pix3ilv);
201
+
202
+        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
203
+        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
204
+
205
+        /* Pack the shorts back into a result. */
206
+        avgv = vec_pack(avghv, avglv);
207
+
208
+        /* Calculate a sum of abs differences vector. */
209
+        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
210
+
211
+        /* Add each 4 pixel group together and put 4 results into sad. */
212
+        sad = vec_sum4s(t5, sad);
213
+
214
+        pix1 += line_size;
215
+        pix3 += line_size;
216
+        /* Transfer the calculated values for pix3 into pix2. */
217
+        t1 = t3;
218
+        t2 = t4;
219
+    }
220
+    /* Sum up the four partial sums, and put the result into s. */
221
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
222
+    sumdiffs = vec_splat(sumdiffs, 3);
223
+    vec_ste(sumdiffs, 0, &s);
224
+
225
+    return s;
226
+}
227
+
228
+static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
229
+                         int line_size, int h)
230
+{
231
+    int i, s;
232
+    const vector unsigned int zero =
233
+        (const vector unsigned int) vec_splat_u32(0);
234
+    vector unsigned char perm = vec_lvsl(0, pix2);
235
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
236
+    vector signed int sumdiffs;
237
+
238
+    for (i = 0; i < h; i++) {
239
+        /* Read potentially unaligned pixels into t1 and t2. */
240
+        vector unsigned char pix2l = vec_ld(0,  pix2);
241
+        vector unsigned char pix2r = vec_ld(15, pix2);
242
+        vector unsigned char t1 = vec_ld(0, pix1);
243
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
244
+
245
+        /* Calculate a sum of abs differences vector. */
246
+        vector unsigned char t3 = vec_max(t1, t2);
247
+        vector unsigned char t4 = vec_min(t1, t2);
248
+        vector unsigned char t5 = vec_sub(t3, t4);
249
+
250
+        /* Add each 4 pixel group together and put 4 results into sad. */
251
+        sad = vec_sum4s(t5, sad);
252
+
253
+        pix1 += line_size;
254
+        pix2 += line_size;
255
+    }
256
+
257
+    /* Sum up the four partial sums, and put the result into s. */
258
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
259
+    sumdiffs = vec_splat(sumdiffs, 3);
260
+    vec_ste(sumdiffs, 0, &s);
261
+
262
+    return s;
263
+}
264
+
265
+static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
266
+                        int line_size, int h)
267
+{
268
+    int i, s;
269
+    const vector unsigned int zero =
270
+        (const vector unsigned int) vec_splat_u32(0);
271
+    const vector unsigned char permclear =
272
+        (vector unsigned char)
273
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
274
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
275
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
276
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
277
+    vector signed int sumdiffs;
278
+
279
+    for (i = 0; i < h; i++) {
280
+        /* Read potentially unaligned pixels into t1 and t2.
281
+         * Since we're reading 16 pixels, and actually only want 8,
282
+         * mask out the last 8 pixels. The 0s don't change the sum. */
283
+        vector unsigned char pix1l = vec_ld(0, pix1);
284
+        vector unsigned char pix1r = vec_ld(7, pix1);
285
+        vector unsigned char pix2l = vec_ld(0, pix2);
286
+        vector unsigned char pix2r = vec_ld(7, pix2);
287
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
288
+                                          permclear);
289
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
290
+                                          permclear);
291
+
292
+        /* Calculate a sum of abs differences vector. */
293
+        vector unsigned char t3 = vec_max(t1, t2);
294
+        vector unsigned char t4 = vec_min(t1, t2);
295
+        vector unsigned char t5 = vec_sub(t3, t4);
296
+
297
+        /* Add each 4 pixel group together and put 4 results into sad. */
298
+        sad = vec_sum4s(t5, sad);
299
+
300
+        pix1 += line_size;
301
+        pix2 += line_size;
302
+    }
303
+
304
+    /* Sum up the four partial sums, and put the result into s. */
305
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
306
+    sumdiffs = vec_splat(sumdiffs, 3);
307
+    vec_ste(sumdiffs, 0, &s);
308
+
309
+    return s;
310
+}
311
+
312
+/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
313
+ * It's the sad8_altivec code above w/ squaring added. */
314
+static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
315
+                        int line_size, int h)
316
+{
317
+    int i, s;
318
+    const vector unsigned int zero =
319
+        (const vector unsigned int) vec_splat_u32(0);
320
+    const vector unsigned char permclear =
321
+        (vector unsigned char)
322
+        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
323
+    vector unsigned char perm1 = vec_lvsl(0, pix1);
324
+    vector unsigned char perm2 = vec_lvsl(0, pix2);
325
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
326
+    vector signed int sumsqr;
327
+
328
+    for (i = 0; i < h; i++) {
329
+        /* Read potentially unaligned pixels into t1 and t2.
330
+         * Since we're reading 16 pixels, and actually only want 8,
331
+         * mask out the last 8 pixels. The 0s don't change the sum. */
332
+        vector unsigned char pix1l = vec_ld(0, pix1);
333
+        vector unsigned char pix1r = vec_ld(7, pix1);
334
+        vector unsigned char pix2l = vec_ld(0, pix2);
335
+        vector unsigned char pix2r = vec_ld(7, pix2);
336
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
337
+                                          permclear);
338
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
339
+                                          permclear);
340
+
341
+        /* Since we want to use unsigned chars, we can take advantage
342
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
343
+
344
+        /* Calculate abs differences vector. */
345
+        vector unsigned char t3 = vec_max(t1, t2);
346
+        vector unsigned char t4 = vec_min(t1, t2);
347
+        vector unsigned char t5 = vec_sub(t3, t4);
348
+
349
+        /* Square the values and add them to our sum. */
350
+        sum = vec_msum(t5, t5, sum);
351
+
352
+        pix1 += line_size;
353
+        pix2 += line_size;
354
+    }
355
+
356
+    /* Sum up the four partial sums, and put the result into s. */
357
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
358
+    sumsqr = vec_splat(sumsqr, 3);
359
+    vec_ste(sumsqr, 0, &s);
360
+
361
+    return s;
362
+}
363
+
364
+/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
365
+ * It's the sad16_altivec code above w/ squaring added. */
366
+static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
367
+                         int line_size, int h)
368
+{
369
+    int i, s;
370
+    const vector unsigned int zero =
371
+        (const vector unsigned int) vec_splat_u32(0);
372
+    vector unsigned char perm = vec_lvsl(0, pix2);
373
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
374
+    vector signed int sumsqr;
375
+
376
+    for (i = 0; i < h; i++) {
377
+        /* Read potentially unaligned pixels into t1 and t2. */
378
+        vector unsigned char pix2l = vec_ld(0,  pix2);
379
+        vector unsigned char pix2r = vec_ld(15, pix2);
380
+        vector unsigned char t1 = vec_ld(0, pix1);
381
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
382
+
383
+        /* Since we want to use unsigned chars, we can take advantage
384
+         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
385
+
386
+        /* Calculate abs differences vector. */
387
+        vector unsigned char t3 = vec_max(t1, t2);
388
+        vector unsigned char t4 = vec_min(t1, t2);
389
+        vector unsigned char t5 = vec_sub(t3, t4);
390
+
391
+        /* Square the values and add them to our sum. */
392
+        sum = vec_msum(t5, t5, sum);
393
+
394
+        pix1 += line_size;
395
+        pix2 += line_size;
396
+    }
397
+
398
+    /* Sum up the four partial sums, and put the result into s. */
399
+    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
400
+    sumsqr = vec_splat(sumsqr, 3);
401
+    vec_ste(sumsqr, 0, &s);
402
+
403
+    return s;
404
+}
405
+
406
+static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
407
+                                     uint8_t *src, int stride, int h)
408
+{
409
+    int sum;
410
+    register const vector unsigned char vzero =
411
+        (const vector unsigned char) vec_splat_u8(0);
412
+    register vector signed short temp0, temp1, temp2, temp3, temp4,
413
+                                 temp5, temp6, temp7;
414
+    {
415
+        register const vector signed short vprod1 =
416
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
417
+        register const vector signed short vprod2 =
418
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
419
+        register const vector signed short vprod3 =
420
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
421
+        register const vector unsigned char perm1 =
422
+            (const vector unsigned char)
423
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
424
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
425
+        register const vector unsigned char perm2 =
426
+            (const vector unsigned char)
427
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
428
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
429
+        register const vector unsigned char perm3 =
430
+            (const vector unsigned char)
431
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
432
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
433
+
434
+#define ONEITERBUTTERFLY(i, res)                                            \
435
+    {                                                                       \
436
+        register vector unsigned char src1 = vec_ld(stride * i, src);       \
437
+        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
438
+        register vector unsigned char srcO =                                \
439
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
440
+        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
441
+        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
442
+        register vector unsigned char dstO =                                \
443
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
444
+                                                                            \
445
+        /* Promote the unsigned chars to signed shorts. */                  \
446
+        /* We're in the 8x8 function, we only care for the first 8. */      \
447
+        register vector signed short srcV =                                 \
448
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
449
+                                             (vector signed char) srcO);    \
450
+        register vector signed short dstV =                                 \
451
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
452
+                                             (vector signed char) dstO);    \
453
+                                                                            \
454
+        /* subtractions inside the first butterfly */                       \
455
+        register vector signed short but0 = vec_sub(srcV, dstV);            \
456
+        register vector signed short op1  = vec_perm(but0, but0, perm1);    \
457
+        register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
458
+        register vector signed short op2  = vec_perm(but1, but1, perm2);    \
459
+        register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
460
+        register vector signed short op3  = vec_perm(but2, but2, perm3);    \
461
+        res  = vec_mladd(but2, vprod3, op3);                                \
462
+    }
463
+        ONEITERBUTTERFLY(0, temp0);
464
+        ONEITERBUTTERFLY(1, temp1);
465
+        ONEITERBUTTERFLY(2, temp2);
466
+        ONEITERBUTTERFLY(3, temp3);
467
+        ONEITERBUTTERFLY(4, temp4);
468
+        ONEITERBUTTERFLY(5, temp5);
469
+        ONEITERBUTTERFLY(6, temp6);
470
+        ONEITERBUTTERFLY(7, temp7);
471
+    }
472
+#undef ONEITERBUTTERFLY
473
+    {
474
+        register vector signed int vsum;
475
+        register vector signed short line0  = vec_add(temp0, temp1);
476
+        register vector signed short line1  = vec_sub(temp0, temp1);
477
+        register vector signed short line2  = vec_add(temp2, temp3);
478
+        register vector signed short line3  = vec_sub(temp2, temp3);
479
+        register vector signed short line4  = vec_add(temp4, temp5);
480
+        register vector signed short line5  = vec_sub(temp4, temp5);
481
+        register vector signed short line6  = vec_add(temp6, temp7);
482
+        register vector signed short line7  = vec_sub(temp6, temp7);
483
+
484
+        register vector signed short line0B = vec_add(line0, line2);
485
+        register vector signed short line2B = vec_sub(line0, line2);
486
+        register vector signed short line1B = vec_add(line1, line3);
487
+        register vector signed short line3B = vec_sub(line1, line3);
488
+        register vector signed short line4B = vec_add(line4, line6);
489
+        register vector signed short line6B = vec_sub(line4, line6);
490
+        register vector signed short line5B = vec_add(line5, line7);
491
+        register vector signed short line7B = vec_sub(line5, line7);
492
+
493
+        register vector signed short line0C = vec_add(line0B, line4B);
494
+        register vector signed short line4C = vec_sub(line0B, line4B);
495
+        register vector signed short line1C = vec_add(line1B, line5B);
496
+        register vector signed short line5C = vec_sub(line1B, line5B);
497
+        register vector signed short line2C = vec_add(line2B, line6B);
498
+        register vector signed short line6C = vec_sub(line2B, line6B);
499
+        register vector signed short line3C = vec_add(line3B, line7B);
500
+        register vector signed short line7C = vec_sub(line3B, line7B);
501
+
502
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
503
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
504
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
505
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
506
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
507
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
508
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
509
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
510
+        vsum = vec_sums(vsum, (vector signed int) vzero);
511
+        vsum = vec_splat(vsum, 3);
512
+        vec_ste(vsum, 0, &sum);
513
+    }
514
+    return sum;
515
+}
516
+
517
+/*
518
+ * 16x8 works with 16 elements; it avoids replicating loads, and
519
+ * gives the compiler more room for scheduling. It's only used from
520
+ * inside hadamard8_diff16_altivec.
521
+ *
522
+ * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
523
+ * a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
524
+ * registers by itself. The following code includes hand-made register
525
+ * allocation. It's not clean, but on a 7450 the resulting code is much faster
526
+ * (best case falls from 700+ cycles to 550).
527
+ *
528
+ * xlc doesn't add spill code, but it doesn't know how to schedule for the
529
+ * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
530
+ * 25% fewer instructions...)
531
+ *
532
+ * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
533
+ * but xlc goes to around 660 on the regular C code...
534
+ */
535
+static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
536
+                                      uint8_t *src, int stride, int h)
537
+{
538
+    int sum;
539
+    register vector signed short
540
+        temp0 __asm__ ("v0"),
541
+        temp1 __asm__ ("v1"),
542
+        temp2 __asm__ ("v2"),
543
+        temp3 __asm__ ("v3"),
544
+        temp4 __asm__ ("v4"),
545
+        temp5 __asm__ ("v5"),
546
+        temp6 __asm__ ("v6"),
547
+        temp7 __asm__ ("v7");
548
+    register vector signed short
549
+        temp0S __asm__ ("v8"),
550
+        temp1S __asm__ ("v9"),
551
+        temp2S __asm__ ("v10"),
552
+        temp3S __asm__ ("v11"),
553
+        temp4S __asm__ ("v12"),
554
+        temp5S __asm__ ("v13"),
555
+        temp6S __asm__ ("v14"),
556
+        temp7S __asm__ ("v15");
557
+    register const vector unsigned char vzero __asm__ ("v31") =
558
+        (const vector unsigned char) vec_splat_u8(0);
559
+    {
560
+        register const vector signed short vprod1 __asm__ ("v16") =
561
+            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
562
+
563
+        register const vector signed short vprod2 __asm__ ("v17") =
564
+            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
565
+
566
+        register const vector signed short vprod3 __asm__ ("v18") =
567
+            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
568
+
569
+        register const vector unsigned char perm1 __asm__ ("v19") =
570
+            (const vector unsigned char)
571
+            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
572
+              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
573
+
574
+        register const vector unsigned char perm2 __asm__ ("v20") =
575
+            (const vector unsigned char)
576
+            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
577
+              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
578
+
579
+        register const vector unsigned char perm3 __asm__ ("v21") =
580
+            (const vector unsigned char)
581
+            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
582
+              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
583
+
584
+#define ONEITERBUTTERFLY(i, res1, res2)                                     \
585
+    {                                                                       \
586
+        register vector unsigned char src1 __asm__ ("v22") =                \
587
+            vec_ld(stride * i, src);                                        \
588
+        register vector unsigned char src2 __asm__ ("v23") =                \
589
+            vec_ld(stride * i + 16, src);                                   \
590
+        register vector unsigned char srcO __asm__ ("v22") =                \
591
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
592
+        register vector unsigned char dst1 __asm__ ("v24") =                \
593
+            vec_ld(stride * i, dst);                                        \
594
+        register vector unsigned char dst2 __asm__ ("v25") =                \
595
+            vec_ld(stride * i + 16, dst);                                   \
596
+        register vector unsigned char dstO __asm__ ("v23") =                \
597
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
598
+                                                                            \
599
+        /* Promote the unsigned chars to signed shorts. */                  \
600
+        register vector signed short srcV __asm__ ("v24") =                 \
601
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
602
+                                             (vector signed char) srcO);    \
603
+        register vector signed short dstV __asm__ ("v25") =                 \
604
+            (vector signed short) vec_mergeh((vector signed char) vzero,    \
605
+                                             (vector signed char) dstO);    \
606
+        register vector signed short srcW __asm__ ("v26") =                 \
607
+            (vector signed short) vec_mergel((vector signed char) vzero,    \
608
+                                             (vector signed char) srcO);    \
609
+        register vector signed short dstW __asm__ ("v27") =                 \
610
+            (vector signed short) vec_mergel((vector signed char) vzero,    \
611
+                                             (vector signed char) dstO);    \
612
+                                                                            \
613
+        /* subtractions inside the first butterfly */                       \
614
+        register vector signed short but0  __asm__ ("v28") =                \
615
+            vec_sub(srcV, dstV);                                            \
616
+        register vector signed short but0S __asm__ ("v29") =                \
617
+            vec_sub(srcW, dstW);                                            \
618
+        register vector signed short op1   __asm__ ("v30") =                \
619
+            vec_perm(but0, but0, perm1);                                    \
620
+        register vector signed short but1  __asm__ ("v22") =                \
621
+            vec_mladd(but0, vprod1, op1);                                   \
622
+        register vector signed short op1S  __asm__ ("v23") =                \
623
+            vec_perm(but0S, but0S, perm1);                                  \
624
+        register vector signed short but1S __asm__ ("v24") =                \
625
+            vec_mladd(but0S, vprod1, op1S);                                 \
626
+        register vector signed short op2   __asm__ ("v25") =                \
627
+            vec_perm(but1, but1, perm2);                                    \
628
+        register vector signed short but2  __asm__ ("v26") =                \
629
+            vec_mladd(but1, vprod2, op2);                                   \
630
+        register vector signed short op2S  __asm__ ("v27") =                \
631
+            vec_perm(but1S, but1S, perm2);                                  \
632
+        register vector signed short but2S __asm__ ("v28") =                \
633
+            vec_mladd(but1S, vprod2, op2S);                                 \
634
+        register vector signed short op3   __asm__ ("v29") =                \
635
+            vec_perm(but2, but2, perm3);                                    \
636
+        register vector signed short op3S  __asm__ ("v30") =                \
637
+            vec_perm(but2S, but2S, perm3);                                  \
638
+        res1 = vec_mladd(but2, vprod3, op3);                                \
639
+        res2 = vec_mladd(but2S, vprod3, op3S);                              \
640
+    }
641
+        ONEITERBUTTERFLY(0, temp0, temp0S);
642
+        ONEITERBUTTERFLY(1, temp1, temp1S);
643
+        ONEITERBUTTERFLY(2, temp2, temp2S);
644
+        ONEITERBUTTERFLY(3, temp3, temp3S);
645
+        ONEITERBUTTERFLY(4, temp4, temp4S);
646
+        ONEITERBUTTERFLY(5, temp5, temp5S);
647
+        ONEITERBUTTERFLY(6, temp6, temp6S);
648
+        ONEITERBUTTERFLY(7, temp7, temp7S);
649
+    }
650
+#undef ONEITERBUTTERFLY
651
+    {
652
+        register vector signed int vsum;
653
+
654
+        register vector signed short line0  = vec_add(temp0, temp1);
655
+        register vector signed short line1  = vec_sub(temp0, temp1);
656
+        register vector signed short line2  = vec_add(temp2, temp3);
657
+        register vector signed short line3  = vec_sub(temp2, temp3);
658
+        register vector signed short line4  = vec_add(temp4, temp5);
659
+        register vector signed short line5  = vec_sub(temp4, temp5);
660
+        register vector signed short line6  = vec_add(temp6, temp7);
661
+        register vector signed short line7  = vec_sub(temp6, temp7);
662
+
663
+        register vector signed short line0B = vec_add(line0, line2);
664
+        register vector signed short line2B = vec_sub(line0, line2);
665
+        register vector signed short line1B = vec_add(line1, line3);
666
+        register vector signed short line3B = vec_sub(line1, line3);
667
+        register vector signed short line4B = vec_add(line4, line6);
668
+        register vector signed short line6B = vec_sub(line4, line6);
669
+        register vector signed short line5B = vec_add(line5, line7);
670
+        register vector signed short line7B = vec_sub(line5, line7);
671
+
672
+        register vector signed short line0C = vec_add(line0B, line4B);
673
+        register vector signed short line4C = vec_sub(line0B, line4B);
674
+        register vector signed short line1C = vec_add(line1B, line5B);
675
+        register vector signed short line5C = vec_sub(line1B, line5B);
676
+        register vector signed short line2C = vec_add(line2B, line6B);
677
+        register vector signed short line6C = vec_sub(line2B, line6B);
678
+        register vector signed short line3C = vec_add(line3B, line7B);
679
+        register vector signed short line7C = vec_sub(line3B, line7B);
680
+
681
+        register vector signed short line0S = vec_add(temp0S, temp1S);
682
+        register vector signed short line1S = vec_sub(temp0S, temp1S);
683
+        register vector signed short line2S = vec_add(temp2S, temp3S);
684
+        register vector signed short line3S = vec_sub(temp2S, temp3S);
685
+        register vector signed short line4S = vec_add(temp4S, temp5S);
686
+        register vector signed short line5S = vec_sub(temp4S, temp5S);
687
+        register vector signed short line6S = vec_add(temp6S, temp7S);
688
+        register vector signed short line7S = vec_sub(temp6S, temp7S);
689
+
690
+        register vector signed short line0BS = vec_add(line0S, line2S);
691
+        register vector signed short line2BS = vec_sub(line0S, line2S);
692
+        register vector signed short line1BS = vec_add(line1S, line3S);
693
+        register vector signed short line3BS = vec_sub(line1S, line3S);
694
+        register vector signed short line4BS = vec_add(line4S, line6S);
695
+        register vector signed short line6BS = vec_sub(line4S, line6S);
696
+        register vector signed short line5BS = vec_add(line5S, line7S);
697
+        register vector signed short line7BS = vec_sub(line5S, line7S);
698
+
699
+        register vector signed short line0CS = vec_add(line0BS, line4BS);
700
+        register vector signed short line4CS = vec_sub(line0BS, line4BS);
701
+        register vector signed short line1CS = vec_add(line1BS, line5BS);
702
+        register vector signed short line5CS = vec_sub(line1BS, line5BS);
703
+        register vector signed short line2CS = vec_add(line2BS, line6BS);
704
+        register vector signed short line6CS = vec_sub(line2BS, line6BS);
705
+        register vector signed short line3CS = vec_add(line3BS, line7BS);
706
+        register vector signed short line7CS = vec_sub(line3BS, line7BS);
707
+
708
+        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
709
+        vsum = vec_sum4s(vec_abs(line1C), vsum);
710
+        vsum = vec_sum4s(vec_abs(line2C), vsum);
711
+        vsum = vec_sum4s(vec_abs(line3C), vsum);
712
+        vsum = vec_sum4s(vec_abs(line4C), vsum);
713
+        vsum = vec_sum4s(vec_abs(line5C), vsum);
714
+        vsum = vec_sum4s(vec_abs(line6C), vsum);
715
+        vsum = vec_sum4s(vec_abs(line7C), vsum);
716
+
717
+        vsum = vec_sum4s(vec_abs(line0CS), vsum);
718
+        vsum = vec_sum4s(vec_abs(line1CS), vsum);
719
+        vsum = vec_sum4s(vec_abs(line2CS), vsum);
720
+        vsum = vec_sum4s(vec_abs(line3CS), vsum);
721
+        vsum = vec_sum4s(vec_abs(line4CS), vsum);
722
+        vsum = vec_sum4s(vec_abs(line5CS), vsum);
723
+        vsum = vec_sum4s(vec_abs(line6CS), vsum);
724
+        vsum = vec_sum4s(vec_abs(line7CS), vsum);
725
+        vsum = vec_sums(vsum, (vector signed int) vzero);
726
+        vsum = vec_splat(vsum, 3);
727
+        vec_ste(vsum, 0, &sum);
728
+    }
729
+    return sum;
730
+}
731
+
732
+static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
733
+                                    uint8_t *src, int stride, int h)
734
+{
735
+    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
736
+
737
+    if (h == 16) {
738
+        dst   += 8 * stride;
739
+        src   += 8 * stride;
740
+        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
741
+    }
742
+    return score;
743
+}
744
+#endif /* HAVE_ALTIVEC */
745
+
746
+av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
747
+{
748
+#if HAVE_ALTIVEC
749
+    if (!PPC_ALTIVEC(av_get_cpu_flags()))
750
+        return;
751
+
752
+    c->pix_abs[0][1] = sad16_x2_altivec;
753
+    c->pix_abs[0][2] = sad16_y2_altivec;
754
+    c->pix_abs[0][3] = sad16_xy2_altivec;
755
+    c->pix_abs[0][0] = sad16_altivec;
756
+    c->pix_abs[1][0] = sad8_altivec;
757
+
758
+    c->sad[0] = sad16_altivec;
759
+    c->sad[1] = sad8_altivec;
760
+    c->sse[0] = sse16_altivec;
761
+    c->sse[1] = sse8_altivec;
762
+
763
+    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
764
+    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
765
+#endif /* HAVE_ALTIVEC */
766
+}
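For reference, the two hadamard8_diff kernels above compute a SATD-style cost: the 8x8 (Walsh-)Hadamard transform of the src - dst difference block, summed in absolute value, with the 16x8 variant simply handling two 8-pixel-wide halves per pass. A plain scalar sketch of that cost, illustrative only and not the code this commit installs:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the 8x8 Hadamard-difference (SATD) cost. */
static int hadamard8_diff8x8_scalar(const uint8_t *dst, const uint8_t *src,
                                    int stride)
{
    int t[8][8], sum = 0;

    /* Difference block, the vec_sub(srcV, dstV) step above. */
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            t[y][x] = src[y * stride + x] - dst[y * stride + x];

    /* Horizontal Walsh-Hadamard transform of each row: three butterfly
     * stages, done with permutes and vec_mladd inside ONEITERBUTTERFLY. */
    for (int y = 0; y < 8; y++)
        for (int step = 1; step < 8; step <<= 1)
            for (int x = 0; x < 8; x += 2 * step)
                for (int k = 0; k < step; k++) {
                    int a = t[y][x + k], b = t[y][x + k + step];
                    t[y][x + k]        = a + b;
                    t[y][x + k + step] = a - b;
                }

    /* Vertical transform across rows: the lineN / lineNB / lineNC
     * vec_add/vec_sub chains above. */
    for (int x = 0; x < 8; x++)
        for (int step = 1; step < 8; step <<= 1)
            for (int y = 0; y < 8; y += 2 * step)
                for (int k = 0; k < step; k++) {
                    int a = t[y + k][x], b = t[y + k + step][x];
                    t[y + k][x]        = a + b;
                    t[y + k + step][x] = a - b;
                }

    /* Sum of absolute transformed differences (vec_abs, vec_sum4s, vec_sums). */
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            sum += abs(t[y][x]);

    return sum;
}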
... ...
@@ -27,8 +27,8 @@
27 27
  */
28 28
 
29 29
 #include "avcodec.h"
30
-#include "dsputil.h"
31 30
 #include "hpeldsp.h"
31
+#include "me_cmp.h"
32 32
 #include "mpegvideo.h"
33 33
 #include "h263.h"
34 34
 #include "internal.h"
... ...
@@ -306,7 +306,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
306 306
         s->m.current_picture.motion_val[0]   = s->motion_val8[plane] + 2;
307 307
         s->m.p_mv_table                      = s->motion_val16[plane] +
308 308
                                                s->m.mb_stride + 1;
309
-        s->m.dsp                             = s->dsp; // move
309
+        s->m.mecc                            = s->mecc; // move
310 310
         ff_init_me(&s->m);
311 311
 
312 312
         s->m.me.dia_size      = s->avctx->dia_size;
... ...
@@ -431,8 +431,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
431 431
                     best      = score[1] <= score[0];
432 432
 
433 433
                     vlc       = ff_svq1_block_type_vlc[SVQ1_BLOCK_SKIP];
434
-                    score[2]  = s->dsp.sse[0](NULL, src + 16 * x, ref,
435
-                                              stride, 16);
434
+                    score[2]  = s->mecc.sse[0](NULL, src + 16 * x, ref,
435
+                                               stride, 16);
436 436
                     score[2] += vlc[1] * lambda;
437 437
                     if (score[2] < score[best] && mx == 0 && my == 0) {
438 438
                         best = 2;
... ...
@@ -509,8 +509,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
509 509
     SVQ1EncContext *const s = avctx->priv_data;
510 510
     int ret;
511 511
 
512
-    ff_dsputil_init(&s->dsp, avctx);
513 512
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
513
+    ff_me_cmp_init(&s->mecc, avctx);
514 514
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
515 515
 
516 516
     avctx->coded_frame = av_frame_alloc();
... ...
@@ -25,9 +25,9 @@
25 25
 
26 26
 #include "libavutil/frame.h"
27 27
 #include "avcodec.h"
28
-#include "dsputil.h"
29 28
 #include "get_bits.h"
30 29
 #include "hpeldsp.h"
30
+#include "me_cmp.h"
31 31
 #include "mpegvideo.h"
32 32
 #include "put_bits.h"
33 33
 
... ...
@@ -37,7 +37,7 @@ typedef struct SVQ1EncContext {
37 37
      * of MpegEncContext, so this will be removed then. */
38 38
     MpegEncContext m;
39 39
     AVCodecContext *avctx;
40
-    DSPContext dsp;
40
+    MECmpContext mecc;
41 41
     HpelDSPContext hdsp;
42 42
     AVFrame *current_picture;
43 43
     AVFrame *last_picture;
... ...
@@ -39,8 +39,8 @@
39 39
 #include "libavutil/samplefmt.h"
40 40
 #include "libavutil/dict.h"
41 41
 #include "avcodec.h"
42
-#include "dsputil.h"
43 42
 #include "libavutil/opt.h"
43
+#include "me_cmp.h"
44 44
 #include "mpegvideo.h"
45 45
 #include "thread.h"
46 46
 #include "internal.h"
... ...
@@ -100,8 +100,8 @@ static av_cold void avcodec_init(void)
100 100
         return;
101 101
     initialized = 1;
102 102
 
103
-    if (CONFIG_DSPUTIL)
104
-        ff_dsputil_static_init();
103
+    if (CONFIG_ME_CMP)
104
+        ff_me_cmp_init_static();
105 105
 }
106 106
 
107 107
 int av_codec_is_encoder(const AVCodec *codec)
... ...
@@ -6,7 +6,6 @@ OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
6 6
 OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
7 7
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
8 8
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
9
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
10 9
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
11 10
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
12 11
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
... ...
@@ -19,6 +18,7 @@ OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
19 19
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
20 20
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
21 21
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
22
+OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
22 23
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
23 24
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
24 25
                                           x86/mpegvideodsp.o
... ...
@@ -70,7 +70,6 @@ YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
70 70
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
71 71
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
72 72
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
73
-YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputilenc.o
74 73
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
75 74
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
76 75
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
... ...
@@ -90,6 +89,7 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
90 90
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
91 91
                                           x86/hpeldsp.o
92 92
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
93
+YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
93 94
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
94 95
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
95 96
 YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
96 97
deleted file mode 100644
... ...
@@ -1,1321 +0,0 @@
1
-/*
2
- * MMX optimized DSP utils
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
- *
8
- * This file is part of Libav.
9
- *
10
- * Libav is free software; you can redistribute it and/or
11
- * modify it under the terms of the GNU Lesser General Public
12
- * License as published by the Free Software Foundation; either
13
- * version 2.1 of the License, or (at your option) any later version.
14
- *
15
- * Libav is distributed in the hope that it will be useful,
16
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
- * Lesser General Public License for more details.
19
- *
20
- * You should have received a copy of the GNU Lesser General Public
21
- * License along with Libav; if not, write to the Free Software
22
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
- */
24
-
25
-#include "libavutil/attributes.h"
26
-#include "libavutil/cpu.h"
27
-#include "libavutil/x86/asm.h"
28
-#include "libavutil/x86/cpu.h"
29
-#include "libavcodec/dsputil.h"
30
-#include "libavcodec/mpegvideo.h"
31
-
32
-#if HAVE_INLINE_ASM
33
-
34
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
35
-                    int line_size, int h)
36
-{
37
-    int tmp;
38
-
39
-    __asm__ volatile (
40
-        "movl         %4, %%ecx          \n"
41
-        "shr          $1, %%ecx          \n"
42
-        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
43
-        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
44
-        "1:                              \n"
45
-        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
46
-        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
47
-        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
48
-        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
49
-
50
-        /* todo: mm1-mm2, mm3-mm4 */
51
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
52
-        /*       OR the results to get absolute difference */
53
-        "movq      %%mm1, %%mm5          \n"
54
-        "movq      %%mm3, %%mm6          \n"
55
-        "psubusb   %%mm2, %%mm1          \n"
56
-        "psubusb   %%mm4, %%mm3          \n"
57
-        "psubusb   %%mm5, %%mm2          \n"
58
-        "psubusb   %%mm6, %%mm4          \n"
59
-
60
-        "por       %%mm1, %%mm2          \n"
61
-        "por       %%mm3, %%mm4          \n"
62
-
63
-        /* now convert to 16-bit vectors so we can square them */
64
-        "movq      %%mm2, %%mm1          \n"
65
-        "movq      %%mm4, %%mm3          \n"
66
-
67
-        "punpckhbw %%mm0, %%mm2          \n"
68
-        "punpckhbw %%mm0, %%mm4          \n"
69
-        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
70
-        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
71
-
72
-        "pmaddwd   %%mm2, %%mm2          \n"
73
-        "pmaddwd   %%mm4, %%mm4          \n"
74
-        "pmaddwd   %%mm1, %%mm1          \n"
75
-        "pmaddwd   %%mm3, %%mm3          \n"
76
-
77
-        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * line_size */
78
-        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * line_size */
79
-
80
-        "paddd     %%mm2, %%mm1          \n"
81
-        "paddd     %%mm4, %%mm3          \n"
82
-        "paddd     %%mm1, %%mm7          \n"
83
-        "paddd     %%mm3, %%mm7          \n"
84
-
85
-        "decl      %%ecx                 \n"
86
-        "jnz       1b                    \n"
87
-
88
-        "movq      %%mm7, %%mm1          \n"
89
-        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
90
-        "paddd     %%mm7, %%mm1          \n"
91
-        "movd      %%mm1, %2             \n"
92
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
93
-        : "r" ((x86_reg) line_size), "m" (h)
94
-        : "%ecx");
95
-
96
-    return tmp;
97
-}
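
The "subtract with saturation ... OR the results" comments in sse8_mmx above (and repeated in sse16_mmx below) describe the usual way to get a per-byte absolute difference when no unsigned-absolute instruction exists. In scalar terms, with hypothetical helper names:

```c
#include <stdint.h>

/* What psubusb does per byte: unsigned subtract, clamped at 0. */
static uint8_t sat_sub_u8(uint8_t a, uint8_t b)
{
    return a > b ? (uint8_t)(a - b) : 0;
}

/* One of the two saturated differences is 0, the other is |a - b|,
 * so OR-ing them (por) yields the absolute difference. */
static uint8_t abs_diff_u8(uint8_t a, uint8_t b)
{
    return sat_sub_u8(a, b) | sat_sub_u8(b, a);
}
```
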
98
-
99
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
100
-                     int line_size, int h)
101
-{
102
-    int tmp;
103
-
104
-    __asm__ volatile (
105
-        "movl %4, %%ecx\n"
106
-        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
107
-        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
108
-        "1:\n"
109
-        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
110
-        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
111
-        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
112
-        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
113
-
114
-        /* todo: mm1-mm2, mm3-mm4 */
115
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
116
-        /*       OR the results to get absolute difference */
117
-        "movq %%mm1, %%mm5\n"
118
-        "movq %%mm3, %%mm6\n"
119
-        "psubusb %%mm2, %%mm1\n"
120
-        "psubusb %%mm4, %%mm3\n"
121
-        "psubusb %%mm5, %%mm2\n"
122
-        "psubusb %%mm6, %%mm4\n"
123
-
124
-        "por %%mm1, %%mm2\n"
125
-        "por %%mm3, %%mm4\n"
126
-
127
-        /* now convert to 16-bit vectors so we can square them */
128
-        "movq %%mm2, %%mm1\n"
129
-        "movq %%mm4, %%mm3\n"
130
-
131
-        "punpckhbw %%mm0, %%mm2\n"
132
-        "punpckhbw %%mm0, %%mm4\n"
133
-        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
134
-        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
135
-
136
-        "pmaddwd %%mm2, %%mm2\n"
137
-        "pmaddwd %%mm4, %%mm4\n"
138
-        "pmaddwd %%mm1, %%mm1\n"
139
-        "pmaddwd %%mm3, %%mm3\n"
140
-
141
-        "add %3, %0\n"
142
-        "add %3, %1\n"
143
-
144
-        "paddd %%mm2, %%mm1\n"
145
-        "paddd %%mm4, %%mm3\n"
146
-        "paddd %%mm1, %%mm7\n"
147
-        "paddd %%mm3, %%mm7\n"
148
-
149
-        "decl %%ecx\n"
150
-        "jnz 1b\n"
151
-
152
-        "movq %%mm7, %%mm1\n"
153
-        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
154
-        "paddd %%mm7, %%mm1\n"
155
-        "movd %%mm1, %2\n"
156
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
157
-        : "r" ((x86_reg) line_size), "m" (h)
158
-        : "%ecx");
159
-
160
-    return tmp;
161
-}
162
-
163
-static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
164
-{
165
-    int tmp;
166
-
167
-    __asm__ volatile (
168
-        "movl %3, %%ecx\n"
169
-        "pxor %%mm7, %%mm7\n"
170
-        "pxor %%mm6, %%mm6\n"
171
-
172
-        "movq (%0), %%mm0\n"
173
-        "movq %%mm0, %%mm1\n"
174
-        "psllq $8, %%mm0\n"
175
-        "psrlq $8, %%mm1\n"
176
-        "psrlq $8, %%mm0\n"
177
-        "movq %%mm0, %%mm2\n"
178
-        "movq %%mm1, %%mm3\n"
179
-        "punpcklbw %%mm7, %%mm0\n"
180
-        "punpcklbw %%mm7, %%mm1\n"
181
-        "punpckhbw %%mm7, %%mm2\n"
182
-        "punpckhbw %%mm7, %%mm3\n"
183
-        "psubw %%mm1, %%mm0\n"
184
-        "psubw %%mm3, %%mm2\n"
185
-
186
-        "add %2, %0\n"
187
-
188
-        "movq (%0), %%mm4\n"
189
-        "movq %%mm4, %%mm1\n"
190
-        "psllq $8, %%mm4\n"
191
-        "psrlq $8, %%mm1\n"
192
-        "psrlq $8, %%mm4\n"
193
-        "movq %%mm4, %%mm5\n"
194
-        "movq %%mm1, %%mm3\n"
195
-        "punpcklbw %%mm7, %%mm4\n"
196
-        "punpcklbw %%mm7, %%mm1\n"
197
-        "punpckhbw %%mm7, %%mm5\n"
198
-        "punpckhbw %%mm7, %%mm3\n"
199
-        "psubw %%mm1, %%mm4\n"
200
-        "psubw %%mm3, %%mm5\n"
201
-        "psubw %%mm4, %%mm0\n"
202
-        "psubw %%mm5, %%mm2\n"
203
-        "pxor %%mm3, %%mm3\n"
204
-        "pxor %%mm1, %%mm1\n"
205
-        "pcmpgtw %%mm0, %%mm3\n\t"
206
-        "pcmpgtw %%mm2, %%mm1\n\t"
207
-        "pxor %%mm3, %%mm0\n"
208
-        "pxor %%mm1, %%mm2\n"
209
-        "psubw %%mm3, %%mm0\n"
210
-        "psubw %%mm1, %%mm2\n"
211
-        "paddw %%mm0, %%mm2\n"
212
-        "paddw %%mm2, %%mm6\n"
213
-
214
-        "add %2, %0\n"
215
-        "1:\n"
216
-
217
-        "movq (%0), %%mm0\n"
218
-        "movq %%mm0, %%mm1\n"
219
-        "psllq $8, %%mm0\n"
220
-        "psrlq $8, %%mm1\n"
221
-        "psrlq $8, %%mm0\n"
222
-        "movq %%mm0, %%mm2\n"
223
-        "movq %%mm1, %%mm3\n"
224
-        "punpcklbw %%mm7, %%mm0\n"
225
-        "punpcklbw %%mm7, %%mm1\n"
226
-        "punpckhbw %%mm7, %%mm2\n"
227
-        "punpckhbw %%mm7, %%mm3\n"
228
-        "psubw %%mm1, %%mm0\n"
229
-        "psubw %%mm3, %%mm2\n"
230
-        "psubw %%mm0, %%mm4\n"
231
-        "psubw %%mm2, %%mm5\n"
232
-        "pxor  %%mm3, %%mm3\n"
233
-        "pxor  %%mm1, %%mm1\n"
234
-        "pcmpgtw %%mm4, %%mm3\n\t"
235
-        "pcmpgtw %%mm5, %%mm1\n\t"
236
-        "pxor  %%mm3, %%mm4\n"
237
-        "pxor  %%mm1, %%mm5\n"
238
-        "psubw %%mm3, %%mm4\n"
239
-        "psubw %%mm1, %%mm5\n"
240
-        "paddw %%mm4, %%mm5\n"
241
-        "paddw %%mm5, %%mm6\n"
242
-
243
-        "add %2, %0\n"
244
-
245
-        "movq (%0), %%mm4\n"
246
-        "movq      %%mm4, %%mm1\n"
247
-        "psllq $8, %%mm4\n"
248
-        "psrlq $8, %%mm1\n"
249
-        "psrlq $8, %%mm4\n"
250
-        "movq      %%mm4, %%mm5\n"
251
-        "movq      %%mm1, %%mm3\n"
252
-        "punpcklbw %%mm7, %%mm4\n"
253
-        "punpcklbw %%mm7, %%mm1\n"
254
-        "punpckhbw %%mm7, %%mm5\n"
255
-        "punpckhbw %%mm7, %%mm3\n"
256
-        "psubw     %%mm1, %%mm4\n"
257
-        "psubw     %%mm3, %%mm5\n"
258
-        "psubw     %%mm4, %%mm0\n"
259
-        "psubw     %%mm5, %%mm2\n"
260
-        "pxor      %%mm3, %%mm3\n"
261
-        "pxor      %%mm1, %%mm1\n"
262
-        "pcmpgtw   %%mm0, %%mm3\n\t"
263
-        "pcmpgtw   %%mm2, %%mm1\n\t"
264
-        "pxor      %%mm3, %%mm0\n"
265
-        "pxor      %%mm1, %%mm2\n"
266
-        "psubw     %%mm3, %%mm0\n"
267
-        "psubw     %%mm1, %%mm2\n"
268
-        "paddw     %%mm0, %%mm2\n"
269
-        "paddw     %%mm2, %%mm6\n"
270
-
271
-        "add  %2, %0\n"
272
-        "subl $2, %%ecx\n"
273
-        " jnz 1b\n"
274
-
275
-        "movq      %%mm6, %%mm0\n"
276
-        "punpcklwd %%mm7, %%mm0\n"
277
-        "punpckhwd %%mm7, %%mm6\n"
278
-        "paddd     %%mm0, %%mm6\n"
279
-
280
-        "movq  %%mm6, %%mm0\n"
281
-        "psrlq $32,   %%mm6\n"
282
-        "paddd %%mm6, %%mm0\n"
283
-        "movd  %%mm0, %1\n"
284
-        : "+r" (pix1), "=r" (tmp)
285
-        : "r" ((x86_reg) line_size), "g" (h - 2)
286
-        : "%ecx");
287
-
288
-    return tmp;
289
-}
290
-
291
-static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
292
-{
293
-    int tmp;
294
-    uint8_t *pix = pix1;
295
-
296
-    __asm__ volatile (
297
-        "movl %3, %%ecx\n"
298
-        "pxor %%mm7, %%mm7\n"
299
-        "pxor %%mm6, %%mm6\n"
300
-
301
-        "movq (%0), %%mm0\n"
302
-        "movq 1(%0), %%mm1\n"
303
-        "movq %%mm0, %%mm2\n"
304
-        "movq %%mm1, %%mm3\n"
305
-        "punpcklbw %%mm7, %%mm0\n"
306
-        "punpcklbw %%mm7, %%mm1\n"
307
-        "punpckhbw %%mm7, %%mm2\n"
308
-        "punpckhbw %%mm7, %%mm3\n"
309
-        "psubw %%mm1, %%mm0\n"
310
-        "psubw %%mm3, %%mm2\n"
311
-
312
-        "add %2, %0\n"
313
-
314
-        "movq (%0), %%mm4\n"
315
-        "movq 1(%0), %%mm1\n"
316
-        "movq %%mm4, %%mm5\n"
317
-        "movq %%mm1, %%mm3\n"
318
-        "punpcklbw %%mm7, %%mm4\n"
319
-        "punpcklbw %%mm7, %%mm1\n"
320
-        "punpckhbw %%mm7, %%mm5\n"
321
-        "punpckhbw %%mm7, %%mm3\n"
322
-        "psubw %%mm1, %%mm4\n"
323
-        "psubw %%mm3, %%mm5\n"
324
-        "psubw %%mm4, %%mm0\n"
325
-        "psubw %%mm5, %%mm2\n"
326
-        "pxor %%mm3, %%mm3\n"
327
-        "pxor %%mm1, %%mm1\n"
328
-        "pcmpgtw %%mm0, %%mm3\n\t"
329
-        "pcmpgtw %%mm2, %%mm1\n\t"
330
-        "pxor %%mm3, %%mm0\n"
331
-        "pxor %%mm1, %%mm2\n"
332
-        "psubw %%mm3, %%mm0\n"
333
-        "psubw %%mm1, %%mm2\n"
334
-        "paddw %%mm0, %%mm2\n"
335
-        "paddw %%mm2, %%mm6\n"
336
-
337
-        "add %2, %0\n"
338
-        "1:\n"
339
-
340
-        "movq (%0), %%mm0\n"
341
-        "movq 1(%0), %%mm1\n"
342
-        "movq %%mm0, %%mm2\n"
343
-        "movq %%mm1, %%mm3\n"
344
-        "punpcklbw %%mm7, %%mm0\n"
345
-        "punpcklbw %%mm7, %%mm1\n"
346
-        "punpckhbw %%mm7, %%mm2\n"
347
-        "punpckhbw %%mm7, %%mm3\n"
348
-        "psubw %%mm1, %%mm0\n"
349
-        "psubw %%mm3, %%mm2\n"
350
-        "psubw %%mm0, %%mm4\n"
351
-        "psubw %%mm2, %%mm5\n"
352
-        "pxor %%mm3, %%mm3\n"
353
-        "pxor %%mm1, %%mm1\n"
354
-        "pcmpgtw %%mm4, %%mm3\n\t"
355
-        "pcmpgtw %%mm5, %%mm1\n\t"
356
-        "pxor %%mm3, %%mm4\n"
357
-        "pxor %%mm1, %%mm5\n"
358
-        "psubw %%mm3, %%mm4\n"
359
-        "psubw %%mm1, %%mm5\n"
360
-        "paddw %%mm4, %%mm5\n"
361
-        "paddw %%mm5, %%mm6\n"
362
-
363
-        "add %2, %0\n"
364
-
365
-        "movq (%0), %%mm4\n"
366
-        "movq 1(%0), %%mm1\n"
367
-        "movq %%mm4, %%mm5\n"
368
-        "movq %%mm1, %%mm3\n"
369
-        "punpcklbw %%mm7, %%mm4\n"
370
-        "punpcklbw %%mm7, %%mm1\n"
371
-        "punpckhbw %%mm7, %%mm5\n"
372
-        "punpckhbw %%mm7, %%mm3\n"
373
-        "psubw %%mm1, %%mm4\n"
374
-        "psubw %%mm3, %%mm5\n"
375
-        "psubw %%mm4, %%mm0\n"
376
-        "psubw %%mm5, %%mm2\n"
377
-        "pxor %%mm3, %%mm3\n"
378
-        "pxor %%mm1, %%mm1\n"
379
-        "pcmpgtw %%mm0, %%mm3\n\t"
380
-        "pcmpgtw %%mm2, %%mm1\n\t"
381
-        "pxor %%mm3, %%mm0\n"
382
-        "pxor %%mm1, %%mm2\n"
383
-        "psubw %%mm3, %%mm0\n"
384
-        "psubw %%mm1, %%mm2\n"
385
-        "paddw %%mm0, %%mm2\n"
386
-        "paddw %%mm2, %%mm6\n"
387
-
388
-        "add %2, %0\n"
389
-        "subl $2, %%ecx\n"
390
-        " jnz 1b\n"
391
-
392
-        "movq %%mm6, %%mm0\n"
393
-        "punpcklwd %%mm7, %%mm0\n"
394
-        "punpckhwd %%mm7, %%mm6\n"
395
-        "paddd %%mm0, %%mm6\n"
396
-
397
-        "movq %%mm6, %%mm0\n"
398
-        "psrlq $32, %%mm6\n"
399
-        "paddd %%mm6, %%mm0\n"
400
-        "movd %%mm0, %1\n"
401
-        : "+r" (pix1), "=r" (tmp)
402
-        : "r" ((x86_reg) line_size), "g" (h - 2)
403
-        : "%ecx");
404
-
405
-    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
406
-}
407
-
408
-static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
409
-                      int line_size, int h)
410
-{
411
-    int score1, score2;
412
-
413
-    if (c)
414
-        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
415
-    else
416
-        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
417
-    score2 = hf_noise16_mmx(pix1, line_size, h) -
418
-             hf_noise16_mmx(pix2, line_size, h);
419
-
420
-    if (c)
421
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
422
-    else
423
-        return score1 + FFABS(score2) * 8;
424
-}
425
-
426
-static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
427
-                     int line_size, int h)
428
-{
429
-    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
430
-    int score2 = hf_noise8_mmx(pix1, line_size, h) -
431
-                 hf_noise8_mmx(pix2, line_size, h);
432
-
433
-    if (c)
434
-        return score1 + FFABS(score2) * c->avctx->nsse_weight;
435
-    else
436
-        return score1 + FFABS(score2) * 8;
437
-}
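
The nsse ("noise preserving SSE") metrics removed here add a penalty for changing the amount of high-frequency detail: plain SSE plus the absolute change in a noise measure, scaled by avctx->nsse_weight (with 8 as the hard-coded fallback when no context is available). A rough scalar sketch of the idea follows; hf_noise_block is only a simplified stand-in, since the real hf_noise8/16 track how the horizontal differences change from one row to the next:

```c
#include <stdint.h>
#include <stdlib.h>

/* Sum of squared differences over a w x h block. */
static int sse_block(const uint8_t *a, const uint8_t *b, int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            int d = a[y * stride + x] - b[y * stride + x];
            sum += d * d;
        }
    return sum;
}

/* Simplified high-frequency measure: sum of absolute horizontal second differences. */
static int hf_noise_block(const uint8_t *p, int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w - 2; x++)
            sum += abs(p[y * stride + x] - 2 * p[y * stride + x + 1] +
                       p[y * stride + x + 2]);
    return sum;
}

/* NSSE: penalize not only the error but also any change in "noisiness",
 * so the encoder avoids modes that smooth away (or add) texture. */
static int nsse_block(const uint8_t *a, const uint8_t *b, int stride,
                      int w, int h, int nsse_weight)
{
    int sse   = sse_block(a, b, stride, w, h);
    int noise = hf_noise_block(a, stride, w, h) - hf_noise_block(b, stride, w, h);
    return sse + abs(noise) * nsse_weight;
}
```
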
438
-
439
-static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
440
-                            int line_size, int h)
441
-{
442
-    int tmp;
443
-
444
-    assert((((int) pix) & 7) == 0);
445
-    assert((line_size & 7) == 0);
446
-
447
-#define SUM(in0, in1, out0, out1)               \
448
-    "movq (%0), %%mm2\n"                        \
449
-    "movq 8(%0), %%mm3\n"                       \
450
-    "add %2,%0\n"                               \
451
-    "movq %%mm2, " #out0 "\n"                   \
452
-    "movq %%mm3, " #out1 "\n"                   \
453
-    "psubusb " #in0 ", %%mm2\n"                 \
454
-    "psubusb " #in1 ", %%mm3\n"                 \
455
-    "psubusb " #out0 ", " #in0 "\n"             \
456
-    "psubusb " #out1 ", " #in1 "\n"             \
457
-    "por %%mm2, " #in0 "\n"                     \
458
-    "por %%mm3, " #in1 "\n"                     \
459
-    "movq " #in0 ", %%mm2\n"                    \
460
-    "movq " #in1 ", %%mm3\n"                    \
461
-    "punpcklbw %%mm7, " #in0 "\n"               \
462
-    "punpcklbw %%mm7, " #in1 "\n"               \
463
-    "punpckhbw %%mm7, %%mm2\n"                  \
464
-    "punpckhbw %%mm7, %%mm3\n"                  \
465
-    "paddw " #in1 ", " #in0 "\n"                \
466
-    "paddw %%mm3, %%mm2\n"                      \
467
-    "paddw %%mm2, " #in0 "\n"                   \
468
-    "paddw " #in0 ", %%mm6\n"
469
-
470
-
471
-    __asm__ volatile (
472
-        "movl    %3, %%ecx\n"
473
-        "pxor %%mm6, %%mm6\n"
474
-        "pxor %%mm7, %%mm7\n"
475
-        "movq  (%0), %%mm0\n"
476
-        "movq 8(%0), %%mm1\n"
477
-        "add %2, %0\n"
478
-        "jmp 2f\n"
479
-        "1:\n"
480
-
481
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
482
-        "2:\n"
483
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
484
-
485
-        "subl $2, %%ecx\n"
486
-        "jnz 1b\n"
487
-
488
-        "movq  %%mm6, %%mm0\n"
489
-        "psrlq $32,   %%mm6\n"
490
-        "paddw %%mm6, %%mm0\n"
491
-        "movq  %%mm0, %%mm6\n"
492
-        "psrlq $16,   %%mm0\n"
493
-        "paddw %%mm6, %%mm0\n"
494
-        "movd  %%mm0, %1\n"
495
-        : "+r" (pix), "=r" (tmp)
496
-        : "r" ((x86_reg) line_size), "m" (h)
497
-        : "%ecx");
498
-
499
-    return tmp & 0xFFFF;
500
-}
501
-#undef SUM
502
-
503
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
504
-                               int line_size, int h)
505
-{
506
-    int tmp;
507
-
508
-    assert((((int) pix) & 7) == 0);
509
-    assert((line_size & 7) == 0);
510
-
511
-#define SUM(in0, in1, out0, out1)               \
512
-    "movq (%0), " #out0 "\n"                    \
513
-    "movq 8(%0), " #out1 "\n"                   \
514
-    "add %2, %0\n"                              \
515
-    "psadbw " #out0 ", " #in0 "\n"              \
516
-    "psadbw " #out1 ", " #in1 "\n"              \
517
-    "paddw " #in1 ", " #in0 "\n"                \
518
-    "paddw " #in0 ", %%mm6\n"
519
-
520
-    __asm__ volatile (
521
-        "movl %3, %%ecx\n"
522
-        "pxor %%mm6, %%mm6\n"
523
-        "pxor %%mm7, %%mm7\n"
524
-        "movq (%0), %%mm0\n"
525
-        "movq 8(%0), %%mm1\n"
526
-        "add %2, %0\n"
527
-        "jmp 2f\n"
528
-        "1:\n"
529
-
530
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
531
-        "2:\n"
532
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
533
-
534
-        "subl $2, %%ecx\n"
535
-        "jnz 1b\n"
536
-
537
-        "movd %%mm6, %1\n"
538
-        : "+r" (pix), "=r" (tmp)
539
-        : "r" ((x86_reg) line_size), "m" (h)
540
-        : "%ecx");
541
-
542
-    return tmp;
543
-}
544
-#undef SUM
545
-
546
-static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
547
-                      int line_size, int h)
548
-{
549
-    int tmp;
550
-
551
-    assert((((int) pix1) & 7) == 0);
552
-    assert((((int) pix2) & 7) == 0);
553
-    assert((line_size & 7) == 0);
554
-
555
-#define SUM(in0, in1, out0, out1)       \
556
-    "movq (%0), %%mm2\n"                \
557
-    "movq (%1), " #out0 "\n"            \
558
-    "movq 8(%0), %%mm3\n"               \
559
-    "movq 8(%1), " #out1 "\n"           \
560
-    "add %3, %0\n"                      \
561
-    "add %3, %1\n"                      \
562
-    "psubb " #out0 ", %%mm2\n"          \
563
-    "psubb " #out1 ", %%mm3\n"          \
564
-    "pxor %%mm7, %%mm2\n"               \
565
-    "pxor %%mm7, %%mm3\n"               \
566
-    "movq %%mm2, " #out0 "\n"           \
567
-    "movq %%mm3, " #out1 "\n"           \
568
-    "psubusb " #in0 ", %%mm2\n"         \
569
-    "psubusb " #in1 ", %%mm3\n"         \
570
-    "psubusb " #out0 ", " #in0 "\n"     \
571
-    "psubusb " #out1 ", " #in1 "\n"     \
572
-    "por %%mm2, " #in0 "\n"             \
573
-    "por %%mm3, " #in1 "\n"             \
574
-    "movq " #in0 ", %%mm2\n"            \
575
-    "movq " #in1 ", %%mm3\n"            \
576
-    "punpcklbw %%mm7, " #in0 "\n"       \
577
-    "punpcklbw %%mm7, " #in1 "\n"       \
578
-    "punpckhbw %%mm7, %%mm2\n"          \
579
-    "punpckhbw %%mm7, %%mm3\n"          \
580
-    "paddw " #in1 ", " #in0 "\n"        \
581
-    "paddw %%mm3, %%mm2\n"              \
582
-    "paddw %%mm2, " #in0 "\n"           \
583
-    "paddw " #in0 ", %%mm6\n"
584
-
585
-
586
-    __asm__ volatile (
587
-        "movl %4, %%ecx\n"
588
-        "pxor %%mm6, %%mm6\n"
589
-        "pcmpeqw %%mm7, %%mm7\n"
590
-        "psllw $15, %%mm7\n"
591
-        "packsswb %%mm7, %%mm7\n"
592
-        "movq (%0), %%mm0\n"
593
-        "movq (%1), %%mm2\n"
594
-        "movq 8(%0), %%mm1\n"
595
-        "movq 8(%1), %%mm3\n"
596
-        "add %3, %0\n"
597
-        "add %3, %1\n"
598
-        "psubb %%mm2, %%mm0\n"
599
-        "psubb %%mm3, %%mm1\n"
600
-        "pxor %%mm7, %%mm0\n"
601
-        "pxor %%mm7, %%mm1\n"
602
-        "jmp 2f\n"
603
-        "1:\n"
604
-
605
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
606
-        "2:\n"
607
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
608
-
609
-        "subl $2, %%ecx\n"
610
-        "jnz 1b\n"
611
-
612
-        "movq %%mm6, %%mm0\n"
613
-        "psrlq $32, %%mm6\n"
614
-        "paddw %%mm6, %%mm0\n"
615
-        "movq %%mm0, %%mm6\n"
616
-        "psrlq $16, %%mm0\n"
617
-        "paddw %%mm6, %%mm0\n"
618
-        "movd %%mm0, %2\n"
619
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
620
-        : "r" ((x86_reg) line_size), "m" (h)
621
-        : "%ecx");
622
-
623
-    return tmp & 0x7FFF;
624
-}
625
-#undef SUM
626
-
627
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
628
-                         int line_size, int h)
629
-{
630
-    int tmp;
631
-
632
-    assert((((int) pix1) & 7) == 0);
633
-    assert((((int) pix2) & 7) == 0);
634
-    assert((line_size & 7) == 0);
635
-
636
-#define SUM(in0, in1, out0, out1)               \
637
-    "movq (%0), " #out0 "\n"                    \
638
-    "movq (%1), %%mm2\n"                        \
639
-    "movq 8(%0), " #out1 "\n"                   \
640
-    "movq 8(%1), %%mm3\n"                       \
641
-    "add %3, %0\n"                              \
642
-    "add %3, %1\n"                              \
643
-    "psubb %%mm2, " #out0 "\n"                  \
644
-    "psubb %%mm3, " #out1 "\n"                  \
645
-    "pxor %%mm7, " #out0 "\n"                   \
646
-    "pxor %%mm7, " #out1 "\n"                   \
647
-    "psadbw " #out0 ", " #in0 "\n"              \
648
-    "psadbw " #out1 ", " #in1 "\n"              \
649
-    "paddw " #in1 ", " #in0 "\n"                \
650
-    "paddw " #in0 ", %%mm6\n    "
651
-
652
-    __asm__ volatile (
653
-        "movl %4, %%ecx\n"
654
-        "pxor %%mm6, %%mm6\n"
655
-        "pcmpeqw %%mm7, %%mm7\n"
656
-        "psllw $15, %%mm7\n"
657
-        "packsswb %%mm7, %%mm7\n"
658
-        "movq (%0), %%mm0\n"
659
-        "movq (%1), %%mm2\n"
660
-        "movq 8(%0), %%mm1\n"
661
-        "movq 8(%1), %%mm3\n"
662
-        "add %3, %0\n"
663
-        "add %3, %1\n"
664
-        "psubb %%mm2, %%mm0\n"
665
-        "psubb %%mm3, %%mm1\n"
666
-        "pxor %%mm7, %%mm0\n"
667
-        "pxor %%mm7, %%mm1\n"
668
-        "jmp 2f\n"
669
-        "1:\n"
670
-
671
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
672
-        "2:\n"
673
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
674
-
675
-        "subl $2, %%ecx\n"
676
-        "jnz 1b\n"
677
-
678
-        "movd %%mm6, %2\n"
679
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
680
-        : "r" ((x86_reg) line_size), "m" (h)
681
-        : "%ecx");
682
-
683
-    return tmp;
684
-}
685
-#undef SUM
686
-
687
-#define MMABS_MMX(a,z)                          \
688
-    "pxor "    #z ", " #z "             \n\t"   \
689
-    "pcmpgtw " #a ", " #z "             \n\t"   \
690
-    "pxor "    #z ", " #a "             \n\t"   \
691
-    "psubw "   #z ", " #a "             \n\t"
692
-
693
-#define MMABS_MMXEXT(a, z)                      \
694
-    "pxor "    #z ", " #z "             \n\t"   \
695
-    "psubw "   #a ", " #z "             \n\t"   \
696
-    "pmaxsw "  #z ", " #a "             \n\t"
697
-
698
-#define MMABS_SSSE3(a,z)                        \
699
-    "pabsw "   #a ", " #a "             \n\t"
700
-
701
-#define MMABS_SUM(a,z, sum)                     \
702
-    MMABS(a,z)                                  \
703
-    "paddusw " #a ", " #sum "           \n\t"
704
-
705
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
706
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
707
- * natural video, and it's even more unlikely to not have any alternative
708
- * mvs/modes with lower cost. */
709
-#define HSUM_MMX(a, t, dst)                     \
710
-    "movq    " #a ", " #t "             \n\t"   \
711
-    "psrlq      $32, " #a "             \n\t"   \
712
-    "paddusw " #t ", " #a "             \n\t"   \
713
-    "movq    " #a ", " #t "             \n\t"   \
714
-    "psrlq      $16, " #a "             \n\t"   \
715
-    "paddusw " #t ", " #a "             \n\t"   \
716
-    "movd    " #a ", " #dst "           \n\t"   \
717
-
718
-#define HSUM_MMXEXT(a, t, dst)                  \
719
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
720
-    "paddusw " #t ", " #a "             \n\t"   \
721
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
722
-    "paddusw " #t ", " #a "             \n\t"   \
723
-    "movd    " #a ", " #dst "           \n\t"   \
724
-
725
-#define HSUM_SSE2(a, t, dst)                    \
726
-    "movhlps " #a ", " #t "             \n\t"   \
727
-    "paddusw " #t ", " #a "             \n\t"   \
728
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
729
-    "paddusw " #t ", " #a "             \n\t"   \
730
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
731
-    "paddusw " #t ", " #a "             \n\t"   \
732
-    "movd    " #a ", " #dst "           \n\t"   \
733
-
734
-#define DCT_SAD4(m, mm, o)                      \
735
-    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
736
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
737
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
738
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
739
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
740
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
741
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
742
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
743
-
744
-#define DCT_SAD_MMX                             \
745
-    "pxor    %%mm0, %%mm0               \n\t"   \
746
-    "pxor    %%mm1, %%mm1               \n\t"   \
747
-    DCT_SAD4(q, %%mm, 0)                        \
748
-    DCT_SAD4(q, %%mm, 8)                        \
749
-    DCT_SAD4(q, %%mm, 64)                       \
750
-    DCT_SAD4(q, %%mm, 72)                       \
751
-    "paddusw %%mm1, %%mm0               \n\t"   \
752
-    HSUM(%%mm0, %%mm1, %0)
753
-
754
-#define DCT_SAD_SSE2                            \
755
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
756
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
757
-    DCT_SAD4(dqa, %%xmm, 0)                     \
758
-    DCT_SAD4(dqa, %%xmm, 64)                    \
759
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
760
-    HSUM(%%xmm0, %%xmm1, %0)
761
-
762
-#define DCT_SAD_FUNC(cpu)                           \
763
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
764
-{                                                   \
765
-    int sum;                                        \
766
-    __asm__ volatile (                              \
767
-        DCT_SAD                                     \
768
-        :"=r"(sum)                                  \
769
-        :"r"(block));                               \
770
-    return sum & 0xFFFF;                            \
771
-}
772
-
773
-#define DCT_SAD         DCT_SAD_MMX
774
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
775
-#define MMABS(a, z)     MMABS_MMX(a, z)
776
-DCT_SAD_FUNC(mmx)
777
-#undef MMABS
778
-#undef HSUM
779
-
780
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
781
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
782
-DCT_SAD_FUNC(mmxext)
783
-#undef HSUM
784
-#undef DCT_SAD
785
-
786
-#define DCT_SAD         DCT_SAD_SSE2
787
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
788
-DCT_SAD_FUNC(sse2)
789
-#undef MMABS
790
-
791
-#if HAVE_SSSE3_INLINE
792
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
793
-DCT_SAD_FUNC(ssse3)
794
-#undef MMABS
795
-#endif
796
-#undef HSUM
797
-#undef DCT_SAD
798
-
799
-
800
-DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
801
-    0x0000000000000000ULL,
802
-    0x0001000100010001ULL,
803
-    0x0002000200020002ULL,
804
-};
805
-
806
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
807
-
808
-static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
809
-{
810
-    x86_reg len = -(stride * h);
811
-    __asm__ volatile (
812
-        ".p2align 4                     \n\t"
813
-        "1:                             \n\t"
814
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
815
-        "movq (%2, %%"REG_a"), %%mm2    \n\t"
816
-        "movq (%2, %%"REG_a"), %%mm4    \n\t"
817
-        "add %3, %%"REG_a"              \n\t"
818
-        "psubusb %%mm0, %%mm2           \n\t"
819
-        "psubusb %%mm4, %%mm0           \n\t"
820
-        "movq (%1, %%"REG_a"), %%mm1    \n\t"
821
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
822
-        "movq (%2, %%"REG_a"), %%mm5    \n\t"
823
-        "psubusb %%mm1, %%mm3           \n\t"
824
-        "psubusb %%mm5, %%mm1           \n\t"
825
-        "por %%mm2, %%mm0               \n\t"
826
-        "por %%mm1, %%mm3               \n\t"
827
-        "movq %%mm0, %%mm1              \n\t"
828
-        "movq %%mm3, %%mm2              \n\t"
829
-        "punpcklbw %%mm7, %%mm0         \n\t"
830
-        "punpckhbw %%mm7, %%mm1         \n\t"
831
-        "punpcklbw %%mm7, %%mm3         \n\t"
832
-        "punpckhbw %%mm7, %%mm2         \n\t"
833
-        "paddw %%mm1, %%mm0             \n\t"
834
-        "paddw %%mm3, %%mm2             \n\t"
835
-        "paddw %%mm2, %%mm0             \n\t"
836
-        "paddw %%mm0, %%mm6             \n\t"
837
-        "add %3, %%"REG_a"              \n\t"
838
-        " js 1b                         \n\t"
839
-        : "+a" (len)
840
-        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
841
-}
842
-
843
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
844
-                                 int stride, int h)
845
-{
846
-    __asm__ volatile (
847
-        ".p2align 4                     \n\t"
848
-        "1:                             \n\t"
849
-        "movq (%1), %%mm0               \n\t"
850
-        "movq (%1, %3), %%mm1           \n\t"
851
-        "psadbw (%2), %%mm0             \n\t"
852
-        "psadbw (%2, %3), %%mm1         \n\t"
853
-        "paddw %%mm0, %%mm6             \n\t"
854
-        "paddw %%mm1, %%mm6             \n\t"
855
-        "lea (%1,%3,2), %1              \n\t"
856
-        "lea (%2,%3,2), %2              \n\t"
857
-        "sub $2, %0                     \n\t"
858
-        " jg 1b                         \n\t"
859
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
860
-        : "r" ((x86_reg) stride));
861
-}
862
-
863
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
864
-                      int stride, int h)
865
-{
866
-    int ret;
867
-    __asm__ volatile (
868
-        "pxor %%xmm2, %%xmm2            \n\t"
869
-        ".p2align 4                     \n\t"
870
-        "1:                             \n\t"
871
-        "movdqu (%1), %%xmm0            \n\t"
872
-        "movdqu (%1, %4), %%xmm1        \n\t"
873
-        "psadbw (%2), %%xmm0            \n\t"
874
-        "psadbw (%2, %4), %%xmm1        \n\t"
875
-        "paddw %%xmm0, %%xmm2           \n\t"
876
-        "paddw %%xmm1, %%xmm2           \n\t"
877
-        "lea (%1,%4,2), %1              \n\t"
878
-        "lea (%2,%4,2), %2              \n\t"
879
-        "sub $2, %0                     \n\t"
880
-        " jg 1b                         \n\t"
881
-        "movhlps %%xmm2, %%xmm0         \n\t"
882
-        "paddw   %%xmm0, %%xmm2         \n\t"
883
-        "movd    %%xmm2, %3             \n\t"
884
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
885
-        : "r" ((x86_reg) stride));
886
-    return ret;
887
-}
888
-
889
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
890
-                                   int stride, int h)
891
-{
892
-    __asm__ volatile (
893
-        ".p2align 4                     \n\t"
894
-        "1:                             \n\t"
895
-        "movq (%1), %%mm0               \n\t"
896
-        "movq (%1, %3), %%mm1           \n\t"
897
-        "pavgb 1(%1), %%mm0             \n\t"
898
-        "pavgb 1(%1, %3), %%mm1         \n\t"
899
-        "psadbw (%2), %%mm0             \n\t"
900
-        "psadbw (%2, %3), %%mm1         \n\t"
901
-        "paddw %%mm0, %%mm6             \n\t"
902
-        "paddw %%mm1, %%mm6             \n\t"
903
-        "lea (%1,%3,2), %1              \n\t"
904
-        "lea (%2,%3,2), %2              \n\t"
905
-        "sub $2, %0                     \n\t"
906
-        " jg 1b                         \n\t"
907
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
908
-        : "r" ((x86_reg) stride));
909
-}
910
-
911
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
912
-                                   int stride, int h)
913
-{
914
-    __asm__ volatile (
915
-        "movq (%1), %%mm0               \n\t"
916
-        "add %3, %1                     \n\t"
917
-        ".p2align 4                     \n\t"
918
-        "1:                             \n\t"
919
-        "movq (%1), %%mm1               \n\t"
920
-        "movq (%1, %3), %%mm2           \n\t"
921
-        "pavgb %%mm1, %%mm0             \n\t"
922
-        "pavgb %%mm2, %%mm1             \n\t"
923
-        "psadbw (%2), %%mm0             \n\t"
924
-        "psadbw (%2, %3), %%mm1         \n\t"
925
-        "paddw %%mm0, %%mm6             \n\t"
926
-        "paddw %%mm1, %%mm6             \n\t"
927
-        "movq %%mm2, %%mm0              \n\t"
928
-        "lea (%1,%3,2), %1              \n\t"
929
-        "lea (%2,%3,2), %2              \n\t"
930
-        "sub $2, %0                     \n\t"
931
-        " jg 1b                         \n\t"
932
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
933
-        : "r" ((x86_reg) stride));
934
-}
935
-
936
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
937
-                                 int stride, int h)
938
-{
939
-    __asm__ volatile (
940
-        "movq "MANGLE(bone)", %%mm5     \n\t"
941
-        "movq (%1), %%mm0               \n\t"
942
-        "pavgb 1(%1), %%mm0             \n\t"
943
-        "add %3, %1                     \n\t"
944
-        ".p2align 4                     \n\t"
945
-        "1:                             \n\t"
946
-        "movq (%1), %%mm1               \n\t"
947
-        "movq (%1,%3), %%mm2            \n\t"
948
-        "pavgb 1(%1), %%mm1             \n\t"
949
-        "pavgb 1(%1,%3), %%mm2          \n\t"
950
-        "psubusb %%mm5, %%mm1           \n\t"
951
-        "pavgb %%mm1, %%mm0             \n\t"
952
-        "pavgb %%mm2, %%mm1             \n\t"
953
-        "psadbw (%2), %%mm0             \n\t"
954
-        "psadbw (%2,%3), %%mm1          \n\t"
955
-        "paddw %%mm0, %%mm6             \n\t"
956
-        "paddw %%mm1, %%mm6             \n\t"
957
-        "movq %%mm2, %%mm0              \n\t"
958
-        "lea (%1,%3,2), %1              \n\t"
959
-        "lea (%2,%3,2), %2              \n\t"
960
-        "sub $2, %0                     \n\t"
961
-        " jg 1b                         \n\t"
962
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
963
-        : "r" ((x86_reg) stride));
964
-}
965
-
966
-static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
967
-                              int stride, int h)
968
-{
969
-    x86_reg len = -(stride * h);
970
-    __asm__ volatile (
971
-        ".p2align 4                     \n\t"
972
-        "1:                             \n\t"
973
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
974
-        "movq (%2, %%"REG_a"), %%mm1    \n\t"
975
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
976
-        "movq (%2, %%"REG_a"), %%mm3    \n\t"
977
-        "punpcklbw %%mm7, %%mm0         \n\t"
978
-        "punpcklbw %%mm7, %%mm1         \n\t"
979
-        "punpckhbw %%mm7, %%mm2         \n\t"
980
-        "punpckhbw %%mm7, %%mm3         \n\t"
981
-        "paddw %%mm0, %%mm1             \n\t"
982
-        "paddw %%mm2, %%mm3             \n\t"
983
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
984
-        "movq (%3, %%"REG_a"), %%mm2    \n\t"
985
-        "paddw %%mm5, %%mm1             \n\t"
986
-        "paddw %%mm5, %%mm3             \n\t"
987
-        "psrlw $1, %%mm1                \n\t"
988
-        "psrlw $1, %%mm3                \n\t"
989
-        "packuswb %%mm3, %%mm1          \n\t"
990
-        "psubusb %%mm1, %%mm4           \n\t"
991
-        "psubusb %%mm2, %%mm1           \n\t"
992
-        "por %%mm4, %%mm1               \n\t"
993
-        "movq %%mm1, %%mm0              \n\t"
994
-        "punpcklbw %%mm7, %%mm0         \n\t"
995
-        "punpckhbw %%mm7, %%mm1         \n\t"
996
-        "paddw %%mm1, %%mm0             \n\t"
997
-        "paddw %%mm0, %%mm6             \n\t"
998
-        "add %4, %%"REG_a"              \n\t"
999
-        " js 1b                         \n\t"
1000
-        : "+a" (len)
1001
-        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
1002
-          "r" ((x86_reg) stride));
1003
-}
1004
-
1005
-static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1006
-{
1007
-    x86_reg len = -(stride * h);
1008
-    __asm__ volatile (
1009
-        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
1010
-        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
1011
-        "movq %%mm0, %%mm1              \n\t"
1012
-        "movq %%mm2, %%mm3              \n\t"
1013
-        "punpcklbw %%mm7, %%mm0         \n\t"
1014
-        "punpckhbw %%mm7, %%mm1         \n\t"
1015
-        "punpcklbw %%mm7, %%mm2         \n\t"
1016
-        "punpckhbw %%mm7, %%mm3         \n\t"
1017
-        "paddw %%mm2, %%mm0             \n\t"
1018
-        "paddw %%mm3, %%mm1             \n\t"
1019
-        ".p2align 4                     \n\t"
1020
-        "1:                             \n\t"
1021
-        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
1022
-        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
1023
-        "movq %%mm2, %%mm3              \n\t"
1024
-        "movq %%mm4, %%mm5              \n\t"
1025
-        "punpcklbw %%mm7, %%mm2         \n\t"
1026
-        "punpckhbw %%mm7, %%mm3         \n\t"
1027
-        "punpcklbw %%mm7, %%mm4         \n\t"
1028
-        "punpckhbw %%mm7, %%mm5         \n\t"
1029
-        "paddw %%mm4, %%mm2             \n\t"
1030
-        "paddw %%mm5, %%mm3             \n\t"
1031
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
1032
-        "paddw %%mm2, %%mm0             \n\t"
1033
-        "paddw %%mm3, %%mm1             \n\t"
1034
-        "paddw %%mm5, %%mm0             \n\t"
1035
-        "paddw %%mm5, %%mm1             \n\t"
1036
-        "movq (%3, %%"REG_a"), %%mm4    \n\t"
1037
-        "movq (%3, %%"REG_a"), %%mm5    \n\t"
1038
-        "psrlw $2, %%mm0                \n\t"
1039
-        "psrlw $2, %%mm1                \n\t"
1040
-        "packuswb %%mm1, %%mm0          \n\t"
1041
-        "psubusb %%mm0, %%mm4           \n\t"
1042
-        "psubusb %%mm5, %%mm0           \n\t"
1043
-        "por %%mm4, %%mm0               \n\t"
1044
-        "movq %%mm0, %%mm4              \n\t"
1045
-        "punpcklbw %%mm7, %%mm0         \n\t"
1046
-        "punpckhbw %%mm7, %%mm4         \n\t"
1047
-        "paddw %%mm0, %%mm6             \n\t"
1048
-        "paddw %%mm4, %%mm6             \n\t"
1049
-        "movq  %%mm2, %%mm0             \n\t"
1050
-        "movq  %%mm3, %%mm1             \n\t"
1051
-        "add %4, %%"REG_a"              \n\t"
1052
-        " js 1b                         \n\t"
1053
-        : "+a" (len)
1054
-        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
1055
-          "r" ((x86_reg) stride));
1056
-}
1057
-
1058
-static inline int sum_mmx(void)
1059
-{
1060
-    int ret;
1061
-    __asm__ volatile (
1062
-        "movq %%mm6, %%mm0              \n\t"
1063
-        "psrlq $32, %%mm6               \n\t"
1064
-        "paddw %%mm0, %%mm6             \n\t"
1065
-        "movq %%mm6, %%mm0              \n\t"
1066
-        "psrlq $16, %%mm6               \n\t"
1067
-        "paddw %%mm0, %%mm6             \n\t"
1068
-        "movd %%mm6, %0                 \n\t"
1069
-        : "=r" (ret));
1070
-    return ret & 0xFFFF;
1071
-}
1072
-
1073
-static inline int sum_mmxext(void)
1074
-{
1075
-    int ret;
1076
-    __asm__ volatile (
1077
-        "movd %%mm6, %0                 \n\t"
1078
-        : "=r" (ret));
1079
-    return ret;
1080
-}
1081
-
1082
-static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1083
-{
1084
-    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
1085
-}
1086
-
1087
-static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1088
-{
1089
-    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
1090
-}
1091
-
1092
-#define PIX_SAD(suf)                                                    \
1093
-static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
1094
-                        uint8_t *blk1, int stride, int h)               \
1095
-{                                                                       \
1096
-    assert(h == 8);                                                     \
1097
-    __asm__ volatile (                                                  \
1098
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1099
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1100
-        :);                                                             \
1101
-                                                                        \
1102
-    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
1103
-                                                                        \
1104
-    return sum_ ## suf();                                               \
1105
-}                                                                       \
1106
-                                                                        \
1107
-static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
1108
-                           uint8_t *blk1, int stride, int h)            \
1109
-{                                                                       \
1110
-    assert(h == 8);                                                     \
1111
-    __asm__ volatile (                                                  \
1112
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1113
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1114
-        "movq %0, %%mm5        \n\t"                                    \
1115
-        :: "m" (round_tab[1]));                                         \
1116
-                                                                        \
1117
-    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
1118
-                                                                        \
1119
-    return sum_ ## suf();                                               \
1120
-}                                                                       \
1121
-                                                                        \
1122
-static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
1123
-                           uint8_t *blk1, int stride, int h)            \
1124
-{                                                                       \
1125
-    assert(h == 8);                                                     \
1126
-    __asm__ volatile (                                                  \
1127
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1128
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1129
-        "movq %0, %%mm5        \n\t"                                    \
1130
-        :: "m" (round_tab[1]));                                         \
1131
-                                                                        \
1132
-    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
1133
-                                                                        \
1134
-    return sum_ ## suf();                                               \
1135
-}                                                                       \
1136
-                                                                        \
1137
-static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1138
-                            uint8_t *blk1, int stride, int h)           \
1139
-{                                                                       \
1140
-    assert(h == 8);                                                     \
1141
-    __asm__ volatile (                                                  \
1142
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1143
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1144
-        ::);                                                            \
1145
-                                                                        \
1146
-    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
1147
-                                                                        \
1148
-    return sum_ ## suf();                                               \
1149
-}                                                                       \
1150
-                                                                        \
1151
-static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
1152
-                         uint8_t *blk1, int stride, int h)              \
1153
-{                                                                       \
1154
-    __asm__ volatile (                                                  \
1155
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1156
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1157
-        :);                                                             \
1158
-                                                                        \
1159
-    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
1160
-    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
1161
-                                                                        \
1162
-    return sum_ ## suf();                                               \
1163
-}                                                                       \
1164
-                                                                        \
1165
-static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1166
-                            uint8_t *blk1, int stride, int h)           \
1167
-{                                                                       \
1168
-    __asm__ volatile (                                                  \
1169
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1170
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1171
-        "movq %0, %%mm5        \n\t"                                    \
1172
-        :: "m" (round_tab[1]));                                         \
1173
-                                                                        \
1174
-    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
1175
-    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
1176
-                                                                        \
1177
-    return sum_ ## suf();                                               \
1178
-}                                                                       \
1179
-                                                                        \
1180
-static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1181
-                            uint8_t *blk1, int stride, int h)           \
1182
-{                                                                       \
1183
-    __asm__ volatile (                                                  \
1184
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1185
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1186
-        "movq %0, %%mm5        \n\t"                                    \
1187
-        :: "m" (round_tab[1]));                                         \
1188
-                                                                        \
1189
-    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
1190
-    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
1191
-                                                                        \
1192
-    return sum_ ## suf();                                               \
1193
-}                                                                       \
1194
-                                                                        \
1195
-static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
1196
-                             uint8_t *blk1, int stride, int h)          \
1197
-{                                                                       \
1198
-    __asm__ volatile (                                                  \
1199
-        "pxor %%mm7, %%mm7     \n\t"                                    \
1200
-        "pxor %%mm6, %%mm6     \n\t"                                    \
1201
-        ::);                                                            \
1202
-                                                                        \
1203
-    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
1204
-    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
1205
-                                                                        \
1206
-    return sum_ ## suf();                                               \
1207
-}                                                                       \
1208
-
1209
-PIX_SAD(mmx)
1210
-PIX_SAD(mmxext)
1211
-
1212
-#endif /* HAVE_INLINE_ASM */
1213
-
1214
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1215
-                  int line_size, int h);
1216
-
1217
-#define hadamard_func(cpu)                                              \
1218
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
1219
-                                  uint8_t *src2, int stride, int h);    \
1220
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
1221
-                                    uint8_t *src2, int stride, int h);
1222
-
1223
-hadamard_func(mmx)
1224
-hadamard_func(mmxext)
1225
-hadamard_func(sse2)
1226
-hadamard_func(ssse3)
1227
-
1228
-av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
1229
-{
1230
-    int cpu_flags = av_get_cpu_flags();
1231
-
1232
-#if HAVE_INLINE_ASM
1233
-    if (INLINE_MMX(cpu_flags)) {
1234
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
1235
-
1236
-        c->pix_abs[0][0] = sad16_mmx;
1237
-        c->pix_abs[0][1] = sad16_x2_mmx;
1238
-        c->pix_abs[0][2] = sad16_y2_mmx;
1239
-        c->pix_abs[0][3] = sad16_xy2_mmx;
1240
-        c->pix_abs[1][0] = sad8_mmx;
1241
-        c->pix_abs[1][1] = sad8_x2_mmx;
1242
-        c->pix_abs[1][2] = sad8_y2_mmx;
1243
-        c->pix_abs[1][3] = sad8_xy2_mmx;
1244
-
1245
-        c->sad[0] = sad16_mmx;
1246
-        c->sad[1] = sad8_mmx;
1247
-
1248
-        c->sse[0]  = sse16_mmx;
1249
-        c->sse[1]  = sse8_mmx;
1250
-        c->vsad[4] = vsad_intra16_mmx;
1251
-
1252
-        c->nsse[0] = nsse16_mmx;
1253
-        c->nsse[1] = nsse8_mmx;
1254
-
1255
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1256
-            c->vsad[0] = vsad16_mmx;
1257
-        }
1258
-    }
1259
-
1260
-    if (INLINE_MMXEXT(cpu_flags)) {
1261
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
1262
-
1263
-        c->vsad[4] = vsad_intra16_mmxext;
1264
-
1265
-        c->pix_abs[0][0] = sad16_mmxext;
1266
-        c->pix_abs[1][0] = sad8_mmxext;
1267
-
1268
-        c->sad[0] = sad16_mmxext;
1269
-        c->sad[1] = sad8_mmxext;
1270
-
1271
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1272
-            c->pix_abs[0][1] = sad16_x2_mmxext;
1273
-            c->pix_abs[0][2] = sad16_y2_mmxext;
1274
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
1275
-            c->pix_abs[1][1] = sad8_x2_mmxext;
1276
-            c->pix_abs[1][2] = sad8_y2_mmxext;
1277
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
1278
-
1279
-            c->vsad[0] = vsad16_mmxext;
1280
-        }
1281
-    }
1282
-
1283
-    if (INLINE_SSE2(cpu_flags)) {
1284
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
1285
-    }
1286
-
1287
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
1288
-        c->sad[0] = sad16_sse2;
1289
-    }
1290
-
1291
-#if HAVE_SSSE3_INLINE
1292
-    if (INLINE_SSSE3(cpu_flags)) {
1293
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
1294
-    }
1295
-#endif
1296
-#endif /* HAVE_INLINE_ASM */
1297
-
1298
-    if (EXTERNAL_MMX(cpu_flags)) {
1299
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
1300
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
1301
-    }
1302
-
1303
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
1304
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
1305
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
1306
-    }
1307
-
1308
-    if (EXTERNAL_SSE2(cpu_flags)) {
1309
-        c->sse[0] = ff_sse16_sse2;
1310
-
1311
-#if HAVE_ALIGNED_STACK
1312
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
1313
-        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
1314
-#endif
1315
-    }
1316
-
1317
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
1318
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
1319
-        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
1320
-    }
1321
-}
1322 1
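For orientation, the tables filled in above are what the encoder's motion search calls through: the first pix_abs index picks the block size (0 = 16x16, 1 = 8x8), the second the half-pel variant (0 = full-pel, 1 = +x, 2 = +y, 3 = +xy). A hypothetical caller-side sketch, not part of the patch (block_sad_cost and its parameters are illustrative; the MECmpContext introduced by this commit keeps the same table shape):

    /* Illustrative only: evaluate one motion-vector candidate through the
     * comparator table populated by the init code above. */
    static int block_sad_cost(MpegEncContext *s, DSPContext *c,
                              uint8_t *cur, uint8_t *ref, int stride,
                              int use_8x8, int hpel_x, int hpel_y)
    {
        int size_idx = use_8x8 ? 1 : 0;                     /* 16x16 or 8x8   */
        int hpel_idx = (hpel_x ? 1 : 0) | (hpel_y ? 2 : 0); /* 0, +x, +y, +xy */

        return c->pix_abs[size_idx][hpel_idx](s, cur, ref, stride,
                                              use_8x8 ? 8 : 16);
    }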
deleted file mode 100644
... ...
@@ -1,336 +0,0 @@
1
-;*****************************************************************************
2
-;* MMX optimized DSP utils
3
-;*****************************************************************************
4
-;* Copyright (c) 2000, 2001 Fabrice Bellard
5
-;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6
-;*
7
-;* This file is part of Libav.
8
-;*
9
-;* Libav is free software; you can redistribute it and/or
10
-;* modify it under the terms of the GNU Lesser General Public
11
-;* License as published by the Free Software Foundation; either
12
-;* version 2.1 of the License, or (at your option) any later version.
13
-;*
14
-;* Libav is distributed in the hope that it will be useful,
15
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
-;* Lesser General Public License for more details.
18
-;*
19
-;* You should have received a copy of the GNU Lesser General Public
20
-;* License along with Libav; if not, write to the Free Software
21
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
-;*****************************************************************************
23
-
24
-%include "libavutil/x86/x86util.asm"
25
-
26
-SECTION .text
27
-
28
-%macro DIFF_PIXELS_1 4
29
-    movh            %1, %3
30
-    movh            %2, %4
31
-    punpcklbw       %2, %1
32
-    punpcklbw       %1, %1
33
-    psubw           %1, %2
34
-%endmacro
35
-
36
-; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
37
-; %6=temporary storage location
38
-; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
39
-%macro DIFF_PIXELS_8 6
40
-    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
41
-    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
42
-    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
43
-    add             %1, %5
44
-    add             %2, %5
45
-    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
46
-    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
47
-    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
48
-    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
49
-%ifdef m8
50
-    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
51
-%else
52
-    mova          [%6], m0
53
-    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
54
-    mova            m0, [%6]
55
-%endif
56
-    sub             %1, %5
57
-    sub             %2, %5
58
-%endmacro
59
-
60
-%macro HADAMARD8 0
61
-    SUMSUB_BADC       w, 0, 1, 2, 3
62
-    SUMSUB_BADC       w, 4, 5, 6, 7
63
-    SUMSUB_BADC       w, 0, 2, 1, 3
64
-    SUMSUB_BADC       w, 4, 6, 5, 7
65
-    SUMSUB_BADC       w, 0, 4, 1, 5
66
-    SUMSUB_BADC       w, 2, 6, 3, 7
67
-%endmacro
68
-
69
-%macro ABS1_SUM 3
70
-    ABS1            %1, %2
71
-    paddusw         %3, %1
72
-%endmacro
73
-
74
-%macro ABS2_SUM 6
75
-    ABS2            %1, %2, %3, %4
76
-    paddusw         %5, %1
77
-    paddusw         %6, %2
78
-%endmacro
79
-
80
-%macro ABS_SUM_8x8_64 1
81
-    ABS2            m0, m1, m8, m9
82
-    ABS2_SUM        m2, m3, m8, m9, m0, m1
83
-    ABS2_SUM        m4, m5, m8, m9, m0, m1
84
-    ABS2_SUM        m6, m7, m8, m9, m0, m1
85
-    paddusw         m0, m1
86
-%endmacro
87
-
88
-%macro ABS_SUM_8x8_32 1
89
-    mova          [%1], m7
90
-    ABS1            m0, m7
91
-    ABS1            m1, m7
92
-    ABS1_SUM        m2, m7, m0
93
-    ABS1_SUM        m3, m7, m1
94
-    ABS1_SUM        m4, m7, m0
95
-    ABS1_SUM        m5, m7, m1
96
-    ABS1_SUM        m6, m7, m0
97
-    mova            m2, [%1]
98
-    ABS1_SUM        m2, m7, m1
99
-    paddusw         m0, m1
100
-%endmacro
101
-
102
-; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
103
-; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
104
-; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
105
-%macro HSUM 3
106
-%if cpuflag(sse2)
107
-    movhlps         %2, %1
108
-    paddusw         %1, %2
109
-    pshuflw         %2, %1, 0xE
110
-    paddusw         %1, %2
111
-    pshuflw         %2, %1, 0x1
112
-    paddusw         %1, %2
113
-    movd            %3, %1
114
-%elif cpuflag(mmxext)
115
-    pshufw          %2, %1, 0xE
116
-    paddusw         %1, %2
117
-    pshufw          %2, %1, 0x1
118
-    paddusw         %1, %2
119
-    movd            %3, %1
120
-%elif cpuflag(mmx)
121
-    mova            %2, %1
122
-    psrlq           %1, 32
123
-    paddusw         %1, %2
124
-    mova            %2, %1
125
-    psrlq           %1, 16
126
-    paddusw         %1, %2
127
-    movd            %3, %1
128
-%endif
129
-%endmacro
130
-
131
-%macro STORE4 5
132
-    mova [%1+mmsize*0], %2
133
-    mova [%1+mmsize*1], %3
134
-    mova [%1+mmsize*2], %4
135
-    mova [%1+mmsize*3], %5
136
-%endmacro
137
-
138
-%macro LOAD4 5
139
-    mova            %2, [%1+mmsize*0]
140
-    mova            %3, [%1+mmsize*1]
141
-    mova            %4, [%1+mmsize*2]
142
-    mova            %5, [%1+mmsize*3]
143
-%endmacro
144
-
145
-%macro hadamard8_16_wrapper 2
146
-cglobal hadamard8_diff, 4, 4, %1
147
-%ifndef m8
148
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
149
-    SUB            rsp, pad
150
-%endif
151
-    call hadamard8x8_diff %+ SUFFIX
152
-%ifndef m8
153
-    ADD            rsp, pad
154
-%endif
155
-    RET
156
-
157
-cglobal hadamard8_diff16, 5, 6, %1
158
-%ifndef m8
159
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
160
-    SUB            rsp, pad
161
-%endif
162
-
163
-    call hadamard8x8_diff %+ SUFFIX
164
-    mov            r5d, eax
165
-
166
-    add             r1, 8
167
-    add             r2, 8
168
-    call hadamard8x8_diff %+ SUFFIX
169
-    add            r5d, eax
170
-
171
-    cmp            r4d, 16
172
-    jne .done
173
-
174
-    lea             r1, [r1+r3*8-8]
175
-    lea             r2, [r2+r3*8-8]
176
-    call hadamard8x8_diff %+ SUFFIX
177
-    add            r5d, eax
178
-
179
-    add             r1, 8
180
-    add             r2, 8
181
-    call hadamard8x8_diff %+ SUFFIX
182
-    add            r5d, eax
183
-
184
-.done:
185
-    mov            eax, r5d
186
-%ifndef m8
187
-    ADD            rsp, pad
188
-%endif
189
-    RET
190
-%endmacro
191
-
192
-%macro HADAMARD8_DIFF 0-1
193
-%if cpuflag(sse2)
194
-hadamard8x8_diff %+ SUFFIX:
195
-    lea                          r0, [r3*3]
196
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
197
-    HADAMARD8
198
-%if ARCH_X86_64
199
-    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
200
-%else
201
-    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
202
-%endif
203
-    HADAMARD8
204
-    ABS_SUM_8x8         rsp+gprsize
205
-    HSUM                        m0, m1, eax
206
-    and                         eax, 0xFFFF
207
-    ret
208
-
209
-hadamard8_16_wrapper %1, 3
210
-%elif cpuflag(mmx)
211
-ALIGN 16
212
-; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
213
-;                               uint8_t *src2, int stride, int h)
214
-; r0 = void *s = unused, int h = unused (always 8)
215
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
216
-; can simply call this 2x2 times (and that's why we access rsp+gprsize
217
-; everywhere, which is the rsp of the calling func)
218
-hadamard8x8_diff %+ SUFFIX:
219
-    lea                          r0, [r3*3]
220
-
221
-    ; first 4x8 pixels
222
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
223
-    HADAMARD8
224
-    mova         [rsp+gprsize+0x60], m7
225
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
226
-    STORE4              rsp+gprsize, m0, m1, m2, m3
227
-    mova                         m7, [rsp+gprsize+0x60]
228
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
229
-    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
230
-
231
-    ; second 4x8 pixels
232
-    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
233
-    HADAMARD8
234
-    mova         [rsp+gprsize+0x60], m7
235
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
236
-    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
237
-    mova                         m7, [rsp+gprsize+0x60]
238
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
239
-
240
-    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
241
-    HADAMARD8
242
-    ABS_SUM_8x8_32 rsp+gprsize+0x60
243
-    mova         [rsp+gprsize+0x60], m0
244
-
245
-    LOAD4          rsp+gprsize     , m0, m1, m2, m3
246
-    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
247
-    HADAMARD8
248
-    ABS_SUM_8x8_32 rsp+gprsize
249
-    paddusw                      m0, [rsp+gprsize+0x60]
250
-
251
-    HSUM                         m0, m1, eax
252
-    and                         rax, 0xFFFF
253
-    ret
254
-
255
-hadamard8_16_wrapper 0, 14
256
-%endif
257
-%endmacro
258
-
259
-INIT_MMX mmx
260
-HADAMARD8_DIFF
261
-
262
-INIT_MMX mmxext
263
-HADAMARD8_DIFF
264
-
265
-INIT_XMM sse2
266
-%if ARCH_X86_64
267
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
268
-%else
269
-%define ABS_SUM_8x8 ABS_SUM_8x8_32
270
-%endif
271
-HADAMARD8_DIFF 10
272
-
273
-INIT_XMM ssse3
274
-%define ABS_SUM_8x8 ABS_SUM_8x8_64
275
-HADAMARD8_DIFF 9
276
-
277
-INIT_XMM sse2
278
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
279
-;                   int line_size, int h);
280
-cglobal sse16, 5, 5, 8
281
-    shr      r4d, 1
282
-    pxor      m0, m0         ; mm0 = 0
283
-    pxor      m7, m7         ; mm7 holds the sum
284
-
285
-.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
286
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
287
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
288
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
289
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
290
-
291
-    ; todo: mm1-mm2, mm3-mm4
292
-    ; algo: subtract mm1 from mm2 with saturation and vice versa
293
-    ;       OR the result to get the absolute difference
294
-    mova      m5, m1
295
-    mova      m6, m3
296
-    psubusb   m1, m2
297
-    psubusb   m3, m4
298
-    psubusb   m2, m5
299
-    psubusb   m4, m6
300
-
301
-    por       m2, m1
302
-    por       m4, m3
303
-
304
-    ; now convert to 16-bit vectors so we can square them
305
-    mova      m1, m2
306
-    mova      m3, m4
307
-
308
-    punpckhbw m2, m0
309
-    punpckhbw m4, m0
310
-    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
312
-    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)
312
-
313
-    pmaddwd   m2, m2
314
-    pmaddwd   m4, m4
315
-    pmaddwd   m1, m1
316
-    pmaddwd   m3, m3
317
-
318
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
319
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
320
-
321
-    paddd     m1, m2
322
-    paddd     m3, m4
323
-    paddd     m7, m1
324
-    paddd     m7, m3
325
-
326
-    dec       r4
327
-    jnz .next2lines
328
-
329
-    mova      m1, m7
330
-    psrldq    m7, 8          ; shift hi qword to lo
331
-    paddd     m7, m1
332
-    mova      m1, m7
333
-    psrldq    m7, 4          ; shift hi dword to lo
334
-    paddd     m7, m1
335
-    movd     eax, m7         ; return value
336
-    RET
337 1
new file mode 100644
... ...
@@ -0,0 +1,336 @@
0
+;*****************************************************************************
1
+;* SIMD-optimized motion estimation
2
+;*****************************************************************************
3
+;* Copyright (c) 2000, 2001 Fabrice Bellard
4
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
+;*
6
+;* This file is part of Libav.
7
+;*
8
+;* Libav is free software; you can redistribute it and/or
9
+;* modify it under the terms of the GNU Lesser General Public
10
+;* License as published by the Free Software Foundation; either
11
+;* version 2.1 of the License, or (at your option) any later version.
12
+;*
13
+;* Libav is distributed in the hope that it will be useful,
14
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
+;* Lesser General Public License for more details.
17
+;*
18
+;* You should have received a copy of the GNU Lesser General Public
19
+;* License along with Libav; if not, write to the Free Software
20
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+;*****************************************************************************
22
+
23
+%include "libavutil/x86/x86util.asm"
24
+
25
+SECTION .text
26
+
27
+%macro DIFF_PIXELS_1 4
28
+    movh            %1, %3
29
+    movh            %2, %4
30
+    punpcklbw       %2, %1
31
+    punpcklbw       %1, %1
32
+    psubw           %1, %2
33
+%endmacro
34
+
35
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
36
+; %6=temporary storage location
37
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
38
+%macro DIFF_PIXELS_8 6
39
+    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
40
+    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
41
+    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
42
+    add             %1, %5
43
+    add             %2, %5
44
+    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
45
+    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
46
+    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
47
+    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
48
+%ifdef m8
49
+    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
50
+%else
51
+    mova          [%6], m0
52
+    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
53
+    mova            m0, [%6]
54
+%endif
55
+    sub             %1, %5
56
+    sub             %2, %5
57
+%endmacro
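DIFF_PIXELS_8 fills m0..m7 with one row of pix1 - pix2 differences each, widened to 16 bits; the MMX build covers 4 columns per invocation (offsets 0 and 4 in the callers below), the SSE2 build a full 8. A scalar sketch of the overall effect (the helper name and the w parameter are illustrative, not part of the patch):

    /* Illustrative scalar equivalent: one output row of 16-bit differences
     * per input row, w columns at a time (4 for MMX, 8 for SSE2). */
    static void diff_pixels_rows(int16_t *dst, const uint8_t *pix1,
                                 const uint8_t *pix2, int stride, int w)
    {
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < w; x++)
                dst[y * w + x] = pix1[y * stride + x] - pix2[y * stride + x];
    }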
58
+
59
+%macro HADAMARD8 0
60
+    SUMSUB_BADC       w, 0, 1, 2, 3
61
+    SUMSUB_BADC       w, 4, 5, 6, 7
62
+    SUMSUB_BADC       w, 0, 2, 1, 3
63
+    SUMSUB_BADC       w, 4, 6, 5, 7
64
+    SUMSUB_BADC       w, 0, 4, 1, 5
65
+    SUMSUB_BADC       w, 2, 6, 3, 7
66
+%endmacro
67
+
68
+%macro ABS1_SUM 3
69
+    ABS1            %1, %2
70
+    paddusw         %3, %1
71
+%endmacro
72
+
73
+%macro ABS2_SUM 6
74
+    ABS2            %1, %2, %3, %4
75
+    paddusw         %5, %1
76
+    paddusw         %6, %2
77
+%endmacro
78
+
79
+%macro ABS_SUM_8x8_64 1
80
+    ABS2            m0, m1, m8, m9
81
+    ABS2_SUM        m2, m3, m8, m9, m0, m1
82
+    ABS2_SUM        m4, m5, m8, m9, m0, m1
83
+    ABS2_SUM        m6, m7, m8, m9, m0, m1
84
+    paddusw         m0, m1
85
+%endmacro
86
+
87
+%macro ABS_SUM_8x8_32 1
88
+    mova          [%1], m7
89
+    ABS1            m0, m7
90
+    ABS1            m1, m7
91
+    ABS1_SUM        m2, m7, m0
92
+    ABS1_SUM        m3, m7, m1
93
+    ABS1_SUM        m4, m7, m0
94
+    ABS1_SUM        m5, m7, m1
95
+    ABS1_SUM        m6, m7, m0
96
+    mova            m2, [%1]
97
+    ABS1_SUM        m2, m7, m1
98
+    paddusw         m0, m1
99
+%endmacro
100
+
101
+; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
102
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
103
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
104
+%macro HSUM 3
105
+%if cpuflag(sse2)
106
+    movhlps         %2, %1
107
+    paddusw         %1, %2
108
+    pshuflw         %2, %1, 0xE
109
+    paddusw         %1, %2
110
+    pshuflw         %2, %1, 0x1
111
+    paddusw         %1, %2
112
+    movd            %3, %1
113
+%elif cpuflag(mmxext)
114
+    pshufw          %2, %1, 0xE
115
+    paddusw         %1, %2
116
+    pshufw          %2, %1, 0x1
117
+    paddusw         %1, %2
118
+    movd            %3, %1
119
+%elif cpuflag(mmx)
120
+    mova            %2, %1
121
+    psrlq           %1, 32
122
+    paddusw         %1, %2
123
+    mova            %2, %1
124
+    psrlq           %1, 16
125
+    paddusw         %1, %2
126
+    movd            %3, %1
127
+%endif
128
+%endmacro
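HSUM then folds the 16-bit lanes of a register into a scalar with paddusw, so the result saturates at 0xFFFF instead of wrapping, which is what the FIXME above refers to. A simplified scalar model (lane count and the clamping point are simplified; the SIMD version saturates on every pairwise add):

    /* Simplified model of HSUM: add the word lanes and clamp at 0xFFFF. */
    static unsigned hsum_words(const uint16_t *lane, int nlanes)
    {
        unsigned sum = 0;
        for (int i = 0; i < nlanes; i++)
            sum += lane[i];
        return sum > 0xFFFF ? 0xFFFF : sum;
    }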
129
+
130
+%macro STORE4 5
131
+    mova [%1+mmsize*0], %2
132
+    mova [%1+mmsize*1], %3
133
+    mova [%1+mmsize*2], %4
134
+    mova [%1+mmsize*3], %5
135
+%endmacro
136
+
137
+%macro LOAD4 5
138
+    mova            %2, [%1+mmsize*0]
139
+    mova            %3, [%1+mmsize*1]
140
+    mova            %4, [%1+mmsize*2]
141
+    mova            %5, [%1+mmsize*3]
142
+%endmacro
143
+
144
+%macro hadamard8_16_wrapper 2
145
+cglobal hadamard8_diff, 4, 4, %1
146
+%ifndef m8
147
+    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
148
+    SUB            rsp, pad
149
+%endif
150
+    call hadamard8x8_diff %+ SUFFIX
151
+%ifndef m8
152
+    ADD            rsp, pad
153
+%endif
154
+    RET
155
+
156
+cglobal hadamard8_diff16, 5, 6, %1
157
+%ifndef m8
158
+    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
159
+    SUB            rsp, pad
160
+%endif
161
+
162
+    call hadamard8x8_diff %+ SUFFIX
163
+    mov            r5d, eax
164
+
165
+    add             r1, 8
166
+    add             r2, 8
167
+    call hadamard8x8_diff %+ SUFFIX
168
+    add            r5d, eax
169
+
170
+    cmp            r4d, 16
171
+    jne .done
172
+
173
+    lea             r1, [r1+r3*8-8]
174
+    lea             r2, [r2+r3*8-8]
175
+    call hadamard8x8_diff %+ SUFFIX
176
+    add            r5d, eax
177
+
178
+    add             r1, 8
179
+    add             r2, 8
180
+    call hadamard8x8_diff %+ SUFFIX
181
+    add            r5d, eax
182
+
183
+.done:
184
+    mov            eax, r5d
185
+%ifndef m8
186
+    ADD            rsp, pad
187
+%endif
188
+    RET
189
+%endmacro
190
+
191
+%macro HADAMARD8_DIFF 0-1
192
+%if cpuflag(sse2)
193
+hadamard8x8_diff %+ SUFFIX:
194
+    lea                          r0, [r3*3]
195
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
196
+    HADAMARD8
197
+%if ARCH_X86_64
198
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
199
+%else
200
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
201
+%endif
202
+    HADAMARD8
203
+    ABS_SUM_8x8         rsp+gprsize
204
+    HSUM                        m0, m1, eax
205
+    and                         eax, 0xFFFF
206
+    ret
207
+
208
+hadamard8_16_wrapper %1, 3
209
+%elif cpuflag(mmx)
210
+ALIGN 16
211
+; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
212
+;                               uint8_t *src2, int stride, int h)
213
+; r0 = void *s = unused, int h = unused (always 8)
214
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16
215
+; can simply call this 2x2 times (and that's why we access rsp+gprsize
216
+; everywhere, which is the rsp of the calling func)
217
+hadamard8x8_diff %+ SUFFIX:
218
+    lea                          r0, [r3*3]
219
+
220
+    ; first 4x8 pixels
221
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
222
+    HADAMARD8
223
+    mova         [rsp+gprsize+0x60], m7
224
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
225
+    STORE4              rsp+gprsize, m0, m1, m2, m3
226
+    mova                         m7, [rsp+gprsize+0x60]
227
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
228
+    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
229
+
230
+    ; second 4x8 pixels
231
+    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
232
+    HADAMARD8
233
+    mova         [rsp+gprsize+0x60], m7
234
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
235
+    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
236
+    mova                         m7, [rsp+gprsize+0x60]
237
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
238
+
239
+    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
240
+    HADAMARD8
241
+    ABS_SUM_8x8_32 rsp+gprsize+0x60
242
+    mova         [rsp+gprsize+0x60], m0
243
+
244
+    LOAD4          rsp+gprsize     , m0, m1, m2, m3
245
+    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
246
+    HADAMARD8
247
+    ABS_SUM_8x8_32 rsp+gprsize
248
+    paddusw                      m0, [rsp+gprsize+0x60]
249
+
250
+    HSUM                         m0, m1, eax
251
+    and                         rax, 0xFFFF
252
+    ret
253
+
254
+hadamard8_16_wrapper 0, 14
255
+%endif
256
+%endmacro
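Taken together, the macro above computes an 8x8 difference block, runs an 8-point Hadamard butterfly along rows and (after the transpose) columns, and sums the absolute coefficients. A scalar model for reference, not the patch's C fallback (the function name is illustrative; saturation in the SIMD sums is modelled with FFMIN):

    /* Scalar model of hadamard8x8_diff: difference, 2-D Hadamard, L1 norm,
     * clipped to 16 bits like the "and eax, 0xFFFF" above. */
    static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
                                  int stride)
    {
        int tmp[8][8], sum = 0;

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                tmp[y][x] = src1[y * stride + x] - src2[y * stride + x];

        for (int pass = 0; pass < 2; pass++) {          /* rows, then columns */
            for (int i = 0; i < 8; i++) {
                int v[8];
                for (int k = 0; k < 8; k++)
                    v[k] = pass ? tmp[k][i] : tmp[i][k];
                for (int len = 1; len < 8; len <<= 1)   /* butterfly stages */
                    for (int j = 0; j < 8; j += len << 1)
                        for (int k = j; k < j + len; k++) {
                            int a = v[k], b = v[k + len];
                            v[k]       = a + b;
                            v[k + len] = a - b;
                        }
                for (int k = 0; k < 8; k++)
                    if (pass) tmp[k][i] = v[k]; else tmp[i][k] = v[k];
            }
        }

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sum += FFABS(tmp[y][x]);

        return FFMIN(sum, 0xFFFF);
    }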
257
+
258
+INIT_MMX mmx
259
+HADAMARD8_DIFF
260
+
261
+INIT_MMX mmxext
262
+HADAMARD8_DIFF
263
+
264
+INIT_XMM sse2
265
+%if ARCH_X86_64
266
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
267
+%else
268
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
269
+%endif
270
+HADAMARD8_DIFF 10
271
+
272
+INIT_XMM ssse3
273
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
274
+HADAMARD8_DIFF 9
275
+
276
+INIT_XMM sse2
277
+; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
278
+;                   int line_size, int h);
279
+cglobal sse16, 5, 5, 8
280
+    shr      r4d, 1
281
+    pxor      m0, m0         ; mm0 = 0
282
+    pxor      m7, m7         ; mm7 holds the sum
283
+
284
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
285
+    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
286
+    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
287
+    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
288
+    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
289
+
290
+    ; todo: mm1-mm2, mm3-mm4
291
+    ; algo: subtract mm1 from mm2 with saturation and vice versa
292
+    ;       OR the result to get the absolute difference
293
+    mova      m5, m1
294
+    mova      m6, m3
295
+    psubusb   m1, m2
296
+    psubusb   m3, m4
297
+    psubusb   m2, m5
298
+    psubusb   m4, m6
299
+
300
+    por       m2, m1
301
+    por       m4, m3
302
+
303
+    ; now convert to 16-bit vectors so we can square them
304
+    mova      m1, m2
305
+    mova      m3, m4
306
+
307
+    punpckhbw m2, m0
308
+    punpckhbw m4, m0
309
+    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
310
+    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)
311
+
312
+    pmaddwd   m2, m2
313
+    pmaddwd   m4, m4
314
+    pmaddwd   m1, m1
315
+    pmaddwd   m3, m3
316
+
317
+    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
318
+    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
319
+
320
+    paddd     m1, m2
321
+    paddd     m3, m4
322
+    paddd     m7, m1
323
+    paddd     m7, m3
324
+
325
+    dec       r4
326
+    jnz .next2lines
327
+
328
+    mova      m1, m7
329
+    psrldq    m7, 8          ; shift hi qword to lo
330
+    paddd     m7, m1
331
+    mova      m1, m7
332
+    psrldq    m7, 4          ; shift hi dword to lo
333
+    paddd     m7, m1
334
+    movd     eax, m7         ; return value
335
+    RET
0 336
new file mode 100644
... ...
@@ -0,0 +1,1321 @@
0
+/*
1
+ * SIMD-optimized motion estimation
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of Libav.
8
+ *
9
+ * Libav is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * Libav is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with Libav; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/x86/asm.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/me_cmp.h"
29
+#include "libavcodec/mpegvideo.h"
30
+
31
+#if HAVE_INLINE_ASM
32
+
33
+static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
34
+                    int line_size, int h)
35
+{
36
+    int tmp;
37
+
38
+    __asm__ volatile (
39
+        "movl         %4, %%ecx          \n"
40
+        "shr          $1, %%ecx          \n"
41
+        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
42
+        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
43
+        "1:                              \n"
44
+        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
45
+        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
46
+        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
47
+        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
48
+
49
+        /* todo: mm1-mm2, mm3-mm4 */
50
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
51
+        /*       OR the results to get absolute difference */
52
+        "movq      %%mm1, %%mm5          \n"
53
+        "movq      %%mm3, %%mm6          \n"
54
+        "psubusb   %%mm2, %%mm1          \n"
55
+        "psubusb   %%mm4, %%mm3          \n"
56
+        "psubusb   %%mm5, %%mm2          \n"
57
+        "psubusb   %%mm6, %%mm4          \n"
58
+
59
+        "por       %%mm1, %%mm2          \n"
60
+        "por       %%mm3, %%mm4          \n"
61
+
62
+        /* now convert to 16-bit vectors so we can square them */
63
+        "movq      %%mm2, %%mm1          \n"
64
+        "movq      %%mm4, %%mm3          \n"
65
+
66
+        "punpckhbw %%mm0, %%mm2          \n"
67
+        "punpckhbw %%mm0, %%mm4          \n"
68
+        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
69
+        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
70
+
71
+        "pmaddwd   %%mm2, %%mm2          \n"
72
+        "pmaddwd   %%mm4, %%mm4          \n"
73
+        "pmaddwd   %%mm1, %%mm1          \n"
74
+        "pmaddwd   %%mm3, %%mm3          \n"
75
+
76
+        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * line_size */
77
+        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * line_size */
78
+
79
+        "paddd     %%mm2, %%mm1          \n"
80
+        "paddd     %%mm4, %%mm3          \n"
81
+        "paddd     %%mm1, %%mm7          \n"
82
+        "paddd     %%mm3, %%mm7          \n"
83
+
84
+        "decl      %%ecx                 \n"
85
+        "jnz       1b                    \n"
86
+
87
+        "movq      %%mm7, %%mm1          \n"
88
+        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
89
+        "paddd     %%mm7, %%mm1          \n"
90
+        "movd      %%mm1, %2             \n"
91
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
92
+        : "r" ((x86_reg) line_size), "m" (h)
93
+        : "%ecx");
94
+
95
+    return tmp;
96
+}
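The "subtract with saturation and OR" step described in the comments above is the usual SIMD trick for per-byte absolute differences; in scalar form it looks like this (a sketch, not part of the patch):

    /* |a - b| via unsigned saturating subtraction: one direction leaves the
     * difference, the other leaves zero, so OR-ing them gives the magnitude. */
    static inline unsigned abs_diff_u8(unsigned a, unsigned b)
    {
        unsigned d1 = a > b ? a - b : 0;   /* psubusb a, b */
        unsigned d2 = b > a ? b - a : 0;   /* psubusb b, a */
        return d1 | d2;                    /* por          */
    }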
97
+
98
+static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
99
+                     int line_size, int h)
100
+{
101
+    int tmp;
102
+
103
+    __asm__ volatile (
104
+        "movl %4, %%ecx\n"
105
+        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
106
+        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
107
+        "1:\n"
108
+        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
109
+        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
110
+        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
111
+        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
112
+
113
+        /* todo: mm1-mm2, mm3-mm4 */
114
+        /* algo: subtract mm1 from mm2 with saturation and vice versa */
115
+        /*       OR the results to get absolute difference */
116
+        "movq %%mm1, %%mm5\n"
117
+        "movq %%mm3, %%mm6\n"
118
+        "psubusb %%mm2, %%mm1\n"
119
+        "psubusb %%mm4, %%mm3\n"
120
+        "psubusb %%mm5, %%mm2\n"
121
+        "psubusb %%mm6, %%mm4\n"
122
+
123
+        "por %%mm1, %%mm2\n"
124
+        "por %%mm3, %%mm4\n"
125
+
126
+        /* now convert to 16-bit vectors so we can square them */
127
+        "movq %%mm2, %%mm1\n"
128
+        "movq %%mm4, %%mm3\n"
129
+
130
+        "punpckhbw %%mm0, %%mm2\n"
131
+        "punpckhbw %%mm0, %%mm4\n"
132
+        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
133
+        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
134
+
135
+        "pmaddwd %%mm2, %%mm2\n"
136
+        "pmaddwd %%mm4, %%mm4\n"
137
+        "pmaddwd %%mm1, %%mm1\n"
138
+        "pmaddwd %%mm3, %%mm3\n"
139
+
140
+        "add %3, %0\n"
141
+        "add %3, %1\n"
142
+
143
+        "paddd %%mm2, %%mm1\n"
144
+        "paddd %%mm4, %%mm3\n"
145
+        "paddd %%mm1, %%mm7\n"
146
+        "paddd %%mm3, %%mm7\n"
147
+
148
+        "decl %%ecx\n"
149
+        "jnz 1b\n"
150
+
151
+        "movq %%mm7, %%mm1\n"
152
+        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
153
+        "paddd %%mm7, %%mm1\n"
154
+        "movd %%mm1, %2\n"
155
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
156
+        : "r" ((x86_reg) line_size), "m" (h)
157
+        : "%ecx");
158
+
159
+    return tmp;
160
+}
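Both MMX kernels above compute the same quantity at different widths; a plain scalar reference of what they return (hypothetical helper, shown for clarity):

    /* Scalar reference for the SSE comparators: sum of squared differences
     * over a w x h block (w = 8 or 16). */
    static int sse_block_ref(const uint8_t *pix1, const uint8_t *pix2,
                             int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }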
161
+
162
+static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
163
+{
164
+    int tmp;
165
+
166
+    __asm__ volatile (
167
+        "movl %3, %%ecx\n"
168
+        "pxor %%mm7, %%mm7\n"
169
+        "pxor %%mm6, %%mm6\n"
170
+
171
+        "movq (%0), %%mm0\n"
172
+        "movq %%mm0, %%mm1\n"
173
+        "psllq $8, %%mm0\n"
174
+        "psrlq $8, %%mm1\n"
175
+        "psrlq $8, %%mm0\n"
176
+        "movq %%mm0, %%mm2\n"
177
+        "movq %%mm1, %%mm3\n"
178
+        "punpcklbw %%mm7, %%mm0\n"
179
+        "punpcklbw %%mm7, %%mm1\n"
180
+        "punpckhbw %%mm7, %%mm2\n"
181
+        "punpckhbw %%mm7, %%mm3\n"
182
+        "psubw %%mm1, %%mm0\n"
183
+        "psubw %%mm3, %%mm2\n"
184
+
185
+        "add %2, %0\n"
186
+
187
+        "movq (%0), %%mm4\n"
188
+        "movq %%mm4, %%mm1\n"
189
+        "psllq $8, %%mm4\n"
190
+        "psrlq $8, %%mm1\n"
191
+        "psrlq $8, %%mm4\n"
192
+        "movq %%mm4, %%mm5\n"
193
+        "movq %%mm1, %%mm3\n"
194
+        "punpcklbw %%mm7, %%mm4\n"
195
+        "punpcklbw %%mm7, %%mm1\n"
196
+        "punpckhbw %%mm7, %%mm5\n"
197
+        "punpckhbw %%mm7, %%mm3\n"
198
+        "psubw %%mm1, %%mm4\n"
199
+        "psubw %%mm3, %%mm5\n"
200
+        "psubw %%mm4, %%mm0\n"
201
+        "psubw %%mm5, %%mm2\n"
202
+        "pxor %%mm3, %%mm3\n"
203
+        "pxor %%mm1, %%mm1\n"
204
+        "pcmpgtw %%mm0, %%mm3\n\t"
205
+        "pcmpgtw %%mm2, %%mm1\n\t"
206
+        "pxor %%mm3, %%mm0\n"
207
+        "pxor %%mm1, %%mm2\n"
208
+        "psubw %%mm3, %%mm0\n"
209
+        "psubw %%mm1, %%mm2\n"
210
+        "paddw %%mm0, %%mm2\n"
211
+        "paddw %%mm2, %%mm6\n"
212
+
213
+        "add %2, %0\n"
214
+        "1:\n"
215
+
216
+        "movq (%0), %%mm0\n"
217
+        "movq %%mm0, %%mm1\n"
218
+        "psllq $8, %%mm0\n"
219
+        "psrlq $8, %%mm1\n"
220
+        "psrlq $8, %%mm0\n"
221
+        "movq %%mm0, %%mm2\n"
222
+        "movq %%mm1, %%mm3\n"
223
+        "punpcklbw %%mm7, %%mm0\n"
224
+        "punpcklbw %%mm7, %%mm1\n"
225
+        "punpckhbw %%mm7, %%mm2\n"
226
+        "punpckhbw %%mm7, %%mm3\n"
227
+        "psubw %%mm1, %%mm0\n"
228
+        "psubw %%mm3, %%mm2\n"
229
+        "psubw %%mm0, %%mm4\n"
230
+        "psubw %%mm2, %%mm5\n"
231
+        "pxor  %%mm3, %%mm3\n"
232
+        "pxor  %%mm1, %%mm1\n"
233
+        "pcmpgtw %%mm4, %%mm3\n\t"
234
+        "pcmpgtw %%mm5, %%mm1\n\t"
235
+        "pxor  %%mm3, %%mm4\n"
236
+        "pxor  %%mm1, %%mm5\n"
237
+        "psubw %%mm3, %%mm4\n"
238
+        "psubw %%mm1, %%mm5\n"
239
+        "paddw %%mm4, %%mm5\n"
240
+        "paddw %%mm5, %%mm6\n"
241
+
242
+        "add %2, %0\n"
243
+
244
+        "movq (%0), %%mm4\n"
245
+        "movq      %%mm4, %%mm1\n"
246
+        "psllq $8, %%mm4\n"
247
+        "psrlq $8, %%mm1\n"
248
+        "psrlq $8, %%mm4\n"
249
+        "movq      %%mm4, %%mm5\n"
250
+        "movq      %%mm1, %%mm3\n"
251
+        "punpcklbw %%mm7, %%mm4\n"
252
+        "punpcklbw %%mm7, %%mm1\n"
253
+        "punpckhbw %%mm7, %%mm5\n"
254
+        "punpckhbw %%mm7, %%mm3\n"
255
+        "psubw     %%mm1, %%mm4\n"
256
+        "psubw     %%mm3, %%mm5\n"
257
+        "psubw     %%mm4, %%mm0\n"
258
+        "psubw     %%mm5, %%mm2\n"
259
+        "pxor      %%mm3, %%mm3\n"
260
+        "pxor      %%mm1, %%mm1\n"
261
+        "pcmpgtw   %%mm0, %%mm3\n\t"
262
+        "pcmpgtw   %%mm2, %%mm1\n\t"
263
+        "pxor      %%mm3, %%mm0\n"
264
+        "pxor      %%mm1, %%mm2\n"
265
+        "psubw     %%mm3, %%mm0\n"
266
+        "psubw     %%mm1, %%mm2\n"
267
+        "paddw     %%mm0, %%mm2\n"
268
+        "paddw     %%mm2, %%mm6\n"
269
+
270
+        "add  %2, %0\n"
271
+        "subl $2, %%ecx\n"
272
+        " jnz 1b\n"
273
+
274
+        "movq      %%mm6, %%mm0\n"
275
+        "punpcklwd %%mm7, %%mm0\n"
276
+        "punpckhwd %%mm7, %%mm6\n"
277
+        "paddd     %%mm0, %%mm6\n"
278
+
279
+        "movq  %%mm6, %%mm0\n"
280
+        "psrlq $32,   %%mm6\n"
281
+        "paddd %%mm6, %%mm0\n"
282
+        "movd  %%mm0, %1\n"
283
+        : "+r" (pix1), "=r" (tmp)
284
+        : "r" ((x86_reg) line_size), "g" (h - 2)
285
+        : "%ecx");
286
+
287
+    return tmp;
288
+}
289
+
290
+static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
291
+{
292
+    int tmp;
293
+    uint8_t *pix = pix1;
294
+
295
+    __asm__ volatile (
296
+        "movl %3, %%ecx\n"
297
+        "pxor %%mm7, %%mm7\n"
298
+        "pxor %%mm6, %%mm6\n"
299
+
300
+        "movq (%0), %%mm0\n"
301
+        "movq 1(%0), %%mm1\n"
302
+        "movq %%mm0, %%mm2\n"
303
+        "movq %%mm1, %%mm3\n"
304
+        "punpcklbw %%mm7, %%mm0\n"
305
+        "punpcklbw %%mm7, %%mm1\n"
306
+        "punpckhbw %%mm7, %%mm2\n"
307
+        "punpckhbw %%mm7, %%mm3\n"
308
+        "psubw %%mm1, %%mm0\n"
309
+        "psubw %%mm3, %%mm2\n"
310
+
311
+        "add %2, %0\n"
312
+
313
+        "movq (%0), %%mm4\n"
314
+        "movq 1(%0), %%mm1\n"
315
+        "movq %%mm4, %%mm5\n"
316
+        "movq %%mm1, %%mm3\n"
317
+        "punpcklbw %%mm7, %%mm4\n"
318
+        "punpcklbw %%mm7, %%mm1\n"
319
+        "punpckhbw %%mm7, %%mm5\n"
320
+        "punpckhbw %%mm7, %%mm3\n"
321
+        "psubw %%mm1, %%mm4\n"
322
+        "psubw %%mm3, %%mm5\n"
323
+        "psubw %%mm4, %%mm0\n"
324
+        "psubw %%mm5, %%mm2\n"
325
+        "pxor %%mm3, %%mm3\n"
326
+        "pxor %%mm1, %%mm1\n"
327
+        "pcmpgtw %%mm0, %%mm3\n\t"
328
+        "pcmpgtw %%mm2, %%mm1\n\t"
329
+        "pxor %%mm3, %%mm0\n"
330
+        "pxor %%mm1, %%mm2\n"
331
+        "psubw %%mm3, %%mm0\n"
332
+        "psubw %%mm1, %%mm2\n"
333
+        "paddw %%mm0, %%mm2\n"
334
+        "paddw %%mm2, %%mm6\n"
335
+
336
+        "add %2, %0\n"
337
+        "1:\n"
338
+
339
+        "movq (%0), %%mm0\n"
340
+        "movq 1(%0), %%mm1\n"
341
+        "movq %%mm0, %%mm2\n"
342
+        "movq %%mm1, %%mm3\n"
343
+        "punpcklbw %%mm7, %%mm0\n"
344
+        "punpcklbw %%mm7, %%mm1\n"
345
+        "punpckhbw %%mm7, %%mm2\n"
346
+        "punpckhbw %%mm7, %%mm3\n"
347
+        "psubw %%mm1, %%mm0\n"
348
+        "psubw %%mm3, %%mm2\n"
349
+        "psubw %%mm0, %%mm4\n"
350
+        "psubw %%mm2, %%mm5\n"
351
+        "pxor %%mm3, %%mm3\n"
352
+        "pxor %%mm1, %%mm1\n"
353
+        "pcmpgtw %%mm4, %%mm3\n\t"
354
+        "pcmpgtw %%mm5, %%mm1\n\t"
355
+        "pxor %%mm3, %%mm4\n"
356
+        "pxor %%mm1, %%mm5\n"
357
+        "psubw %%mm3, %%mm4\n"
358
+        "psubw %%mm1, %%mm5\n"
359
+        "paddw %%mm4, %%mm5\n"
360
+        "paddw %%mm5, %%mm6\n"
361
+
362
+        "add %2, %0\n"
363
+
364
+        "movq (%0), %%mm4\n"
365
+        "movq 1(%0), %%mm1\n"
366
+        "movq %%mm4, %%mm5\n"
367
+        "movq %%mm1, %%mm3\n"
368
+        "punpcklbw %%mm7, %%mm4\n"
369
+        "punpcklbw %%mm7, %%mm1\n"
370
+        "punpckhbw %%mm7, %%mm5\n"
371
+        "punpckhbw %%mm7, %%mm3\n"
372
+        "psubw %%mm1, %%mm4\n"
373
+        "psubw %%mm3, %%mm5\n"
374
+        "psubw %%mm4, %%mm0\n"
375
+        "psubw %%mm5, %%mm2\n"
376
+        "pxor %%mm3, %%mm3\n"
377
+        "pxor %%mm1, %%mm1\n"
378
+        "pcmpgtw %%mm0, %%mm3\n\t"
379
+        "pcmpgtw %%mm2, %%mm1\n\t"
380
+        "pxor %%mm3, %%mm0\n"
381
+        "pxor %%mm1, %%mm2\n"
382
+        "psubw %%mm3, %%mm0\n"
383
+        "psubw %%mm1, %%mm2\n"
384
+        "paddw %%mm0, %%mm2\n"
385
+        "paddw %%mm2, %%mm6\n"
386
+
387
+        "add %2, %0\n"
388
+        "subl $2, %%ecx\n"
389
+        " jnz 1b\n"
390
+
391
+        "movq %%mm6, %%mm0\n"
392
+        "punpcklwd %%mm7, %%mm0\n"
393
+        "punpckhwd %%mm7, %%mm6\n"
394
+        "paddd %%mm0, %%mm6\n"
395
+
396
+        "movq %%mm6, %%mm0\n"
397
+        "psrlq $32, %%mm6\n"
398
+        "paddd %%mm6, %%mm0\n"
399
+        "movd %%mm0, %1\n"
400
+        : "+r" (pix1), "=r" (tmp)
401
+        : "r" ((x86_reg) line_size), "g" (h - 2)
402
+        : "%ecx");
403
+
404
+    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
405
+}
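hf_noise8/hf_noise16 measure high-frequency content: the absolute change of the horizontal gradient from one line to the next, summed over the block. A scalar sketch (helper name is illustrative; the MMX code accumulates in 16-bit words, so extreme inputs can differ):

    /* Scalar model of hf_noise*: sum of |mixed second differences|. */
    static int hf_noise_ref(const uint8_t *pix, int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++)
            for (int x = 0; x < w - 1; x++) {
                int dx0 = pix[y * line_size + x]
                        - pix[y * line_size + x + 1];
                int dx1 = pix[(y + 1) * line_size + x]
                        - pix[(y + 1) * line_size + x + 1];
                sum += FFABS(dx0 - dx1);
            }
        return sum;
    }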
406
+
407
+static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
408
+                      int line_size, int h)
409
+{
410
+    int score1, score2;
411
+
412
+    if (c)
413
+        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
414
+    else
415
+        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
416
+    score2 = hf_noise16_mmx(pix1, line_size, h) -
417
+             hf_noise16_mmx(pix2, line_size, h);
418
+
419
+    if (c)
420
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
421
+    else
422
+        return score1 + FFABS(score2) * 8;
423
+}
424
+
425
+static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
426
+                     int line_size, int h)
427
+{
428
+    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
429
+    int score2 = hf_noise8_mmx(pix1, line_size, h) -
430
+                 hf_noise8_mmx(pix2, line_size, h);
431
+
432
+    if (c)
433
+        return score1 + FFABS(score2) * c->avctx->nsse_weight;
434
+    else
435
+        return score1 + FFABS(score2) * 8;
436
+}
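Both NSSE comparators combine the plain SSE score with the difference in high-frequency activity, scaled by the user-tunable nsse_weight (falling back to 8 when no codec context is available). Stated as a tiny sketch rather than a formula (helper name is illustrative):

    /* score = SSE(pix1, pix2) + |HF(pix1) - HF(pix2)| * nsse_weight */
    static int nsse_combine(int sse_score, int hf1, int hf2, int nsse_weight)
    {
        return sse_score + FFABS(hf1 - hf2) * nsse_weight;
    }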
437
+
438
+static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
439
+                            int line_size, int h)
440
+{
441
+    int tmp;
442
+
443
+    assert((((int) pix) & 7) == 0);
444
+    assert((line_size & 7) == 0);
445
+
446
+#define SUM(in0, in1, out0, out1)               \
447
+    "movq (%0), %%mm2\n"                        \
448
+    "movq 8(%0), %%mm3\n"                       \
449
+    "add %2,%0\n"                               \
450
+    "movq %%mm2, " #out0 "\n"                   \
451
+    "movq %%mm3, " #out1 "\n"                   \
452
+    "psubusb " #in0 ", %%mm2\n"                 \
453
+    "psubusb " #in1 ", %%mm3\n"                 \
454
+    "psubusb " #out0 ", " #in0 "\n"             \
455
+    "psubusb " #out1 ", " #in1 "\n"             \
456
+    "por %%mm2, " #in0 "\n"                     \
457
+    "por %%mm3, " #in1 "\n"                     \
458
+    "movq " #in0 ", %%mm2\n"                    \
459
+    "movq " #in1 ", %%mm3\n"                    \
460
+    "punpcklbw %%mm7, " #in0 "\n"               \
461
+    "punpcklbw %%mm7, " #in1 "\n"               \
462
+    "punpckhbw %%mm7, %%mm2\n"                  \
463
+    "punpckhbw %%mm7, %%mm3\n"                  \
464
+    "paddw " #in1 ", " #in0 "\n"                \
465
+    "paddw %%mm3, %%mm2\n"                      \
466
+    "paddw %%mm2, " #in0 "\n"                   \
467
+    "paddw " #in0 ", %%mm6\n"
468
+
469
+
470
+    __asm__ volatile (
471
+        "movl    %3, %%ecx\n"
472
+        "pxor %%mm6, %%mm6\n"
473
+        "pxor %%mm7, %%mm7\n"
474
+        "movq  (%0), %%mm0\n"
475
+        "movq 8(%0), %%mm1\n"
476
+        "add %2, %0\n"
477
+        "jmp 2f\n"
478
+        "1:\n"
479
+
480
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
481
+        "2:\n"
482
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
483
+
484
+        "subl $2, %%ecx\n"
485
+        "jnz 1b\n"
486
+
487
+        "movq  %%mm6, %%mm0\n"
488
+        "psrlq $32,   %%mm6\n"
489
+        "paddw %%mm6, %%mm0\n"
490
+        "movq  %%mm0, %%mm6\n"
491
+        "psrlq $16,   %%mm0\n"
492
+        "paddw %%mm6, %%mm0\n"
493
+        "movd  %%mm0, %1\n"
494
+        : "+r" (pix), "=r" (tmp)
495
+        : "r" ((x86_reg) line_size), "m" (h)
496
+        : "%ecx");
497
+
498
+    return tmp & 0xFFFF;
499
+}
500
+#undef SUM
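vsad_intra16 measures vertical activity within a single block: the SAD between each line and the line above it. A scalar reference of the intended result (a sketch; the MMX accumulation is 16-bit, hence the final mask):

    /* Scalar model of vsad_intra16: line-to-line SAD over a 16-wide block. */
    static int vsad_intra16_ref(const uint8_t *pix, int line_size, int h)
    {
        int sum = 0;
        for (int y = 1; y < h; y++)
            for (int x = 0; x < 16; x++)
                sum += FFABS(pix[y * line_size + x] -
                             pix[(y - 1) * line_size + x]);
        return sum & 0xFFFF;
    }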
501
+
502
+static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
503
+                               int line_size, int h)
504
+{
505
+    int tmp;
506
+
507
+    assert((((int) pix) & 7) == 0);
508
+    assert((line_size & 7) == 0);
509
+
510
+#define SUM(in0, in1, out0, out1)               \
511
+    "movq (%0), " #out0 "\n"                    \
512
+    "movq 8(%0), " #out1 "\n"                   \
513
+    "add %2, %0\n"                              \
514
+    "psadbw " #out0 ", " #in0 "\n"              \
515
+    "psadbw " #out1 ", " #in1 "\n"              \
516
+    "paddw " #in1 ", " #in0 "\n"                \
517
+    "paddw " #in0 ", %%mm6\n"
518
+
519
+    __asm__ volatile (
520
+        "movl %3, %%ecx\n"
521
+        "pxor %%mm6, %%mm6\n"
522
+        "pxor %%mm7, %%mm7\n"
523
+        "movq (%0), %%mm0\n"
524
+        "movq 8(%0), %%mm1\n"
525
+        "add %2, %0\n"
526
+        "jmp 2f\n"
527
+        "1:\n"
528
+
529
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
530
+        "2:\n"
531
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
532
+
533
+        "subl $2, %%ecx\n"
534
+        "jnz 1b\n"
535
+
536
+        "movd %%mm6, %1\n"
537
+        : "+r" (pix), "=r" (tmp)
538
+        : "r" ((x86_reg) line_size), "m" (h)
539
+        : "%ecx");
540
+
541
+    return tmp;
542
+}
543
+#undef SUM
544
+
545
+static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
546
+                      int line_size, int h)
547
+{
548
+    int tmp;
549
+
550
+    assert((((int) pix1) & 7) == 0);
551
+    assert((((int) pix2) & 7) == 0);
552
+    assert((line_size & 7) == 0);
553
+
554
+#define SUM(in0, in1, out0, out1)       \
555
+    "movq (%0), %%mm2\n"                \
556
+    "movq (%1), " #out0 "\n"            \
557
+    "movq 8(%0), %%mm3\n"               \
558
+    "movq 8(%1), " #out1 "\n"           \
559
+    "add %3, %0\n"                      \
560
+    "add %3, %1\n"                      \
561
+    "psubb " #out0 ", %%mm2\n"          \
562
+    "psubb " #out1 ", %%mm3\n"          \
563
+    "pxor %%mm7, %%mm2\n"               \
564
+    "pxor %%mm7, %%mm3\n"               \
565
+    "movq %%mm2, " #out0 "\n"           \
566
+    "movq %%mm3, " #out1 "\n"           \
567
+    "psubusb " #in0 ", %%mm2\n"         \
568
+    "psubusb " #in1 ", %%mm3\n"         \
569
+    "psubusb " #out0 ", " #in0 "\n"     \
570
+    "psubusb " #out1 ", " #in1 "\n"     \
571
+    "por %%mm2, " #in0 "\n"             \
572
+    "por %%mm3, " #in1 "\n"             \
573
+    "movq " #in0 ", %%mm2\n"            \
574
+    "movq " #in1 ", %%mm3\n"            \
575
+    "punpcklbw %%mm7, " #in0 "\n"       \
576
+    "punpcklbw %%mm7, " #in1 "\n"       \
577
+    "punpckhbw %%mm7, %%mm2\n"          \
578
+    "punpckhbw %%mm7, %%mm3\n"          \
579
+    "paddw " #in1 ", " #in0 "\n"        \
580
+    "paddw %%mm3, %%mm2\n"              \
581
+    "paddw %%mm2, " #in0 "\n"           \
582
+    "paddw " #in0 ", %%mm6\n"
583
+
584
+
585
+    __asm__ volatile (
586
+        "movl %4, %%ecx\n"
587
+        "pxor %%mm6, %%mm6\n"
588
+        "pcmpeqw %%mm7, %%mm7\n"
589
+        "psllw $15, %%mm7\n"
590
+        "packsswb %%mm7, %%mm7\n"
591
+        "movq (%0), %%mm0\n"
592
+        "movq (%1), %%mm2\n"
593
+        "movq 8(%0), %%mm1\n"
594
+        "movq 8(%1), %%mm3\n"
595
+        "add %3, %0\n"
596
+        "add %3, %1\n"
597
+        "psubb %%mm2, %%mm0\n"
598
+        "psubb %%mm3, %%mm1\n"
599
+        "pxor %%mm7, %%mm0\n"
600
+        "pxor %%mm7, %%mm1\n"
601
+        "jmp 2f\n"
602
+        "1:\n"
603
+
604
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
605
+        "2:\n"
606
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
607
+
608
+        "subl $2, %%ecx\n"
609
+        "jnz 1b\n"
610
+
611
+        "movq %%mm6, %%mm0\n"
612
+        "psrlq $32, %%mm6\n"
613
+        "paddw %%mm6, %%mm0\n"
614
+        "movq %%mm0, %%mm6\n"
615
+        "psrlq $16, %%mm0\n"
616
+        "paddw %%mm6, %%mm0\n"
617
+        "movd %%mm0, %2\n"
618
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
619
+        : "r" ((x86_reg) line_size), "m" (h)
620
+        : "%ecx");
621
+
622
+    return tmp & 0x7FFF;
623
+}
624
+#undef SUM
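The inter variant does the same thing on the residual pix1 - pix2; the 0x80 bias in the MMX code only keeps the signed residual representable in unsigned byte arithmetic. A scalar sketch of the intended value (byte wrap-around for extreme residuals aside):

    /* Scalar model of vsad16: vertical SAD of the prediction error. */
    static int vsad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                          int line_size, int h)
    {
        int sum = 0;
        for (int y = 1; y < h; y++)
            for (int x = 0; x < 16; x++) {
                int d0 = pix1[(y - 1) * line_size + x]
                       - pix2[(y - 1) * line_size + x];
                int d1 = pix1[y * line_size + x]
                       - pix2[y * line_size + x];
                sum += FFABS(d1 - d0);
            }
        return sum & 0x7FFF;
    }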
625
+
626
+static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
627
+                         int line_size, int h)
628
+{
629
+    int tmp;
630
+
631
+    assert((((int) pix1) & 7) == 0);
632
+    assert((((int) pix2) & 7) == 0);
633
+    assert((line_size & 7) == 0);
634
+
635
+#define SUM(in0, in1, out0, out1)               \
636
+    "movq (%0), " #out0 "\n"                    \
637
+    "movq (%1), %%mm2\n"                        \
638
+    "movq 8(%0), " #out1 "\n"                   \
639
+    "movq 8(%1), %%mm3\n"                       \
640
+    "add %3, %0\n"                              \
641
+    "add %3, %1\n"                              \
642
+    "psubb %%mm2, " #out0 "\n"                  \
643
+    "psubb %%mm3, " #out1 "\n"                  \
644
+    "pxor %%mm7, " #out0 "\n"                   \
645
+    "pxor %%mm7, " #out1 "\n"                   \
646
+    "psadbw " #out0 ", " #in0 "\n"              \
647
+    "psadbw " #out1 ", " #in1 "\n"              \
648
+    "paddw " #in1 ", " #in0 "\n"                \
649
+    "paddw " #in0 ", %%mm6\n    "
650
+
651
+    __asm__ volatile (
652
+        "movl %4, %%ecx\n"
653
+        "pxor %%mm6, %%mm6\n"
654
+        "pcmpeqw %%mm7, %%mm7\n"
655
+        "psllw $15, %%mm7\n"
656
+        "packsswb %%mm7, %%mm7\n"
657
+        "movq (%0), %%mm0\n"
658
+        "movq (%1), %%mm2\n"
659
+        "movq 8(%0), %%mm1\n"
660
+        "movq 8(%1), %%mm3\n"
661
+        "add %3, %0\n"
662
+        "add %3, %1\n"
663
+        "psubb %%mm2, %%mm0\n"
664
+        "psubb %%mm3, %%mm1\n"
665
+        "pxor %%mm7, %%mm0\n"
666
+        "pxor %%mm7, %%mm1\n"
667
+        "jmp 2f\n"
668
+        "1:\n"
669
+
670
+        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
671
+        "2:\n"
672
+        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
673
+
674
+        "subl $2, %%ecx\n"
675
+        "jnz 1b\n"
676
+
677
+        "movd %%mm6, %2\n"
678
+        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
679
+        : "r" ((x86_reg) line_size), "m" (h)
680
+        : "%ecx");
681
+
682
+    return tmp;
683
+}
684
+#undef SUM
685
+
686
+#define MMABS_MMX(a,z)                          \
687
+    "pxor "    #z ", " #z "             \n\t"   \
688
+    "pcmpgtw " #a ", " #z "             \n\t"   \
689
+    "pxor "    #z ", " #a "             \n\t"   \
690
+    "psubw "   #z ", " #a "             \n\t"
691
+
692
+#define MMABS_MMXEXT(a, z)                      \
693
+    "pxor "    #z ", " #z "             \n\t"   \
694
+    "psubw "   #a ", " #z "             \n\t"   \
695
+    "pmaxsw "  #z ", " #a "             \n\t"
696
+
697
+#define MMABS_SSSE3(a,z)                        \
698
+    "pabsw "   #a ", " #a "             \n\t"
699
+
700
+#define MMABS_SUM(a,z, sum)                     \
701
+    MMABS(a,z)                                  \
702
+    "paddusw " #a ", " #sum "           \n\t"
703
+
704
+/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
705
+ * up to about 100k on extreme inputs. But that's very unlikely to occur in
706
+ * natural video, and it's even more unlikely to not have any alternative
707
+ * mvs/modes with lower cost. */
708
+#define HSUM_MMX(a, t, dst)                     \
709
+    "movq    " #a ", " #t "             \n\t"   \
710
+    "psrlq      $32, " #a "             \n\t"   \
711
+    "paddusw " #t ", " #a "             \n\t"   \
712
+    "movq    " #a ", " #t "             \n\t"   \
713
+    "psrlq      $16, " #a "             \n\t"   \
714
+    "paddusw " #t ", " #a "             \n\t"   \
715
+    "movd    " #a ", " #dst "           \n\t"   \
716
+
717
+#define HSUM_MMXEXT(a, t, dst)                  \
718
+    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
719
+    "paddusw " #t ", " #a "             \n\t"   \
720
+    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
721
+    "paddusw " #t ", " #a "             \n\t"   \
722
+    "movd    " #a ", " #dst "           \n\t"   \
723
+
724
+#define HSUM_SSE2(a, t, dst)                    \
725
+    "movhlps " #a ", " #t "             \n\t"   \
726
+    "paddusw " #t ", " #a "             \n\t"   \
727
+    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
728
+    "paddusw " #t ", " #a "             \n\t"   \
729
+    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
730
+    "paddusw " #t ", " #a "             \n\t"   \
731
+    "movd    " #a ", " #dst "           \n\t"   \
732
+
733
+#define DCT_SAD4(m, mm, o)                      \
734
+    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
735
+    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
736
+    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
737
+    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
738
+    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
739
+    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
740
+    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
741
+    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
742
+
743
+#define DCT_SAD_MMX                             \
744
+    "pxor    %%mm0, %%mm0               \n\t"   \
745
+    "pxor    %%mm1, %%mm1               \n\t"   \
746
+    DCT_SAD4(q, %%mm, 0)                        \
747
+    DCT_SAD4(q, %%mm, 8)                        \
748
+    DCT_SAD4(q, %%mm, 64)                       \
749
+    DCT_SAD4(q, %%mm, 72)                       \
750
+    "paddusw %%mm1, %%mm0               \n\t"   \
751
+    HSUM(%%mm0, %%mm1, %0)
752
+
753
+#define DCT_SAD_SSE2                            \
754
+    "pxor    %%xmm0, %%xmm0             \n\t"   \
755
+    "pxor    %%xmm1, %%xmm1             \n\t"   \
756
+    DCT_SAD4(dqa, %%xmm, 0)                     \
757
+    DCT_SAD4(dqa, %%xmm, 64)                    \
758
+    "paddusw %%xmm1, %%xmm0             \n\t"   \
759
+    HSUM(%%xmm0, %%xmm1, %0)
760
+
761
+#define DCT_SAD_FUNC(cpu)                           \
762
+static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
763
+{                                                   \
764
+    int sum;                                        \
765
+    __asm__ volatile (                              \
766
+        DCT_SAD                                     \
767
+        :"=r"(sum)                                  \
768
+        :"r"(block));                               \
769
+    return sum & 0xFFFF;                            \
770
+}
771
+
772
+#define DCT_SAD         DCT_SAD_MMX
773
+#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
774
+#define MMABS(a, z)     MMABS_MMX(a, z)
775
+DCT_SAD_FUNC(mmx)
776
+#undef MMABS
777
+#undef HSUM
778
+
779
+#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
780
+#define MMABS(a, z)     MMABS_MMXEXT(a, z)
781
+DCT_SAD_FUNC(mmxext)
782
+#undef HSUM
783
+#undef DCT_SAD
784
+
785
+#define DCT_SAD         DCT_SAD_SSE2
786
+#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
787
+DCT_SAD_FUNC(sse2)
788
+#undef MMABS
789
+
790
+#if HAVE_SSSE3_INLINE
791
+#define MMABS(a, z)     MMABS_SSSE3(a, z)
792
+DCT_SAD_FUNC(ssse3)
793
+#undef MMABS
794
+#endif
795
+#undef HSUM
796
+#undef DCT_SAD
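The sum_abs_dctelem_* functions generated above return the L1 norm of the 64 transform coefficients; because the SIMD accumulation uses paddusw, the result effectively clips at 0xFFFF. A scalar reference (name is illustrative; clipping modelled with FFMIN):

    /* Scalar model of sum_abs_dctelem_*: sum of |coefficient| over a block. */
    static int sum_abs_dctelem_ref(const int16_t *block)
    {
        int sum = 0;
        for (int i = 0; i < 64; i++)
            sum += FFABS(block[i]);
        return FFMIN(sum, 0xFFFF);
    }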
797
+
798
+
799
+DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
800
+    0x0000000000000000ULL,
801
+    0x0001000100010001ULL,
802
+    0x0002000200020002ULL,
803
+};
804
+
805
+DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
806
+
807
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
808
+{
809
+    x86_reg len = -(stride * h);
810
+    __asm__ volatile (
811
+        ".p2align 4                     \n\t"
812
+        "1:                             \n\t"
813
+        "movq (%1, %%"REG_a"), %%mm0    \n\t"
814
+        "movq (%2, %%"REG_a"), %%mm2    \n\t"
815
+        "movq (%2, %%"REG_a"), %%mm4    \n\t"
816
+        "add %3, %%"REG_a"              \n\t"
817
+        "psubusb %%mm0, %%mm2           \n\t"
818
+        "psubusb %%mm4, %%mm0           \n\t"
819
+        "movq (%1, %%"REG_a"), %%mm1    \n\t"
820
+        "movq (%2, %%"REG_a"), %%mm3    \n\t"
821
+        "movq (%2, %%"REG_a"), %%mm5    \n\t"
822
+        "psubusb %%mm1, %%mm3           \n\t"
823
+        "psubusb %%mm5, %%mm1           \n\t"
824
+        "por %%mm2, %%mm0               \n\t"
825
+        "por %%mm1, %%mm3               \n\t"
826
+        "movq %%mm0, %%mm1              \n\t"
827
+        "movq %%mm3, %%mm2              \n\t"
828
+        "punpcklbw %%mm7, %%mm0         \n\t"
829
+        "punpckhbw %%mm7, %%mm1         \n\t"
830
+        "punpcklbw %%mm7, %%mm3         \n\t"
831
+        "punpckhbw %%mm7, %%mm2         \n\t"
832
+        "paddw %%mm1, %%mm0             \n\t"
833
+        "paddw %%mm3, %%mm2             \n\t"
834
+        "paddw %%mm2, %%mm0             \n\t"
835
+        "paddw %%mm0, %%mm6             \n\t"
836
+        "add %3, %%"REG_a"              \n\t"
837
+        " js 1b                         \n\t"
838
+        : "+a" (len)
839
+        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
840
+}
841
+
842
+static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
843
+                                 int stride, int h)
844
+{
845
+    __asm__ volatile (
846
+        ".p2align 4                     \n\t"
847
+        "1:                             \n\t"
848
+        "movq (%1), %%mm0               \n\t"
849
+        "movq (%1, %3), %%mm1           \n\t"
850
+        "psadbw (%2), %%mm0             \n\t"
851
+        "psadbw (%2, %3), %%mm1         \n\t"
852
+        "paddw %%mm0, %%mm6             \n\t"
853
+        "paddw %%mm1, %%mm6             \n\t"
854
+        "lea (%1,%3,2), %1              \n\t"
855
+        "lea (%2,%3,2), %2              \n\t"
856
+        "sub $2, %0                     \n\t"
857
+        " jg 1b                         \n\t"
858
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
859
+        : "r" ((x86_reg) stride));
860
+}
861
+
862
+static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
863
+                      int stride, int h)
864
+{
865
+    int ret;
866
+    __asm__ volatile (
867
+        "pxor %%xmm2, %%xmm2            \n\t"
868
+        ".p2align 4                     \n\t"
869
+        "1:                             \n\t"
870
+        "movdqu (%1), %%xmm0            \n\t"
871
+        "movdqu (%1, %4), %%xmm1        \n\t"
872
+        "psadbw (%2), %%xmm0            \n\t"
873
+        "psadbw (%2, %4), %%xmm1        \n\t"
874
+        "paddw %%xmm0, %%xmm2           \n\t"
875
+        "paddw %%xmm1, %%xmm2           \n\t"
876
+        "lea (%1,%4,2), %1              \n\t"
877
+        "lea (%2,%4,2), %2              \n\t"
878
+        "sub $2, %0                     \n\t"
879
+        " jg 1b                         \n\t"
880
+        "movhlps %%xmm2, %%xmm0         \n\t"
881
+        "paddw   %%xmm0, %%xmm2         \n\t"
882
+        "movd    %%xmm2, %3             \n\t"
883
+        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
884
+        : "r" ((x86_reg) stride));
885
+    return ret;
886
+}
887
+
888
+static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
889
+                                   int stride, int h)
890
+{
891
+    __asm__ volatile (
892
+        ".p2align 4                     \n\t"
893
+        "1:                             \n\t"
894
+        "movq (%1), %%mm0               \n\t"
895
+        "movq (%1, %3), %%mm1           \n\t"
896
+        "pavgb 1(%1), %%mm0             \n\t"
897
+        "pavgb 1(%1, %3), %%mm1         \n\t"
898
+        "psadbw (%2), %%mm0             \n\t"
899
+        "psadbw (%2, %3), %%mm1         \n\t"
900
+        "paddw %%mm0, %%mm6             \n\t"
901
+        "paddw %%mm1, %%mm6             \n\t"
902
+        "lea (%1,%3,2), %1              \n\t"
903
+        "lea (%2,%3,2), %2              \n\t"
904
+        "sub $2, %0                     \n\t"
905
+        " jg 1b                         \n\t"
906
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
907
+        : "r" ((x86_reg) stride));
908
+}
909
+
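sad8_x2a_mmxext scores a horizontally half-pel interpolated candidate: each source pixel is replaced by the rounded average of itself and its right neighbour (pavgb 1(%1)) before the psadbw against the reference; the y2 variant below does the same vertically. A scalar sketch of the x2 case (sad8_x2_c_sketch is a made-up name; replacing the +1 offset with +stride gives the y2 case):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the horizontal half-pel SAD: the candidate pixel is
 * the rounded average of blk1[x] and blk1[x + 1]. */
static int sad8_x2_c_sketch(const uint8_t *blk1, const uint8_t *blk2,
                            int stride, int h)
{
    int x, y, sum = 0;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            int avg = (blk1[x] + blk1[x + 1] + 1) >> 1;
            sum += abs(avg - blk2[x]);
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}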
910
+static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
911
+                                   int stride, int h)
912
+{
913
+    __asm__ volatile (
914
+        "movq (%1), %%mm0               \n\t"
915
+        "add %3, %1                     \n\t"
916
+        ".p2align 4                     \n\t"
917
+        "1:                             \n\t"
918
+        "movq (%1), %%mm1               \n\t"
919
+        "movq (%1, %3), %%mm2           \n\t"
920
+        "pavgb %%mm1, %%mm0             \n\t"
921
+        "pavgb %%mm2, %%mm1             \n\t"
922
+        "psadbw (%2), %%mm0             \n\t"
923
+        "psadbw (%2, %3), %%mm1         \n\t"
924
+        "paddw %%mm0, %%mm6             \n\t"
925
+        "paddw %%mm1, %%mm6             \n\t"
926
+        "movq %%mm2, %%mm0              \n\t"
927
+        "lea (%1,%3,2), %1              \n\t"
928
+        "lea (%2,%3,2), %2              \n\t"
929
+        "sub $2, %0                     \n\t"
930
+        " jg 1b                         \n\t"
931
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
932
+        : "r" ((x86_reg) stride));
933
+}
934
+
935
+static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
936
+                                 int stride, int h)
937
+{
938
+    __asm__ volatile (
939
+        "movq "MANGLE(bone)", %%mm5     \n\t"
940
+        "movq (%1), %%mm0               \n\t"
941
+        "pavgb 1(%1), %%mm0             \n\t"
942
+        "add %3, %1                     \n\t"
943
+        ".p2align 4                     \n\t"
944
+        "1:                             \n\t"
945
+        "movq (%1), %%mm1               \n\t"
946
+        "movq (%1,%3), %%mm2            \n\t"
947
+        "pavgb 1(%1), %%mm1             \n\t"
948
+        "pavgb 1(%1,%3), %%mm2          \n\t"
949
+        "psubusb %%mm5, %%mm1           \n\t"
950
+        "pavgb %%mm1, %%mm0             \n\t"
951
+        "pavgb %%mm2, %%mm1             \n\t"
952
+        "psadbw (%2), %%mm0             \n\t"
953
+        "psadbw (%2,%3), %%mm1          \n\t"
954
+        "paddw %%mm0, %%mm6             \n\t"
955
+        "paddw %%mm1, %%mm6             \n\t"
956
+        "movq %%mm2, %%mm0              \n\t"
957
+        "lea (%1,%3,2), %1              \n\t"
958
+        "lea (%2,%3,2), %2              \n\t"
959
+        "sub $2, %0                     \n\t"
960
+        " jg 1b                         \n\t"
961
+        : "+r" (h), "+r" (blk1), "+r" (blk2)
962
+        : "r" ((x86_reg) stride));
963
+}
964
+
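The xy2 case averages a 2x2 neighbourhood (current pixel, right, below, below-right) with (sum + 2) >> 2 rounding; sad8_4_mmxext approximates that by chaining pavgb and subtracting bone, which is presumably why the init code below only installs the mmxext half-pel variants when CODEC_FLAG_BITEXACT is unset. A scalar sketch of the exact rounding used by the MMX path (sad8_xy2_c_sketch is a made-up name):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the two-dimensional half-pel SAD: the candidate
 * pixel is the rounded average of a 2x2 neighbourhood of blk1. */
static int sad8_xy2_c_sketch(const uint8_t *blk1, const uint8_t *blk2,
                             int stride, int h)
{
    int x, y, sum = 0;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            int avg = (blk1[x] + blk1[x + 1] +
                       blk1[x + stride] + blk1[x + stride + 1] + 2) >> 2;
            sum += abs(avg - blk2[x]);
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}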
965
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
966
+                              int stride, int h)
967
+{
968
+    x86_reg len = -(stride * h);
969
+    __asm__ volatile (
970
+        ".p2align 4                     \n\t"
971
+        "1:                             \n\t"
972
+        "movq (%1, %%"REG_a"), %%mm0    \n\t"
973
+        "movq (%2, %%"REG_a"), %%mm1    \n\t"
974
+        "movq (%1, %%"REG_a"), %%mm2    \n\t"
975
+        "movq (%2, %%"REG_a"), %%mm3    \n\t"
976
+        "punpcklbw %%mm7, %%mm0         \n\t"
977
+        "punpcklbw %%mm7, %%mm1         \n\t"
978
+        "punpckhbw %%mm7, %%mm2         \n\t"
979
+        "punpckhbw %%mm7, %%mm3         \n\t"
980
+        "paddw %%mm0, %%mm1             \n\t"
981
+        "paddw %%mm2, %%mm3             \n\t"
982
+        "movq (%3, %%"REG_a"), %%mm4    \n\t"
983
+        "movq (%3, %%"REG_a"), %%mm2    \n\t"
984
+        "paddw %%mm5, %%mm1             \n\t"
985
+        "paddw %%mm5, %%mm3             \n\t"
986
+        "psrlw $1, %%mm1                \n\t"
987
+        "psrlw $1, %%mm3                \n\t"
988
+        "packuswb %%mm3, %%mm1          \n\t"
989
+        "psubusb %%mm1, %%mm4           \n\t"
990
+        "psubusb %%mm2, %%mm1           \n\t"
991
+        "por %%mm4, %%mm1               \n\t"
992
+        "movq %%mm1, %%mm0              \n\t"
993
+        "punpcklbw %%mm7, %%mm0         \n\t"
994
+        "punpckhbw %%mm7, %%mm1         \n\t"
995
+        "paddw %%mm1, %%mm0             \n\t"
996
+        "paddw %%mm0, %%mm6             \n\t"
997
+        "add %4, %%"REG_a"              \n\t"
998
+        " js 1b                         \n\t"
999
+        : "+a" (len)
1000
+        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
1001
+          "r" ((x86_reg) stride));
1002
+}
1003
+
1004
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1005
+{
1006
+    x86_reg len = -(stride * h);
1007
+    __asm__ volatile (
1008
+        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
1009
+        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
1010
+        "movq %%mm0, %%mm1              \n\t"
1011
+        "movq %%mm2, %%mm3              \n\t"
1012
+        "punpcklbw %%mm7, %%mm0         \n\t"
1013
+        "punpckhbw %%mm7, %%mm1         \n\t"
1014
+        "punpcklbw %%mm7, %%mm2         \n\t"
1015
+        "punpckhbw %%mm7, %%mm3         \n\t"
1016
+        "paddw %%mm2, %%mm0             \n\t"
1017
+        "paddw %%mm3, %%mm1             \n\t"
1018
+        ".p2align 4                     \n\t"
1019
+        "1:                             \n\t"
1020
+        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
1021
+        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
1022
+        "movq %%mm2, %%mm3              \n\t"
1023
+        "movq %%mm4, %%mm5              \n\t"
1024
+        "punpcklbw %%mm7, %%mm2         \n\t"
1025
+        "punpckhbw %%mm7, %%mm3         \n\t"
1026
+        "punpcklbw %%mm7, %%mm4         \n\t"
1027
+        "punpckhbw %%mm7, %%mm5         \n\t"
1028
+        "paddw %%mm4, %%mm2             \n\t"
1029
+        "paddw %%mm5, %%mm3             \n\t"
1030
+        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
1031
+        "paddw %%mm2, %%mm0             \n\t"
1032
+        "paddw %%mm3, %%mm1             \n\t"
1033
+        "paddw %%mm5, %%mm0             \n\t"
1034
+        "paddw %%mm5, %%mm1             \n\t"
1035
+        "movq (%3, %%"REG_a"), %%mm4    \n\t"
1036
+        "movq (%3, %%"REG_a"), %%mm5    \n\t"
1037
+        "psrlw $2, %%mm0                \n\t"
1038
+        "psrlw $2, %%mm1                \n\t"
1039
+        "packuswb %%mm1, %%mm0          \n\t"
1040
+        "psubusb %%mm0, %%mm4           \n\t"
1041
+        "psubusb %%mm5, %%mm0           \n\t"
1042
+        "por %%mm4, %%mm0               \n\t"
1043
+        "movq %%mm0, %%mm4              \n\t"
1044
+        "punpcklbw %%mm7, %%mm0         \n\t"
1045
+        "punpckhbw %%mm7, %%mm4         \n\t"
1046
+        "paddw %%mm0, %%mm6             \n\t"
1047
+        "paddw %%mm4, %%mm6             \n\t"
1048
+        "movq  %%mm2, %%mm0             \n\t"
1049
+        "movq  %%mm3, %%mm1             \n\t"
1050
+        "add %4, %%"REG_a"              \n\t"
1051
+        " js 1b                         \n\t"
1052
+        : "+a" (len)
1053
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
1054
+          "r" ((x86_reg) stride));
1055
+}
1056
+
1057
+static inline int sum_mmx(void)
1058
+{
1059
+    int ret;
1060
+    __asm__ volatile (
1061
+        "movq %%mm6, %%mm0              \n\t"
1062
+        "psrlq $32, %%mm6               \n\t"
1063
+        "paddw %%mm0, %%mm6             \n\t"
1064
+        "movq %%mm6, %%mm0              \n\t"
1065
+        "psrlq $16, %%mm6               \n\t"
1066
+        "paddw %%mm0, %%mm6             \n\t"
1067
+        "movd %%mm6, %0                 \n\t"
1068
+        : "=r" (ret));
1069
+    return ret & 0xFFFF;
1070
+}
1071
+
1072
+static inline int sum_mmxext(void)
1073
+{
1074
+    int ret;
1075
+    __asm__ volatile (
1076
+        "movd %%mm6, %0                 \n\t"
1077
+        : "=r" (ret));
1078
+    return ret;
1079
+}
1080
+
1081
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1082
+{
1083
+    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
1084
+}
1085
+
1086
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
1087
+{
1088
+    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
1089
+}
1090
+
1091
+#define PIX_SAD(suf)                                                    \
1092
+static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
1093
+                        uint8_t *blk1, int stride, int h)               \
1094
+{                                                                       \
1095
+    assert(h == 8);                                                     \
1096
+    __asm__ volatile (                                                  \
1097
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1098
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1099
+        :);                                                             \
1100
+                                                                        \
1101
+    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
1102
+                                                                        \
1103
+    return sum_ ## suf();                                               \
1104
+}                                                                       \
1105
+                                                                        \
1106
+static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
1107
+                           uint8_t *blk1, int stride, int h)            \
1108
+{                                                                       \
1109
+    assert(h == 8);                                                     \
1110
+    __asm__ volatile (                                                  \
1111
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1112
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1113
+        "movq %0, %%mm5        \n\t"                                    \
1114
+        :: "m" (round_tab[1]));                                         \
1115
+                                                                        \
1116
+    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
1117
+                                                                        \
1118
+    return sum_ ## suf();                                               \
1119
+}                                                                       \
1120
+                                                                        \
1121
+static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
1122
+                           uint8_t *blk1, int stride, int h)            \
1123
+{                                                                       \
1124
+    assert(h == 8);                                                     \
1125
+    __asm__ volatile (                                                  \
1126
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1127
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1128
+        "movq %0, %%mm5        \n\t"                                    \
1129
+        :: "m" (round_tab[1]));                                         \
1130
+                                                                        \
1131
+    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
1132
+                                                                        \
1133
+    return sum_ ## suf();                                               \
1134
+}                                                                       \
1135
+                                                                        \
1136
+static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1137
+                            uint8_t *blk1, int stride, int h)           \
1138
+{                                                                       \
1139
+    assert(h == 8);                                                     \
1140
+    __asm__ volatile (                                                  \
1141
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1142
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1143
+        ::);                                                            \
1144
+                                                                        \
1145
+    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
1146
+                                                                        \
1147
+    return sum_ ## suf();                                               \
1148
+}                                                                       \
1149
+                                                                        \
1150
+static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
1151
+                         uint8_t *blk1, int stride, int h)              \
1152
+{                                                                       \
1153
+    __asm__ volatile (                                                  \
1154
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1155
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1156
+        :);                                                             \
1157
+                                                                        \
1158
+    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
1159
+    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
1160
+                                                                        \
1161
+    return sum_ ## suf();                                               \
1162
+}                                                                       \
1163
+                                                                        \
1164
+static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1165
+                            uint8_t *blk1, int stride, int h)           \
1166
+{                                                                       \
1167
+    __asm__ volatile (                                                  \
1168
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1169
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1170
+        "movq %0, %%mm5        \n\t"                                    \
1171
+        :: "m" (round_tab[1]));                                         \
1172
+                                                                        \
1173
+    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
1174
+    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
1175
+                                                                        \
1176
+    return sum_ ## suf();                                               \
1177
+}                                                                       \
1178
+                                                                        \
1179
+static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
1180
+                            uint8_t *blk1, int stride, int h)           \
1181
+{                                                                       \
1182
+    __asm__ volatile (                                                  \
1183
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1184
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1185
+        "movq %0, %%mm5        \n\t"                                    \
1186
+        :: "m" (round_tab[1]));                                         \
1187
+                                                                        \
1188
+    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
1189
+    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
1190
+                                                                        \
1191
+    return sum_ ## suf();                                               \
1192
+}                                                                       \
1193
+                                                                        \
1194
+static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
1195
+                             uint8_t *blk1, int stride, int h)          \
1196
+{                                                                       \
1197
+    __asm__ volatile (                                                  \
1198
+        "pxor %%mm7, %%mm7     \n\t"                                    \
1199
+        "pxor %%mm6, %%mm6     \n\t"                                    \
1200
+        ::);                                                            \
1201
+                                                                        \
1202
+    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
1203
+    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
1204
+                                                                        \
1205
+    return sum_ ## suf();                                               \
1206
+}                                                                       \
1207
+
1208
+PIX_SAD(mmx)
1209
+PIX_SAD(mmxext)
1210
+
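PIX_SAD(mmx) and PIX_SAD(mmxext) expand into the eight comparison entry points that the init code below installs into pix_abs: the first index selects the block width (0 = 16 pixels, 1 = 8 pixels), the second the source interpolation (0 = full-pel, 1 = horizontal half-pel, 2 = vertical half-pel, 3 = both). A hedged sketch of how a caller might index that table (candidate_cost is invented here; the include paths and the me_cmp_func typedef are assumed to match the new me_cmp.h header):

#include <stdint.h>
#include "me_cmp.h"       /* MECmpContext, me_cmp_func (assumed) */
#include "mpegvideo.h"    /* MpegEncContext */

/* Hypothetical caller: pick the 16x16 SAD variant matching the half-pel
 * fraction of a motion vector and score one candidate block. */
static int candidate_cost(MECmpContext *c, MpegEncContext *s,
                          uint8_t *cur, uint8_t *ref, int stride,
                          int hpel_x, int hpel_y)
{
    me_cmp_func cmp = c->pix_abs[0][(hpel_y << 1) | hpel_x];
    return cmp(s, cur, ref, stride, 16);
}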
1211
+#endif /* HAVE_INLINE_ASM */
1212
+
1213
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1214
+                  int line_size, int h);
1215
+
1216
+#define hadamard_func(cpu)                                              \
1217
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
1218
+                                  uint8_t *src2, int stride, int h);    \
1219
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
1220
+                                    uint8_t *src2, int stride, int h);
1221
+
1222
+hadamard_func(mmx)
1223
+hadamard_func(mmxext)
1224
+hadamard_func(sse2)
1225
+hadamard_func(ssse3)
1226
+
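The ff_hadamard8_diff* functions prototyped above are implemented in external (yasm) assembly, hence the EXTERNAL_* checks in the init code below. They score a block by the sum of absolute values of the 8x8 Hadamard (Walsh) transform of the source/reference difference, i.e. a SATD-style metric that tracks coding cost better than plain SAD. An illustrative scalar version of that idea (made-up names, and not claiming the exact coefficient ordering or DC handling of the assembly):

#include <stdint.h>
#include <stdlib.h>

/* One-dimensional 8-point Hadamard transform, in place (the sign convention
 * is irrelevant for a sum-of-absolute-values metric). */
static void hadamard8_1d_sketch(int *v)
{
    int i, j, tmp[8];

    for (i = 1; i < 8; i <<= 1) {
        for (j = 0; j < 8; j++)
            tmp[j] = (j & i) ? v[j ^ i] - v[j] : v[j] + v[j ^ i];
        for (j = 0; j < 8; j++)
            v[j] = tmp[j];
    }
}

/* SATD sketch: transform the difference block along rows and columns,
 * then sum the absolute transform coefficients. */
static int hadamard8_diff_sketch(const uint8_t *src, const uint8_t *ref,
                                 int stride)
{
    int x, y, sum = 0;
    int d[8][8];

    for (y = 0; y < 8; y++)
        for (x = 0; x < 8; x++)
            d[y][x] = src[y * stride + x] - ref[y * stride + x];

    for (y = 0; y < 8; y++)
        hadamard8_1d_sketch(d[y]);

    for (x = 0; x < 8; x++) {
        int col[8];
        for (y = 0; y < 8; y++)
            col[y] = d[y][x];
        hadamard8_1d_sketch(col);
        for (y = 0; y < 8; y++)
            sum += abs(col[y]);
    }
    return sum;
}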
1227
+av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
1228
+{
1229
+    int cpu_flags = av_get_cpu_flags();
1230
+
1231
+#if HAVE_INLINE_ASM
1232
+    if (INLINE_MMX(cpu_flags)) {
1233
+        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
1234
+
1235
+        c->pix_abs[0][0] = sad16_mmx;
1236
+        c->pix_abs[0][1] = sad16_x2_mmx;
1237
+        c->pix_abs[0][2] = sad16_y2_mmx;
1238
+        c->pix_abs[0][3] = sad16_xy2_mmx;
1239
+        c->pix_abs[1][0] = sad8_mmx;
1240
+        c->pix_abs[1][1] = sad8_x2_mmx;
1241
+        c->pix_abs[1][2] = sad8_y2_mmx;
1242
+        c->pix_abs[1][3] = sad8_xy2_mmx;
1243
+
1244
+        c->sad[0] = sad16_mmx;
1245
+        c->sad[1] = sad8_mmx;
1246
+
1247
+        c->sse[0]  = sse16_mmx;
1248
+        c->sse[1]  = sse8_mmx;
1249
+        c->vsad[4] = vsad_intra16_mmx;
1250
+
1251
+        c->nsse[0] = nsse16_mmx;
1252
+        c->nsse[1] = nsse8_mmx;
1253
+
1254
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1255
+            c->vsad[0] = vsad16_mmx;
1256
+        }
1257
+    }
1258
+
1259
+    if (INLINE_MMXEXT(cpu_flags)) {
1260
+        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
1261
+
1262
+        c->vsad[4] = vsad_intra16_mmxext;
1263
+
1264
+        c->pix_abs[0][0] = sad16_mmxext;
1265
+        c->pix_abs[1][0] = sad8_mmxext;
1266
+
1267
+        c->sad[0] = sad16_mmxext;
1268
+        c->sad[1] = sad8_mmxext;
1269
+
1270
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1271
+            c->pix_abs[0][1] = sad16_x2_mmxext;
1272
+            c->pix_abs[0][2] = sad16_y2_mmxext;
1273
+            c->pix_abs[0][3] = sad16_xy2_mmxext;
1274
+            c->pix_abs[1][1] = sad8_x2_mmxext;
1275
+            c->pix_abs[1][2] = sad8_y2_mmxext;
1276
+            c->pix_abs[1][3] = sad8_xy2_mmxext;
1277
+
1278
+            c->vsad[0] = vsad16_mmxext;
1279
+        }
1280
+    }
1281
+
1282
+    if (INLINE_SSE2(cpu_flags)) {
1283
+        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
1284
+    }
1285
+
1286
+    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
1287
+        c->sad[0] = sad16_sse2;
1288
+    }
1289
+
1290
+#if HAVE_SSSE3_INLINE
1291
+    if (INLINE_SSSE3(cpu_flags)) {
1292
+        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
1293
+    }
1294
+#endif
1295
+#endif /* HAVE_INLINE_ASM */
1296
+
1297
+    if (EXTERNAL_MMX(cpu_flags)) {
1298
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
1299
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
1300
+    }
1301
+
1302
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
1303
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
1304
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
1305
+    }
1306
+
1307
+    if (EXTERNAL_SSE2(cpu_flags)) {
1308
+        c->sse[0] = ff_sse16_sse2;
1309
+
1310
+#if HAVE_ALIGNED_STACK
1311
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
1312
+        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
1313
+#endif
1314
+    }
1315
+
1316
+    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
1317
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
1318
+        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
1319
+    }
1320
+}
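ff_me_cmp_init_x86() only overrides the pointers that the detected CPU flags allow; everything else keeps the C fallbacks installed by the generic initialiser, which, going by the naming, is presumably ff_me_cmp_init() in me_cmp.c and is what codecs call directly. A hedged usage sketch (score_block and the include paths are invented for illustration):

#include <stdint.h>
#include "avcodec.h"
#include "me_cmp.h"       /* MECmpContext (new header from this split, assumed) */
#include "mpegvideo.h"    /* MpegEncContext */

/* Hypothetical helper: initialise the comparison context once, then score a
 * 16x16 block with whichever SAD implementation the dispatch selected. */
static int score_block(AVCodecContext *avctx, MpegEncContext *s,
                       uint8_t *cur, uint8_t *ref, int stride)
{
    MECmpContext c;

    ff_me_cmp_init(&c, avctx);   /* assumed generic entry point; on x86 it
                                  * would in turn call ff_me_cmp_init_x86() */
    return c.sad[0](s, cur, ref, stride, 16);
}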