
Remove all SPARC architecture optimizations

SPARC is no longer being used in any multimedia-related fields and the
VIS optimizations only represent a maintenance burden.

Diego Biurrun authored on 2014/01/07 19:00:46
Showing 20 changed files
@@ -115,8 +115,7 @@ config.h: .config
 SUBDIR_VARS := CLEANFILES EXAMPLES FFLIBS HOSTPROGS TESTPROGS TOOLS      \
                HEADERS ARCH_HEADERS BUILT_HEADERS SKIPHEADERS            \
                ARMV5TE-OBJS ARMV6-OBJS VFP-OBJS NEON-OBJS                \
-               ALTIVEC-OBJS VIS-OBJS                                     \
-               MMX-OBJS YASM-OBJS                                        \
+               ALTIVEC-OBJS MMX-OBJS YASM-OBJS                           \
                OBJS HOSTOBJS TESTOBJS

 define RESET
@@ -5,7 +5,5 @@ OBJS-$(HAVE_NEON)    += $(NEON-OBJS)    $(NEON-OBJS-yes)

 OBJS-$(HAVE_ALTIVEC) += $(ALTIVEC-OBJS) $(ALTIVEC-OBJS-yes)

-OBJS-$(HAVE_VIS)     += $(VIS-OBJS)     $(VIS-OBJS-yes)
-
 OBJS-$(HAVE_MMX)     += $(MMX-OBJS)     $(MMX-OBJS-yes)
 OBJS-$(HAVE_YASM)    += $(YASM-OBJS)    $(YASM-OBJS-yes)
@@ -284,7 +284,6 @@ Optimization options (experts only):
   --disable-armv6t2        disable armv6t2 optimizations
   --disable-vfp            disable VFP optimizations
   --disable-neon           disable NEON optimizations
-  --disable-vis            disable VIS optimizations
   --disable-inline-asm     disable use of inline assembler
   --disable-yasm           disable use of yasm assembler

@@ -1294,7 +1293,6 @@ ARCH_EXT_LIST="
     $ARCH_EXT_LIST_X86
     altivec
     ppc4xx
-    vis
 "

 HAVE_LIST_CMDLINE="
@@ -1584,8 +1582,6 @@ map 'eval ${v}_inline_deps=inline_asm' $ARCH_EXT_LIST_ARM
 altivec_deps="ppc"
 ppc4xx_deps="ppc"

-vis_deps="sparc"
-
 cpunop_deps="i686"
 x86_64_select="i686"
 x86_64_suggest="fast_cmov"
@@ -3151,7 +3147,6 @@ elif enabled sparc; then
     case $cpu in
         cypress|f93[04]|tsc701|sparcl*|supersparc|hypersparc|niagara|v[789])
             cpuflags="-mcpu=$cpu"
-            disable vis
         ;;
         ultrasparc*|niagara[234])
             cpuflags="-mcpu=$cpu"
@@ -3740,10 +3735,6 @@ EOF
         enabled altivec || warn "Altivec disabled, possibly missing --cpu flag"
     fi

-elif enabled sparc; then
-
-    enabled vis && check_inline_asm vis '"pdist %f0, %f0, %f0"'
-
 elif enabled x86; then

     check_builtin rdtsc    intrin.h   "__rdtsc()"
@@ -4351,9 +4342,6 @@ if enabled ppc; then
     echo "PPC 4xx optimizations     ${ppc4xx-no}"
     echo "dcbzl available           ${dcbzl-no}"
 fi
-if enabled sparc; then
-    echo "VIS enabled               ${vis-no}"
-fi
 echo "debug symbols             ${debug-no}"
 echo "optimize for size         ${small-no}"
 echo "optimizations             ${optimizations-no}"
@@ -268,17 +268,6 @@ CELL/SPU:
 http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/30B3520C93F437AB87257060006FFE5E/$file/Language_Extensions_for_CBEA_2.4.pdf
 http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/9F820A5FFA3ECE8C8725716A0062585F/$file/CBE_Handbook_v1.1_24APR2007_pub.pdf

-SPARC-specific:
-SPARC Joint Programming Specification (JPS1): Commonality
-http://www.fujitsu.com/downloads/PRMPWR/JPS1-R1.0.4-Common-pub.pdf
-
-UltraSPARC III Processor User's Manual (contains instruction timings)
-http://www.sun.com/processors/manuals/USIIIv2.pdf
-
-VIS Whitepaper (contains optimization guidelines)
-http://www.sun.com/processors/vis/download/vis/vis_whitepaper.pdf
-
 GCC asm links:
 --------------
 official doc but quite ugly
@@ -2468,7 +2468,9 @@ typedef struct AVCodecContext {
 #define FF_IDCT_XVIDMMX       14
 #define FF_IDCT_SIMPLEARMV5TE 16
 #define FF_IDCT_SIMPLEARMV6   17
+#if FF_API_ARCH_SPARC
 #define FF_IDCT_SIMPLEVIS     18
+#endif
 #define FF_IDCT_FAAN          20
 #define FF_IDCT_SIMPLENEON    22
 #if FF_API_ARCH_ALPHA
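Note that FF_IDCT_SIMPLEVIS is not deleted outright but wrapped in FF_API_ARCH_SPARC, following the same deprecation pattern as FF_API_ARCH_ALPHA visible above: the constant stays available to API users until the next major version bump. A sketch of how such a guard is conventionally defined in libavcodec/version.h (the exact version bound here is an assumption, not taken from this commit):

/* libavcodec/version.h (sketch): the FF_API_* convention ties a
 * deprecated name to the current major version, so the guarded code
 * drops out automatically at the next ABI bump. */
#ifndef FF_API_ARCH_SPARC
#define FF_API_ARCH_SPARC (LIBAVCODEC_VERSION_MAJOR < 56)
#endif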
@@ -2653,8 +2653,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
         ff_dsputil_init_bfin(c, avctx);
     if (ARCH_PPC)
         ff_dsputil_init_ppc(c, avctx);
-    if (HAVE_VIS)
-        ff_dsputil_init_vis(c, avctx);
     if (ARCH_X86)
         ff_dsputil_init_x86(c, avctx);

@@ -300,7 +300,6 @@ void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
 void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
 void ff_dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
 void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
 void ff_dsputil_init_x86(DSPContext* c, AVCodecContext *avctx);

 #endif /* AVCODEC_DSPUTIL_H */
@@ -62,8 +62,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
         ff_hpeldsp_init_bfin(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
-    if (HAVE_VIS)
-        ff_hpeldsp_init_vis(c, flags);
     if (ARCH_X86)
         ff_hpeldsp_init_x86(c, flags);
 }
@@ -98,7 +98,6 @@ void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
-void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);

 #endif /* AVCODEC_HPELDSP_H */
deleted file mode 100644
@@ -1,4 +0,0 @@
-VIS-OBJS += sparc/dsputil_vis.o                                         \
-            sparc/simple_idct_vis.o                                     \
-
-VIS-OBJS-$(CONFIG_HPELDSP) += sparc/hpeldsp_vis.o
deleted file mode 100644
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2003 David S. Miller <davem@redhat.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_vis.h"
-#include "vis.h"
-
-av_cold void ff_dsputil_init_vis(DSPContext *c, AVCodecContext *avctx)
-{
-  /* VIS-specific optimizations */
-  int accel = vis_level ();
-  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-  if (accel & ACCEL_SPARC_VIS && !high_bit_depth) {
-      if (avctx->idct_algo == FF_IDCT_SIMPLEVIS) {
-          c->idct_put = ff_simple_idct_put_vis;
-          c->idct_add = ff_simple_idct_add_vis;
-          c->idct     = ff_simple_idct_vis;
-          c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
-      }
-  }
-}
deleted file mode 100644
@@ -1,28 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_SPARC_DSPUTIL_VIS_H
-#define AVCODEC_SPARC_DSPUTIL_VIS_H
-
-#include <stdint.h>
-
-void ff_simple_idct_put_vis(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_vis(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_vis(int16_t *data);
-
-#endif /* AVCODEC_SPARC_DSPUTIL_VIS_H */
deleted file mode 100644
@@ -1,3524 +0,0 @@
-/*
- * Copyright (C) 2003 David S. Miller <davem@redhat.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
-   The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/mem.h"
-#include "libavcodec/hpeldsp.h"
-#include "vis.h"
-
-/* The trick used in some of this file is the formula from the MMX
- * motion comp code, which is:
- *
- * (x+y+1)>>1 == (x|y)-((x^y)>>1)
- *
- * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
- * We avoid overflows by masking before we do the shift, and we
- * implement the shift by multiplying by 1/2 using mul8x16.  So in
- * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
- * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
- * the value 0x80808080 is in f8):
- *
- *      fxor            f0,   f2, f10
- *      fand            f10,  f4, f10
- *      fmul8x16        f8,  f10, f10
- *      fand            f10,  f6, f10
- *      for             f0,   f2, f12
- *      fpsub16         f12, f10, f10
- */
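The identity in the comment above can be sanity-checked in portable C; a minimal standalone sketch (not part of the deleted file; avg8_round is a hypothetical helper) that averages 8 bytes packed in a uint64_t, masking with 0xfe before the shift exactly as the fand/fmul8x16 pair does:

#include <assert.h>
#include <stdint.h>

/* Rounding byte average: (a+b+1)>>1 == (a|b) - ((a^b)>>1) per lane.
 * Clearing each lane's low bit with 0xfe before the shift keeps a
 * bit from leaking into the neighbouring byte, mirroring the VIS
 * sequence described in the comment above. */
static uint64_t avg8_round(uint64_t x, uint64_t y)
{
    return (x | y) - (((x ^ y) & 0xfefefefefefefefeULL) >> 1);
}

int main(void)
{
    /* spot-check one lane: (200 + 101 + 1) >> 1 == 151 */
    assert((avg8_round(200, 101) & 0xff) == (200 + 101 + 1) / 2);
    return 0;
}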
-
-#define DUP4(x) {x, x, x, x}
-#define DUP8(x) {x, x, x, x, x, x, x, x}
-DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1);
-DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2);
-DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3);
-DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6);
-DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe);
-DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f);
-DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128);
-DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] =
-        {256, 512, 256, 512};
-DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] =
-        {256, 1024, 256, 1024};
-
-#define REF_0           0
-#define REF_0_1         1
-#define REF_2           2
-#define REF_2_1         3
-#define REF_4           4
-#define REF_4_1         5
-#define REF_6           6
-#define REF_6_1         7
-#define REF_S0          8
-#define REF_S0_1        9
-#define REF_S2          10
-#define REF_S2_1        11
-#define REF_S4          12
-#define REF_S4_1        13
-#define REF_S6          14
-#define REF_S6_1        15
-#define DST_0           16
-#define DST_1           17
-#define DST_2           18
-#define DST_3           19
-#define CONST_1         20
-#define CONST_2         20
-#define CONST_3         20
-#define CONST_6         20
-#define MASK_fe         20
-#define CONST_128       22
-#define CONST_256       22
-#define CONST_512       22
-#define CONST_1024      22
-#define TMP0            24
-#define TMP1            25
-#define TMP2            26
-#define TMP3            27
-#define TMP4            28
-#define TMP5            29
-#define ZERO            30
-#define MASK_7f         30
-
-#define TMP6            32
-#define TMP8            34
-#define TMP10           36
-#define TMP12           38
-#define TMP14           40
-#define TMP16           42
-#define TMP18           44
-#define TMP20           46
-#define TMP22           48
-#define TMP24           50
-#define TMP26           52
-#define TMP28           54
-#define TMP30           56
-#define TMP32           58
-
-static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        ref = vis_alignaddr(ref);
-        do {    /* 5 cycles */
-                vis_ld64(ref[0], TMP0);
-
-                vis_ld64_2(ref, 8, TMP2);
-
-                vis_ld64_2(ref, 16, TMP4);
-                ref += stride;
-
-                vis_faligndata(TMP0, TMP2, REF_0);
-                vis_st64(REF_0, dest[0]);
-
-                vis_faligndata(TMP2, TMP4, REF_2);
-                vis_st64_2(REF_2, dest, 8);
-                dest += stride;
-        } while (--height);
-}
-
-static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        ref = vis_alignaddr(ref);
-        do {    /* 4 cycles */
-                vis_ld64(ref[0], TMP0);
-
-                vis_ld64(ref[8], TMP2);
-                ref += stride;
-
-                /* stall */
-
-                vis_faligndata(TMP0, TMP2, REF_0);
-                vis_st64(REF_0, dest[0]);
-                dest += stride;
-        } while (--height);
-}
-
-
-static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        int stride_8 = stride + 8;
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[0], TMP0);
-
-        vis_ld64(ref[8], TMP2);
-
-        vis_ld64(ref[16], TMP4);
-
-        vis_ld64(dest[0], DST_0);
-
-        vis_ld64(dest[8], DST_2);
-
-        vis_ld64(constants_fe[0], MASK_fe);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-        vis_faligndata(TMP2, TMP4, REF_2);
-
-        vis_ld64(constants128[0], CONST_128);
-
-        ref += stride;
-        height = (height >> 1) - 1;
-
-        do {    /* 24 cycles */
-                vis_ld64(ref[0], TMP0);
-                vis_xor(DST_0, REF_0, TMP6);
-
-                vis_ld64_2(ref, 8, TMP2);
-                vis_and(TMP6, MASK_fe, TMP6);
-
-                vis_ld64_2(ref, 16, TMP4);
-                ref += stride;
-                vis_mul8x16(CONST_128, TMP6, TMP6);
-                vis_xor(DST_2, REF_2, TMP8);
-
-                vis_and(TMP8, MASK_fe, TMP8);
-
-                vis_or(DST_0, REF_0, TMP10);
-                vis_ld64_2(dest, stride, DST_0);
-                vis_mul8x16(CONST_128, TMP8, TMP8);
-
-                vis_or(DST_2, REF_2, TMP12);
-                vis_ld64_2(dest, stride_8, DST_2);
-
-                vis_ld64(ref[0], TMP14);
-                vis_and(TMP6, MASK_7f, TMP6);
-
-                vis_and(TMP8, MASK_7f, TMP8);
-
-                vis_psub16(TMP10, TMP6, TMP6);
-                vis_st64(TMP6, dest[0]);
-
-                vis_psub16(TMP12, TMP8, TMP8);
-                vis_st64_2(TMP8, dest, 8);
-
-                dest += stride;
-                vis_ld64_2(ref, 8, TMP16);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64_2(ref, 16, TMP18);
-                vis_faligndata(TMP2, TMP4, REF_2);
-                ref += stride;
-
-                vis_xor(DST_0, REF_0, TMP20);
-
-                vis_and(TMP20, MASK_fe, TMP20);
-
-                vis_xor(DST_2, REF_2, TMP22);
-                vis_mul8x16(CONST_128, TMP20, TMP20);
-
-                vis_and(TMP22, MASK_fe, TMP22);
-
-                vis_or(DST_0, REF_0, TMP24);
-                vis_mul8x16(CONST_128, TMP22, TMP22);
-
-                vis_or(DST_2, REF_2, TMP26);
-
-                vis_ld64_2(dest, stride, DST_0);
-                vis_faligndata(TMP14, TMP16, REF_0);
-
-                vis_ld64_2(dest, stride_8, DST_2);
-                vis_faligndata(TMP16, TMP18, REF_2);
-
-                vis_and(TMP20, MASK_7f, TMP20);
-
-                vis_and(TMP22, MASK_7f, TMP22);
-
-                vis_psub16(TMP24, TMP20, TMP20);
-                vis_st64(TMP20, dest[0]);
-
-                vis_psub16(TMP26, TMP22, TMP22);
-                vis_st64_2(TMP22, dest, 8);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0], TMP0);
-        vis_xor(DST_0, REF_0, TMP6);
-
-        vis_ld64_2(ref, 8, TMP2);
-        vis_and(TMP6, MASK_fe, TMP6);
-
-        vis_ld64_2(ref, 16, TMP4);
-        vis_mul8x16(CONST_128, TMP6, TMP6);
-        vis_xor(DST_2, REF_2, TMP8);
-
-        vis_and(TMP8, MASK_fe, TMP8);
-
-        vis_or(DST_0, REF_0, TMP10);
-        vis_ld64_2(dest, stride, DST_0);
-        vis_mul8x16(CONST_128, TMP8, TMP8);
-
-        vis_or(DST_2, REF_2, TMP12);
-        vis_ld64_2(dest, stride_8, DST_2);
-
-        vis_ld64(ref[0], TMP14);
-        vis_and(TMP6, MASK_7f, TMP6);
-
-        vis_and(TMP8, MASK_7f, TMP8);
-
-        vis_psub16(TMP10, TMP6, TMP6);
-        vis_st64(TMP6, dest[0]);
-
-        vis_psub16(TMP12, TMP8, TMP8);
-        vis_st64_2(TMP8, dest, 8);
-
-        dest += stride;
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_faligndata(TMP2, TMP4, REF_2);
-
-        vis_xor(DST_0, REF_0, TMP20);
-
-        vis_and(TMP20, MASK_fe, TMP20);
-
-        vis_xor(DST_2, REF_2, TMP22);
-        vis_mul8x16(CONST_128, TMP20, TMP20);
-
-        vis_and(TMP22, MASK_fe, TMP22);
-
-        vis_or(DST_0, REF_0, TMP24);
-        vis_mul8x16(CONST_128, TMP22, TMP22);
-
-        vis_or(DST_2, REF_2, TMP26);
-
-        vis_and(TMP20, MASK_7f, TMP20);
-
-        vis_and(TMP22, MASK_7f, TMP22);
-
-        vis_psub16(TMP24, TMP20, TMP20);
-        vis_st64(TMP20, dest[0]);
-
-        vis_psub16(TMP26, TMP22, TMP22);
-        vis_st64_2(TMP22, dest, 8);
-}
-
-static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[0], TMP0);
-
-        vis_ld64(ref[8], TMP2);
-
-        vis_ld64(dest[0], DST_0);
-
-        vis_ld64(constants_fe[0], MASK_fe);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_ld64(constants128[0], CONST_128);
-
-        ref += stride;
-        height = (height >> 1) - 1;
-
-        do {    /* 12 cycles */
-                vis_ld64(ref[0], TMP0);
-                vis_xor(DST_0, REF_0, TMP4);
-
-                vis_ld64(ref[8], TMP2);
-                vis_and(TMP4, MASK_fe, TMP4);
-
-                vis_or(DST_0, REF_0, TMP6);
-                vis_ld64_2(dest, stride, DST_0);
-                ref += stride;
-                vis_mul8x16(CONST_128, TMP4, TMP4);
-
-                vis_ld64(ref[0], TMP12);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64(ref[8], TMP2);
-                vis_xor(DST_0, REF_0, TMP0);
-                ref += stride;
-
-                vis_and(TMP0, MASK_fe, TMP0);
-
-                vis_and(TMP4, MASK_7f, TMP4);
-
-                vis_psub16(TMP6, TMP4, TMP4);
-                vis_st64(TMP4, dest[0]);
-                dest += stride;
-                vis_mul8x16(CONST_128, TMP0, TMP0);
-
-                vis_or(DST_0, REF_0, TMP6);
-                vis_ld64_2(dest, stride, DST_0);
-
-                vis_faligndata(TMP12, TMP2, REF_0);
-
-                vis_and(TMP0, MASK_7f, TMP0);
-
-                vis_psub16(TMP6, TMP0, TMP4);
-                vis_st64(TMP4, dest[0]);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0], TMP0);
-        vis_xor(DST_0, REF_0, TMP4);
-
-        vis_ld64(ref[8], TMP2);
-        vis_and(TMP4, MASK_fe, TMP4);
-
-        vis_or(DST_0, REF_0, TMP6);
-        vis_ld64_2(dest, stride, DST_0);
-        vis_mul8x16(CONST_128, TMP4, TMP4);
-
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_xor(DST_0, REF_0, TMP0);
-
-        vis_and(TMP0, MASK_fe, TMP0);
-
-        vis_and(TMP4, MASK_7f, TMP4);
-
-        vis_psub16(TMP6, TMP4, TMP4);
-        vis_st64(TMP4, dest[0]);
-        dest += stride;
-        vis_mul8x16(CONST_128, TMP0, TMP0);
-
-        vis_or(DST_0, REF_0, TMP6);
-
-        vis_and(TMP0, MASK_7f, TMP0);
-
-        vis_psub16(TMP6, TMP0, TMP4);
-        vis_st64(TMP4, dest[0]);
-}
-
-static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        unsigned long off = (unsigned long) ref & 0x7;
-        unsigned long off_plus_1 = off + 1;
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[0],    TMP0);
-
-        vis_ld64_2(ref, 8,  TMP2);
-
-        vis_ld64_2(ref, 16, TMP4);
-
-        vis_ld64(constants_fe[0], MASK_fe);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_ld64(constants128[0], CONST_128);
-        vis_faligndata(TMP2, TMP4, REF_4);
-
-        if (off != 0x7) {
-                vis_alignaddr_g0((void *)off_plus_1);
-                vis_faligndata(TMP0, TMP2, REF_2);
-                vis_faligndata(TMP2, TMP4, REF_6);
-        } else {
-                vis_src1(TMP2, REF_2);
-                vis_src1(TMP4, REF_6);
-        }
-
-        ref += stride;
-        height = (height >> 1) - 1;
-
-        do {    /* 34 cycles */
-                vis_ld64(ref[0],    TMP0);
-                vis_xor(REF_0, REF_2, TMP6);
-
-                vis_ld64_2(ref, 8,  TMP2);
-                vis_xor(REF_4, REF_6, TMP8);
-
-                vis_ld64_2(ref, 16, TMP4);
-                vis_and(TMP6, MASK_fe, TMP6);
-                ref += stride;
-
-                vis_ld64(ref[0],    TMP14);
-                vis_mul8x16(CONST_128, TMP6, TMP6);
-                vis_and(TMP8, MASK_fe, TMP8);
-
-                vis_ld64_2(ref, 8,  TMP16);
-                vis_mul8x16(CONST_128, TMP8, TMP8);
-                vis_or(REF_0, REF_2, TMP10);
-
-                vis_ld64_2(ref, 16, TMP18);
-                ref += stride;
-                vis_or(REF_4, REF_6, TMP12);
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_faligndata(TMP2, TMP4, REF_4);
-
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-                        vis_faligndata(TMP0, TMP2, REF_2);
-                        vis_faligndata(TMP2, TMP4, REF_6);
-                } else {
-                        vis_src1(TMP2, REF_2);
-                        vis_src1(TMP4, REF_6);
-                }
-
-                vis_and(TMP6, MASK_7f, TMP6);
-
-                vis_and(TMP8, MASK_7f, TMP8);
-
-                vis_psub16(TMP10, TMP6, TMP6);
-                vis_st64(TMP6, dest[0]);
-
-                vis_psub16(TMP12, TMP8, TMP8);
-                vis_st64_2(TMP8, dest, 8);
-                dest += stride;
-
-                vis_xor(REF_0, REF_2, TMP6);
-
-                vis_xor(REF_4, REF_6, TMP8);
-
-                vis_and(TMP6, MASK_fe, TMP6);
-
-                vis_mul8x16(CONST_128, TMP6, TMP6);
-                vis_and(TMP8, MASK_fe, TMP8);
-
-                vis_mul8x16(CONST_128, TMP8, TMP8);
-                vis_or(REF_0, REF_2, TMP10);
-
-                vis_or(REF_4, REF_6, TMP12);
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_faligndata(TMP14, TMP16, REF_0);
-
-                vis_faligndata(TMP16, TMP18, REF_4);
-
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-                        vis_faligndata(TMP14, TMP16, REF_2);
-                        vis_faligndata(TMP16, TMP18, REF_6);
-                } else {
-                        vis_src1(TMP16, REF_2);
-                        vis_src1(TMP18, REF_6);
-                }
-
-                vis_and(TMP6, MASK_7f, TMP6);
-
-                vis_and(TMP8, MASK_7f, TMP8);
-
-                vis_psub16(TMP10, TMP6, TMP6);
-                vis_st64(TMP6, dest[0]);
-
-                vis_psub16(TMP12, TMP8, TMP8);
-                vis_st64_2(TMP8, dest, 8);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0],    TMP0);
-        vis_xor(REF_0, REF_2, TMP6);
-
-        vis_ld64_2(ref, 8,  TMP2);
-        vis_xor(REF_4, REF_6, TMP8);
-
-        vis_ld64_2(ref, 16, TMP4);
-        vis_and(TMP6, MASK_fe, TMP6);
-
-        vis_mul8x16(CONST_128, TMP6, TMP6);
-        vis_and(TMP8, MASK_fe, TMP8);
-
-        vis_mul8x16(CONST_128, TMP8, TMP8);
-        vis_or(REF_0, REF_2, TMP10);
-
-        vis_or(REF_4, REF_6, TMP12);
-
-        vis_alignaddr_g0((void *)off);
-
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_faligndata(TMP2, TMP4, REF_4);
-
-        if (off != 0x7) {
-                vis_alignaddr_g0((void *)off_plus_1);
-                vis_faligndata(TMP0, TMP2, REF_2);
-                vis_faligndata(TMP2, TMP4, REF_6);
-        } else {
-                vis_src1(TMP2, REF_2);
-                vis_src1(TMP4, REF_6);
-        }
-
-        vis_and(TMP6, MASK_7f, TMP6);
-
-        vis_and(TMP8, MASK_7f, TMP8);
-
-        vis_psub16(TMP10, TMP6, TMP6);
-        vis_st64(TMP6, dest[0]);
-
-        vis_psub16(TMP12, TMP8, TMP8);
-        vis_st64_2(TMP8, dest, 8);
-        dest += stride;
-
-        vis_xor(REF_0, REF_2, TMP6);
-
-        vis_xor(REF_4, REF_6, TMP8);
-
-        vis_and(TMP6, MASK_fe, TMP6);
-
-        vis_mul8x16(CONST_128, TMP6, TMP6);
-        vis_and(TMP8, MASK_fe, TMP8);
-
-        vis_mul8x16(CONST_128, TMP8, TMP8);
-        vis_or(REF_0, REF_2, TMP10);
-
-        vis_or(REF_4, REF_6, TMP12);
-
-        vis_and(TMP6, MASK_7f, TMP6);
-
-        vis_and(TMP8, MASK_7f, TMP8);
-
-        vis_psub16(TMP10, TMP6, TMP6);
-        vis_st64(TMP6, dest[0]);
-
-        vis_psub16(TMP12, TMP8, TMP8);
-        vis_st64_2(TMP8, dest, 8);
-}
-
-static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        unsigned long off = (unsigned long) ref & 0x7;
-        unsigned long off_plus_1 = off + 1;
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[0], TMP0);
-
-        vis_ld64(ref[8], TMP2);
-
-        vis_ld64(constants_fe[0], MASK_fe);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-
-        vis_ld64(constants128[0], CONST_128);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        if (off != 0x7) {
-                vis_alignaddr_g0((void *)off_plus_1);
-                vis_faligndata(TMP0, TMP2, REF_2);
-        } else {
-                vis_src1(TMP2, REF_2);
-        }
-
-        ref += stride;
-        height = (height >> 1) - 1;
-
-        do {    /* 20 cycles */
-                vis_ld64(ref[0], TMP0);
-                vis_xor(REF_0, REF_2, TMP4);
-
-                vis_ld64_2(ref, 8, TMP2);
-                vis_and(TMP4, MASK_fe, TMP4);
-                ref += stride;
-
-                vis_ld64(ref[0], TMP8);
-                vis_or(REF_0, REF_2, TMP6);
-                vis_mul8x16(CONST_128, TMP4, TMP4);
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_ld64_2(ref, 8, TMP10);
-                ref += stride;
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-                        vis_faligndata(TMP0, TMP2, REF_2);
-                } else {
-                        vis_src1(TMP2, REF_2);
-                }
-
-                vis_and(TMP4, MASK_7f, TMP4);
-
-                vis_psub16(TMP6, TMP4, DST_0);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-
-                vis_xor(REF_0, REF_2, TMP12);
-
-                vis_and(TMP12, MASK_fe, TMP12);
-
-                vis_or(REF_0, REF_2, TMP14);
-                vis_mul8x16(CONST_128, TMP12, TMP12);
-
-                vis_alignaddr_g0((void *)off);
-                vis_faligndata(TMP8, TMP10, REF_0);
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-                        vis_faligndata(TMP8, TMP10, REF_2);
-                } else {
-                        vis_src1(TMP10, REF_2);
-                }
-
-                vis_and(TMP12, MASK_7f, TMP12);
-
-                vis_psub16(TMP14, TMP12, DST_0);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0], TMP0);
-        vis_xor(REF_0, REF_2, TMP4);
-
-        vis_ld64_2(ref, 8, TMP2);
-        vis_and(TMP4, MASK_fe, TMP4);
-
-        vis_or(REF_0, REF_2, TMP6);
-        vis_mul8x16(CONST_128, TMP4, TMP4);
-
-        vis_alignaddr_g0((void *)off);
-
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        if (off != 0x7) {
-                vis_alignaddr_g0((void *)off_plus_1);
-                vis_faligndata(TMP0, TMP2, REF_2);
-        } else {
-                vis_src1(TMP2, REF_2);
-        }
-
-        vis_and(TMP4, MASK_7f, TMP4);
-
-        vis_psub16(TMP6, TMP4, DST_0);
-        vis_st64(DST_0, dest[0]);
-        dest += stride;
-
-        vis_xor(REF_0, REF_2, TMP12);
-
-        vis_and(TMP12, MASK_fe, TMP12);
-
-        vis_or(REF_0, REF_2, TMP14);
-        vis_mul8x16(CONST_128, TMP12, TMP12);
-
-        vis_and(TMP12, MASK_7f, TMP12);
-
-        vis_psub16(TMP14, TMP12, DST_0);
-        vis_st64(DST_0, dest[0]);
-        dest += stride;
-}
-
-static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        unsigned long off = (unsigned long) ref & 0x7;
-        unsigned long off_plus_1 = off + 1;
-
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
-
-        vis_ld64(constants3[0], CONST_3);
-        vis_fzero(ZERO);
-        vis_ld64(constants256_512[0], CONST_256);
-
-        ref = vis_alignaddr(ref);
-        do {    /* 26 cycles */
-                vis_ld64(ref[0], TMP0);
-
-                vis_ld64(ref[8], TMP2);
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_ld64(ref[16], TMP4);
-
-                vis_ld64(dest[0], DST_0);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64(dest[8], DST_2);
-                vis_faligndata(TMP2, TMP4, REF_4);
-
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-                        vis_faligndata(TMP0, TMP2, REF_2);
-                        vis_faligndata(TMP2, TMP4, REF_6);
-                } else {
-                        vis_src1(TMP2, REF_2);
-                        vis_src1(TMP4, REF_6);
-                }
-
-                vis_mul8x16au(REF_0,   CONST_256, TMP0);
-
-                vis_pmerge(ZERO,     REF_2,     TMP4);
-                vis_mul8x16au(REF_0_1, CONST_256, TMP2);
-
-                vis_pmerge(ZERO, REF_2_1, TMP6);
-
-                vis_padd16(TMP0, TMP4, TMP0);
-
-                vis_mul8x16al(DST_0,   CONST_512, TMP4);
-                vis_padd16(TMP2, TMP6, TMP2);
-
-                vis_mul8x16al(DST_1,   CONST_512, TMP6);
-
-                vis_mul8x16au(REF_6,   CONST_256, TMP12);
-
-                vis_padd16(TMP0, TMP4, TMP0);
-                vis_mul8x16au(REF_6_1, CONST_256, TMP14);
-
-                vis_padd16(TMP2, TMP6, TMP2);
-                vis_mul8x16au(REF_4,   CONST_256, TMP16);
-
-                vis_padd16(TMP0, CONST_3, TMP8);
-                vis_mul8x16au(REF_4_1, CONST_256, TMP18);
-
-                vis_padd16(TMP2, CONST_3, TMP10);
-                vis_pack16(TMP8, DST_0);
-
-                vis_pack16(TMP10, DST_1);
-                vis_padd16(TMP16, TMP12, TMP0);
-
-                vis_st64(DST_0, dest[0]);
-                vis_mul8x16al(DST_2,   CONST_512, TMP4);
-                vis_padd16(TMP18, TMP14, TMP2);
-
-                vis_mul8x16al(DST_3,   CONST_512, TMP6);
-                vis_padd16(TMP0, CONST_3, TMP0);
-
-                vis_padd16(TMP2, CONST_3, TMP2);
-
-                vis_padd16(TMP0, TMP4, TMP0);
-
-                vis_padd16(TMP2, TMP6, TMP2);
-                vis_pack16(TMP0, DST_2);
-
-                vis_pack16(TMP2, DST_3);
-                vis_st64(DST_2, dest[8]);
-
-                ref += stride;
-                dest += stride;
-        } while (--height);
-}
-
-static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        unsigned long off = (unsigned long) ref & 0x7;
-        unsigned long off_plus_1 = off + 1;
-        int stride_times_2 = stride << 1;
-
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
-
-        vis_ld64(constants3[0], CONST_3);
-        vis_fzero(ZERO);
-        vis_ld64(constants256_512[0], CONST_256);
-
-        ref = vis_alignaddr(ref);
-        height >>= 2;
-        do {    /* 47 cycles */
-                vis_ld64(ref[0],   TMP0);
-
-                vis_ld64_2(ref, 8, TMP2);
-                ref += stride;
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_ld64(ref[0],   TMP4);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64_2(ref, 8, TMP6);
-                ref += stride;
-
-                vis_ld64(ref[0],   TMP8);
-
-                vis_ld64_2(ref, 8, TMP10);
-                ref += stride;
-                vis_faligndata(TMP4, TMP6, REF_4);
-
-                vis_ld64(ref[0],   TMP12);
-
-                vis_ld64_2(ref, 8, TMP14);
-                ref += stride;
-                vis_faligndata(TMP8, TMP10, REF_S0);
-
-                vis_faligndata(TMP12, TMP14, REF_S4);
-
-                if (off != 0x7) {
-                        vis_alignaddr_g0((void *)off_plus_1);
-
-                        vis_ld64(dest[0], DST_0);
-                        vis_faligndata(TMP0, TMP2, REF_2);
-
-                        vis_ld64_2(dest, stride, DST_2);
-                        vis_faligndata(TMP4, TMP6, REF_6);
-
-                        vis_faligndata(TMP8, TMP10, REF_S2);
-
-                        vis_faligndata(TMP12, TMP14, REF_S6);
-                } else {
-                        vis_ld64(dest[0], DST_0);
-                        vis_src1(TMP2, REF_2);
-
-                        vis_ld64_2(dest, stride, DST_2);
-                        vis_src1(TMP6, REF_6);
-
-                        vis_src1(TMP10, REF_S2);
-
-                        vis_src1(TMP14, REF_S6);
-                }
-
-                vis_pmerge(ZERO,     REF_0,     TMP0);
-                vis_mul8x16au(REF_0_1, CONST_256, TMP2);
-
-                vis_pmerge(ZERO,     REF_2,     TMP4);
-                vis_mul8x16au(REF_2_1, CONST_256, TMP6);
-
-                vis_padd16(TMP0, CONST_3, TMP0);
-                vis_mul8x16al(DST_0,   CONST_512, TMP16);
-
-                vis_padd16(TMP2, CONST_3, TMP2);
-                vis_mul8x16al(DST_1,   CONST_512, TMP18);
-
-                vis_padd16(TMP0, TMP4, TMP0);
-                vis_mul8x16au(REF_4, CONST_256, TMP8);
-
-                vis_padd16(TMP2, TMP6, TMP2);
-                vis_mul8x16au(REF_4_1, CONST_256, TMP10);
-
-                vis_padd16(TMP0, TMP16, TMP0);
-                vis_mul8x16au(REF_6, CONST_256, TMP12);
-
-                vis_padd16(TMP2, TMP18, TMP2);
-                vis_mul8x16au(REF_6_1, CONST_256, TMP14);
-
-                vis_padd16(TMP8, CONST_3, TMP8);
-                vis_mul8x16al(DST_2, CONST_512, TMP16);
-
-                vis_padd16(TMP8, TMP12, TMP8);
-                vis_mul8x16al(DST_3, CONST_512, TMP18);
-
-                vis_padd16(TMP10, TMP14, TMP10);
-                vis_pack16(TMP0, DST_0);
-
-                vis_pack16(TMP2, DST_1);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-                vis_padd16(TMP10, CONST_3, TMP10);
-
-                vis_ld64_2(dest, stride, DST_0);
-                vis_padd16(TMP8, TMP16, TMP8);
-
-                vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
-                vis_padd16(TMP10, TMP18, TMP10);
-                vis_pack16(TMP8, DST_2);
-
-                vis_pack16(TMP10, DST_3);
-                vis_st64(DST_2, dest[0]);
-                dest += stride;
-
-                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
-                vis_pmerge(ZERO,     REF_S0,     TMP0);
-
-                vis_pmerge(ZERO,     REF_S2,     TMP24);
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
-
-                vis_padd16(TMP0, CONST_3, TMP0);
-                vis_mul8x16au(REF_S4, CONST_256, TMP8);
-
-                vis_padd16(TMP2, CONST_3, TMP2);
-                vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
-
-                vis_padd16(TMP0, TMP24, TMP0);
-                vis_mul8x16au(REF_S6, CONST_256, TMP12);
-
-                vis_padd16(TMP2, TMP6, TMP2);
-                vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
-
-                vis_padd16(TMP8, CONST_3, TMP8);
-                vis_mul8x16al(DST_0,   CONST_512, TMP16);
-
-                vis_padd16(TMP10, CONST_3, TMP10);
-                vis_mul8x16al(DST_1,   CONST_512, TMP18);
-
-                vis_padd16(TMP8, TMP12, TMP8);
-                vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
-
-                vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
-                vis_padd16(TMP0, TMP16, TMP0);
-
-                vis_padd16(TMP2, TMP18, TMP2);
-                vis_pack16(TMP0, DST_0);
-
-                vis_padd16(TMP10, TMP14, TMP10);
-                vis_pack16(TMP2, DST_1);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-
-                vis_padd16(TMP8, TMP20, TMP8);
-
-                vis_padd16(TMP10, TMP22, TMP10);
-                vis_pack16(TMP8, DST_2);
-
-                vis_pack16(TMP10, DST_3);
-                vis_st64(DST_2, dest[0]);
-                dest += stride;
-        } while (--height);
-}
-
-static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        ref = vis_alignaddr(ref);
-        vis_ld64(ref[0], TMP0);
-
-        vis_ld64_2(ref, 8, TMP2);
-
-        vis_ld64_2(ref, 16, TMP4);
-        ref += stride;
-
-        vis_ld64(ref[0], TMP6);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_ld64_2(ref, 8, TMP8);
-        vis_faligndata(TMP2, TMP4, REF_4);
-
-        vis_ld64_2(ref, 16, TMP10);
-        ref += stride;
-
-        vis_ld64(constants_fe[0], MASK_fe);
-        vis_faligndata(TMP6, TMP8, REF_2);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-        vis_faligndata(TMP8, TMP10, REF_6);
-
-        vis_ld64(constants128[0], CONST_128);
-        height = (height >> 1) - 1;
-        do {    /* 24 cycles */
-                vis_ld64(ref[0], TMP0);
-                vis_xor(REF_0, REF_2, TMP12);
-
-                vis_ld64_2(ref, 8, TMP2);
-                vis_xor(REF_4, REF_6, TMP16);
-
-                vis_ld64_2(ref, 16, TMP4);
-                ref += stride;
-                vis_or(REF_0, REF_2, TMP14);
-
-                vis_ld64(ref[0], TMP6);
-                vis_or(REF_4, REF_6, TMP18);
-
-                vis_ld64_2(ref, 8, TMP8);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64_2(ref, 16, TMP10);
-                ref += stride;
-                vis_faligndata(TMP2, TMP4, REF_4);
-
-                vis_and(TMP12, MASK_fe, TMP12);
-
-                vis_and(TMP16, MASK_fe, TMP16);
-                vis_mul8x16(CONST_128, TMP12, TMP12);
-
-                vis_mul8x16(CONST_128, TMP16, TMP16);
-                vis_xor(REF_0, REF_2, TMP0);
-
-                vis_xor(REF_4, REF_6, TMP2);
-
-                vis_or(REF_0, REF_2, TMP20);
-
-                vis_and(TMP12, MASK_7f, TMP12);
-
-                vis_and(TMP16, MASK_7f, TMP16);
-
-                vis_psub16(TMP14, TMP12, TMP12);
-                vis_st64(TMP12, dest[0]);
-
-                vis_psub16(TMP18, TMP16, TMP16);
-                vis_st64_2(TMP16, dest, 8);
-                dest += stride;
-
-                vis_or(REF_4, REF_6, TMP18);
-
-                vis_and(TMP0, MASK_fe, TMP0);
-
-                vis_and(TMP2, MASK_fe, TMP2);
-                vis_mul8x16(CONST_128, TMP0, TMP0);
-
-                vis_faligndata(TMP6, TMP8, REF_2);
-                vis_mul8x16(CONST_128, TMP2, TMP2);
-
-                vis_faligndata(TMP8, TMP10, REF_6);
-
-                vis_and(TMP0, MASK_7f, TMP0);
-
-                vis_and(TMP2, MASK_7f, TMP2);
-
-                vis_psub16(TMP20, TMP0, TMP0);
-                vis_st64(TMP0, dest[0]);
-
-                vis_psub16(TMP18, TMP2, TMP2);
-                vis_st64_2(TMP2, dest, 8);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0], TMP0);
-        vis_xor(REF_0, REF_2, TMP12);
-
-        vis_ld64_2(ref, 8, TMP2);
-        vis_xor(REF_4, REF_6, TMP16);
-
-        vis_ld64_2(ref, 16, TMP4);
-        vis_or(REF_0, REF_2, TMP14);
-
-        vis_or(REF_4, REF_6, TMP18);
-
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_faligndata(TMP2, TMP4, REF_4);
-
-        vis_and(TMP12, MASK_fe, TMP12);
-
-        vis_and(TMP16, MASK_fe, TMP16);
-        vis_mul8x16(CONST_128, TMP12, TMP12);
-
-        vis_mul8x16(CONST_128, TMP16, TMP16);
-        vis_xor(REF_0, REF_2, TMP0);
-
-        vis_xor(REF_4, REF_6, TMP2);
-
-        vis_or(REF_0, REF_2, TMP20);
-
-        vis_and(TMP12, MASK_7f, TMP12);
-
-        vis_and(TMP16, MASK_7f, TMP16);
-
-        vis_psub16(TMP14, TMP12, TMP12);
-        vis_st64(TMP12, dest[0]);
-
-        vis_psub16(TMP18, TMP16, TMP16);
-        vis_st64_2(TMP16, dest, 8);
-        dest += stride;
-
-        vis_or(REF_4, REF_6, TMP18);
-
-        vis_and(TMP0, MASK_fe, TMP0);
-
-        vis_and(TMP2, MASK_fe, TMP2);
-        vis_mul8x16(CONST_128, TMP0, TMP0);
-
-        vis_mul8x16(CONST_128, TMP2, TMP2);
-
-        vis_and(TMP0, MASK_7f, TMP0);
-
-        vis_and(TMP2, MASK_7f, TMP2);
-
-        vis_psub16(TMP20, TMP0, TMP0);
-        vis_st64(TMP0, dest[0]);
-
-        vis_psub16(TMP18, TMP2, TMP2);
-        vis_st64_2(TMP2, dest, 8);
-}
-
-static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        ref = vis_alignaddr(ref);
-        vis_ld64(ref[0], TMP0);
-
-        vis_ld64_2(ref, 8, TMP2);
-        ref += stride;
-
-        vis_ld64(ref[0], TMP4);
-
-        vis_ld64_2(ref, 8, TMP6);
-        ref += stride;
-
-        vis_ld64(constants_fe[0], MASK_fe);
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_ld64(constants_7f[0], MASK_7f);
-        vis_faligndata(TMP4, TMP6, REF_2);
-
-        vis_ld64(constants128[0], CONST_128);
-        height = (height >> 1) - 1;
-        do {    /* 12 cycles */
-                vis_ld64(ref[0], TMP0);
-                vis_xor(REF_0, REF_2, TMP4);
-
-                vis_ld64_2(ref, 8, TMP2);
-                ref += stride;
-                vis_and(TMP4, MASK_fe, TMP4);
-
-                vis_or(REF_0, REF_2, TMP6);
-                vis_mul8x16(CONST_128, TMP4, TMP4);
-
-                vis_faligndata(TMP0, TMP2, REF_0);
-                vis_ld64(ref[0], TMP0);
-
-                vis_ld64_2(ref, 8, TMP2);
-                ref += stride;
-                vis_xor(REF_0, REF_2, TMP12);
-
-                vis_and(TMP4, MASK_7f, TMP4);
-
-                vis_and(TMP12, MASK_fe, TMP12);
-
-                vis_mul8x16(CONST_128, TMP12, TMP12);
-                vis_or(REF_0, REF_2, TMP14);
-
-                vis_psub16(TMP6, TMP4, DST_0);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-
-                vis_faligndata(TMP0, TMP2, REF_2);
-
-                vis_and(TMP12, MASK_7f, TMP12);
-
-                vis_psub16(TMP14, TMP12, DST_0);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-        } while (--height);
-
-        vis_ld64(ref[0], TMP0);
-        vis_xor(REF_0, REF_2, TMP4);
-
-        vis_ld64_2(ref, 8, TMP2);
-        vis_and(TMP4, MASK_fe, TMP4);
-
-        vis_or(REF_0, REF_2, TMP6);
-        vis_mul8x16(CONST_128, TMP4, TMP4);
-
-        vis_faligndata(TMP0, TMP2, REF_0);
-
-        vis_xor(REF_0, REF_2, TMP12);
-
-        vis_and(TMP4, MASK_7f, TMP4);
-
-        vis_and(TMP12, MASK_fe, TMP12);
-
-        vis_mul8x16(CONST_128, TMP12, TMP12);
-        vis_or(REF_0, REF_2, TMP14);
-
-        vis_psub16(TMP6, TMP4, DST_0);
-        vis_st64(DST_0, dest[0]);
-        dest += stride;
-
-        vis_and(TMP12, MASK_7f, TMP12);
-
-        vis_psub16(TMP14, TMP12, DST_0);
-        vis_st64(DST_0, dest[0]);
-}
-
-static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
-                             const ptrdiff_t stride, int height)
-{
-        int stride_8 = stride + 8;
-        int stride_16 = stride + 16;
-
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[ 0], TMP0);
-        vis_fzero(ZERO);
-
-        vis_ld64(ref[ 8], TMP2);
-
-        vis_ld64(ref[16], TMP4);
-
-        vis_ld64(constants3[0], CONST_3);
-        vis_faligndata(TMP0, TMP2, REF_2);
-
-        vis_ld64(constants256_512[0], CONST_256);
-        vis_faligndata(TMP2, TMP4, REF_6);
-        height >>= 1;
-
-        do {    /* 31 cycles */
-                vis_ld64_2(ref, stride, TMP0);
-                vis_pmerge(ZERO,       REF_2,     TMP12);
-                vis_mul8x16au(REF_2_1, CONST_256, TMP14);
-
-                vis_ld64_2(ref, stride_8, TMP2);
-                vis_pmerge(ZERO,       REF_6,     TMP16);
-                vis_mul8x16au(REF_6_1, CONST_256, TMP18);
-
-                vis_ld64_2(ref, stride_16, TMP4);
-                ref += stride;
-
-                vis_ld64(dest[0], DST_0);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64_2(dest, 8, DST_2);
-                vis_faligndata(TMP2, TMP4, REF_4);
-
-                vis_ld64_2(ref, stride, TMP6);
-                vis_pmerge(ZERO,     REF_0,     TMP0);
-                vis_mul8x16au(REF_0_1, CONST_256, TMP2);
-
-                vis_ld64_2(ref, stride_8, TMP8);
-                vis_pmerge(ZERO,     REF_4,     TMP4);
-
-                vis_ld64_2(ref, stride_16, TMP10);
-                ref += stride;
-
-                vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
-                vis_faligndata(TMP6, TMP8, REF_2);
-                vis_mul8x16au(REF_4_1, CONST_256, TMP6);
-
-                vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
-                vis_faligndata(TMP8, TMP10, REF_6);
-                vis_mul8x16al(DST_0,   CONST_512, TMP20);
-
-                vis_padd16(TMP0, CONST_3, TMP0);
-                vis_mul8x16al(DST_1,   CONST_512, TMP22);
-
-                vis_padd16(TMP2, CONST_3, TMP2);
-                vis_mul8x16al(DST_2,   CONST_512, TMP24);
-
-                vis_padd16(TMP4, CONST_3, TMP4);
-                vis_mul8x16al(DST_3,   CONST_512, TMP26);
-
-                vis_padd16(TMP6, CONST_3, TMP6);
-
-                vis_padd16(TMP12, TMP20, TMP12);
-                vis_mul8x16al(REF_S0,   CONST_512, TMP20);
-
-                vis_padd16(TMP14, TMP22, TMP14);
-                vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
-
-                vis_padd16(TMP16, TMP24, TMP16);
-                vis_mul8x16al(REF_S2,   CONST_512, TMP24);
-
-                vis_padd16(TMP18, TMP26, TMP18);
-                vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
-
-                vis_padd16(TMP12, TMP0, TMP12);
-                vis_mul8x16au(REF_2,   CONST_256, TMP28);
-
-                vis_padd16(TMP14, TMP2, TMP14);
-                vis_mul8x16au(REF_2_1, CONST_256, TMP30);
-
-                vis_padd16(TMP16, TMP4, TMP16);
-                vis_mul8x16au(REF_6,   CONST_256, REF_S4);
-
-                vis_padd16(TMP18, TMP6, TMP18);
-                vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
-
-                vis_pack16(TMP12, DST_0);
-                vis_padd16(TMP28, TMP0, TMP12);
-
-                vis_pack16(TMP14, DST_1);
-                vis_st64(DST_0, dest[0]);
-                vis_padd16(TMP30, TMP2, TMP14);
-
-                vis_pack16(TMP16, DST_2);
-                vis_padd16(REF_S4, TMP4, TMP16);
-
-                vis_pack16(TMP18, DST_3);
-                vis_st64_2(DST_2, dest, 8);
-                dest += stride;
-                vis_padd16(REF_S6, TMP6, TMP18);
-
-                vis_padd16(TMP12, TMP20, TMP12);
-
-                vis_padd16(TMP14, TMP22, TMP14);
-                vis_pack16(TMP12, DST_0);
-
-                vis_padd16(TMP16, TMP24, TMP16);
-                vis_pack16(TMP14, DST_1);
-                vis_st64(DST_0, dest[0]);
-
-                vis_padd16(TMP18, TMP26, TMP18);
-                vis_pack16(TMP16, DST_2);
-
-                vis_pack16(TMP18, DST_3);
-                vis_st64_2(DST_2, dest, 8);
-                dest += stride;
-        } while (--height);
-}
-
-static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
-                            const ptrdiff_t stride, int height)
-{
-        int stride_8 = stride + 8;
-
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[ 0], TMP0);
-        vis_fzero(ZERO);
-
-        vis_ld64(ref[ 8], TMP2);
-
-        vis_ld64(constants3[0], CONST_3);
-        vis_faligndata(TMP0, TMP2, REF_2);
-
-        vis_ld64(constants256_512[0], CONST_256);
-
-        height >>= 1;
-        do {    /* 20 cycles */
-                vis_ld64_2(ref, stride, TMP0);
-                vis_pmerge(ZERO,       REF_2,     TMP8);
-                vis_mul8x16au(REF_2_1, CONST_256, TMP10);
-
-                vis_ld64_2(ref, stride_8, TMP2);
-                ref += stride;
-
-                vis_ld64(dest[0], DST_0);
-
-                vis_ld64_2(dest, stride, DST_2);
-                vis_faligndata(TMP0, TMP2, REF_0);
-
-                vis_ld64_2(ref, stride, TMP4);
-                vis_mul8x16al(DST_0,   CONST_512, TMP16);
-                vis_pmerge(ZERO,       REF_0,     TMP12);
-
-                vis_ld64_2(ref, stride_8, TMP6);
-                ref += stride;
-                vis_mul8x16al(DST_1,   CONST_512, TMP18);
-                vis_pmerge(ZERO,       REF_0_1,   TMP14);
-
-                vis_padd16(TMP12, CONST_3, TMP12);
-                vis_mul8x16al(DST_2,   CONST_512, TMP24);
-
-                vis_padd16(TMP14, CONST_3, TMP14);
-                vis_mul8x16al(DST_3,   CONST_512, TMP26);
-
-                vis_faligndata(TMP4, TMP6, REF_2);
-
-                vis_padd16(TMP8, TMP12, TMP8);
-
-                vis_padd16(TMP10, TMP14, TMP10);
-                vis_mul8x16au(REF_2,   CONST_256, TMP20);
-
-                vis_padd16(TMP8, TMP16, TMP0);
-                vis_mul8x16au(REF_2_1, CONST_256, TMP22);
-
-                vis_padd16(TMP10, TMP18, TMP2);
-                vis_pack16(TMP0, DST_0);
-
-                vis_pack16(TMP2, DST_1);
-                vis_st64(DST_0, dest[0]);
-                dest += stride;
-                vis_padd16(TMP12, TMP20, TMP12);
-
-                vis_padd16(TMP14, TMP22, TMP14);
-
-                vis_padd16(TMP12, TMP24, TMP0);
-
-                vis_padd16(TMP14, TMP26, TMP2);
-                vis_pack16(TMP0, DST_2);
-
-                vis_pack16(TMP2, DST_3);
-                vis_st64(DST_2, dest[0]);
-                dest += stride;
-        } while (--height);
-}
-
-static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
-                              const ptrdiff_t stride, int height)
-{
-        unsigned long off = (unsigned long) ref & 0x7;
-        unsigned long off_plus_1 = off + 1;
-        int stride_8 = stride + 8;
-        int stride_16 = stride + 16;
-
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
-
-        ref = vis_alignaddr(ref);
-
-        vis_ld64(ref[ 0], TMP0);
-        vis_fzero(ZERO);
-
-        vis_ld64(ref[ 8], TMP2);
-
-        vis_ld64(ref[16], TMP4);
-
-        vis_ld64(constants2[0], CONST_2);
-        vis_faligndata(TMP0, TMP2, REF_S0);
-
-        vis_ld64(constants256_512[0], CONST_256);
-        vis_faligndata(TMP2, TMP4, REF_S4);
-
-        if (off != 0x7) {
-                vis_alignaddr_g0((void *)off_plus_1);
-                vis_faligndata(TMP0, TMP2, REF_S2);
-                vis_faligndata(TMP2, TMP4, REF_S6);
-        } else {
-                vis_src1(TMP2, REF_S2);
-                vis_src1(TMP4, REF_S6);
-        }
-
-        height >>= 1;
-        do {
-                vis_ld64_2(ref, stride, TMP0);
-                vis_mul8x16au(REF_S0, CONST_256, TMP12);
-                vis_pmerge(ZERO,      REF_S0_1,  TMP14);
-
-                vis_alignaddr_g0((void *)off);
-
-                vis_ld64_2(ref, stride_8, TMP2);
-                vis_mul8x16au(REF_S2, CONST_256, TMP16);
-                vis_pmerge(ZERO,      REF_S2_1,  TMP18);
1478
-
1479
-                vis_ld64_2(ref, stride_16, TMP4);
1480
-                ref += stride;
1481
-                vis_mul8x16au(REF_S4, CONST_256, TMP20);
1482
-                vis_pmerge(ZERO,      REF_S4_1,  TMP22);
1483
-
1484
-                vis_ld64_2(ref, stride, TMP6);
1485
-                vis_mul8x16au(REF_S6, CONST_256, TMP24);
1486
-                vis_pmerge(ZERO,      REF_S6_1,  TMP26);
1487
-
1488
-                vis_ld64_2(ref, stride_8, TMP8);
1489
-                vis_faligndata(TMP0, TMP2, REF_0);
1490
-
1491
-                vis_ld64_2(ref, stride_16, TMP10);
1492
-                ref += stride;
1493
-                vis_faligndata(TMP2, TMP4, REF_4);
1494
-
1495
-                vis_faligndata(TMP6, TMP8, REF_S0);
1496
-
1497
-                vis_faligndata(TMP8, TMP10, REF_S4);
1498
-
1499
-                if (off != 0x7) {
1500
-                        vis_alignaddr_g0((void *)off_plus_1);
1501
-                        vis_faligndata(TMP0, TMP2, REF_2);
1502
-                        vis_faligndata(TMP2, TMP4, REF_6);
1503
-                        vis_faligndata(TMP6, TMP8, REF_S2);
1504
-                        vis_faligndata(TMP8, TMP10, REF_S6);
1505
-                } else {
1506
-                        vis_src1(TMP2, REF_2);
1507
-                        vis_src1(TMP4, REF_6);
1508
-                        vis_src1(TMP8, REF_S2);
1509
-                        vis_src1(TMP10, REF_S6);
1510
-                }
1511
-
1512
-                vis_mul8x16au(REF_0, CONST_256, TMP0);
1513
-                vis_pmerge(ZERO,      REF_0_1,  TMP2);
1514
-
1515
-                vis_mul8x16au(REF_2, CONST_256, TMP4);
1516
-                vis_pmerge(ZERO,      REF_2_1,  TMP6);
1517
-
1518
-                vis_padd16(TMP0, CONST_2, TMP8);
1519
-                vis_mul8x16au(REF_4, CONST_256, TMP0);
1520
-
1521
-                vis_padd16(TMP2, CONST_2, TMP10);
1522
-                vis_mul8x16au(REF_4_1, CONST_256, TMP2);
1523
-
1524
-                vis_padd16(TMP8, TMP4, TMP8);
1525
-                vis_mul8x16au(REF_6, CONST_256, TMP4);
1526
-
1527
-                vis_padd16(TMP10, TMP6, TMP10);
1528
-                vis_mul8x16au(REF_6_1, CONST_256, TMP6);
1529
-
1530
-                vis_padd16(TMP12, TMP8, TMP12);
1531
-
1532
-                vis_padd16(TMP14, TMP10, TMP14);
1533
-
1534
-                vis_padd16(TMP12, TMP16, TMP12);
1535
-
1536
-                vis_padd16(TMP14, TMP18, TMP14);
1537
-                vis_pack16(TMP12, DST_0);
1538
-
1539
-                vis_pack16(TMP14, DST_1);
1540
-                vis_st64(DST_0, dest[0]);
1541
-                vis_padd16(TMP0, CONST_2, TMP12);
1542
-
1543
-                vis_mul8x16au(REF_S0, CONST_256, TMP0);
1544
-                vis_padd16(TMP2, CONST_2, TMP14);
1545
-
1546
-                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
1547
-                vis_padd16(TMP12, TMP4, TMP12);
1548
-
1549
-                vis_mul8x16au(REF_S2, CONST_256, TMP4);
1550
-                vis_padd16(TMP14, TMP6, TMP14);
1551
-
1552
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
1553
-                vis_padd16(TMP20, TMP12, TMP20);
1554
-
1555
-                vis_padd16(TMP22, TMP14, TMP22);
1556
-
1557
-                vis_padd16(TMP20, TMP24, TMP20);
1558
-
1559
-                vis_padd16(TMP22, TMP26, TMP22);
1560
-                vis_pack16(TMP20, DST_2);
1561
-
1562
-                vis_pack16(TMP22, DST_3);
1563
-                vis_st64_2(DST_2, dest, 8);
1564
-                dest += stride;
1565
-                vis_padd16(TMP0, TMP4, TMP24);
1566
-
1567
-                vis_mul8x16au(REF_S4, CONST_256, TMP0);
1568
-                vis_padd16(TMP2, TMP6, TMP26);
1569
-
1570
-                vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
1571
-                vis_padd16(TMP24, TMP8, TMP24);
1572
-
1573
-                vis_padd16(TMP26, TMP10, TMP26);
1574
-                vis_pack16(TMP24, DST_0);
1575
-
1576
-                vis_pack16(TMP26, DST_1);
1577
-                vis_st64(DST_0, dest[0]);
1578
-                vis_pmerge(ZERO, REF_S6, TMP4);
1579
-
1580
-                vis_pmerge(ZERO,      REF_S6_1,  TMP6);
1581
-
1582
-                vis_padd16(TMP0, TMP4, TMP0);
1583
-
1584
-                vis_padd16(TMP2, TMP6, TMP2);
1585
-
1586
-                vis_padd16(TMP0, TMP12, TMP0);
1587
-
1588
-                vis_padd16(TMP2, TMP14, TMP2);
1589
-                vis_pack16(TMP0, DST_2);
1590
-
1591
-                vis_pack16(TMP2, DST_3);
1592
-                vis_st64_2(DST_2, dest, 8);
1593
-                dest += stride;
1594
-        } while (--height);
1595
-}
1596
-
1597
-static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
1598
-                             const ptrdiff_t stride, int height)
1599
-{
1600
-        unsigned long off = (unsigned long) ref & 0x7;
1601
-        unsigned long off_plus_1 = off + 1;
1602
-        int stride_8 = stride + 8;
1603
-
1604
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
1605
-
1606
-        ref = vis_alignaddr(ref);
1607
-
1608
-        vis_ld64(ref[ 0], TMP0);
1609
-        vis_fzero(ZERO);
1610
-
1611
-        vis_ld64(ref[ 8], TMP2);
1612
-
1613
-        vis_ld64(constants2[0], CONST_2);
1614
-
1615
-        vis_ld64(constants256_512[0], CONST_256);
1616
-        vis_faligndata(TMP0, TMP2, REF_S0);
1617
-
1618
-        if (off != 0x7) {
1619
-                vis_alignaddr_g0((void *)off_plus_1);
1620
-                vis_faligndata(TMP0, TMP2, REF_S2);
1621
-        } else {
1622
-                vis_src1(TMP2, REF_S2);
1623
-        }
1624
-
1625
-        height >>= 1;
1626
-        do {    /* 26 cycles */
1627
-                vis_ld64_2(ref, stride, TMP0);
1628
-                vis_mul8x16au(REF_S0,   CONST_256, TMP8);
1629
-                vis_pmerge(ZERO,        REF_S2,    TMP12);
1630
-
1631
-                vis_alignaddr_g0((void *)off);
1632
-
1633
-                vis_ld64_2(ref, stride_8, TMP2);
1634
-                ref += stride;
1635
-                vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
1636
-                vis_pmerge(ZERO,        REF_S2_1,  TMP14);
1637
-
1638
-                vis_ld64_2(ref, stride, TMP4);
1639
-
1640
-                vis_ld64_2(ref, stride_8, TMP6);
1641
-                ref += stride;
1642
-                vis_faligndata(TMP0, TMP2, REF_S4);
1643
-
1644
-                vis_pmerge(ZERO, REF_S4, TMP18);
1645
-
1646
-                vis_pmerge(ZERO, REF_S4_1, TMP20);
1647
-
1648
-                vis_faligndata(TMP4, TMP6, REF_S0);
1649
-
1650
-                if (off != 0x7) {
1651
-                        vis_alignaddr_g0((void *)off_plus_1);
1652
-                        vis_faligndata(TMP0, TMP2, REF_S6);
1653
-                        vis_faligndata(TMP4, TMP6, REF_S2);
1654
-                } else {
1655
-                        vis_src1(TMP2, REF_S6);
1656
-                        vis_src1(TMP6, REF_S2);
1657
-                }
1658
-
1659
-                vis_padd16(TMP18, CONST_2, TMP18);
1660
-                vis_mul8x16au(REF_S6,   CONST_256, TMP22);
1661
-
1662
-                vis_padd16(TMP20, CONST_2, TMP20);
1663
-                vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
1664
-
1665
-                vis_mul8x16au(REF_S0,   CONST_256, TMP26);
1666
-                vis_pmerge(ZERO, REF_S0_1, TMP28);
1667
-
1668
-                vis_mul8x16au(REF_S2,   CONST_256, TMP30);
1669
-                vis_padd16(TMP18, TMP22, TMP18);
1670
-
1671
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
1672
-                vis_padd16(TMP20, TMP24, TMP20);
1673
-
1674
-                vis_padd16(TMP8,  TMP18, TMP8);
1675
-
1676
-                vis_padd16(TMP10, TMP20, TMP10);
1677
-
1678
-                vis_padd16(TMP8,  TMP12, TMP8);
1679
-
1680
-                vis_padd16(TMP10, TMP14, TMP10);
1681
-                vis_pack16(TMP8,  DST_0);
1682
-
1683
-                vis_pack16(TMP10, DST_1);
1684
-                vis_st64(DST_0, dest[0]);
1685
-                dest += stride;
1686
-                vis_padd16(TMP18, TMP26, TMP18);
1687
-
1688
-                vis_padd16(TMP20, TMP28, TMP20);
1689
-
1690
-                vis_padd16(TMP18, TMP30, TMP18);
1691
-
1692
-                vis_padd16(TMP20, TMP32, TMP20);
1693
-                vis_pack16(TMP18, DST_2);
1694
-
1695
-                vis_pack16(TMP20, DST_3);
1696
-                vis_st64(DST_2, dest[0]);
1697
-                dest += stride;
1698
-        } while (--height);
1699
-}
1700
-
1701
-static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
1702
-                              const ptrdiff_t stride, int height)
1703
-{
1704
-        unsigned long off = (unsigned long) ref & 0x7;
1705
-        unsigned long off_plus_1 = off + 1;
1706
-        int stride_8 = stride + 8;
1707
-        int stride_16 = stride + 16;
1708
-
1709
-        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
1710
-
1711
-        ref = vis_alignaddr(ref);
1712
-
1713
-        vis_ld64(ref[ 0], TMP0);
1714
-        vis_fzero(ZERO);
1715
-
1716
-        vis_ld64(ref[ 8], TMP2);
1717
-
1718
-        vis_ld64(ref[16], TMP4);
1719
-
1720
-        vis_ld64(constants6[0], CONST_6);
1721
-        vis_faligndata(TMP0, TMP2, REF_S0);
1722
-
1723
-        vis_ld64(constants256_1024[0], CONST_256);
1724
-        vis_faligndata(TMP2, TMP4, REF_S4);
1725
-
1726
-        if (off != 0x7) {
1727
-                vis_alignaddr_g0((void *)off_plus_1);
1728
-                vis_faligndata(TMP0, TMP2, REF_S2);
1729
-                vis_faligndata(TMP2, TMP4, REF_S6);
1730
-        } else {
1731
-                vis_src1(TMP2, REF_S2);
1732
-                vis_src1(TMP4, REF_S6);
1733
-        }
1734
-
1735
-        height >>= 1;
1736
-        do {    /* 55 cycles */
1737
-                vis_ld64_2(ref, stride, TMP0);
1738
-                vis_mul8x16au(REF_S0, CONST_256, TMP12);
1739
-                vis_pmerge(ZERO,      REF_S0_1,  TMP14);
1740
-
1741
-                vis_alignaddr_g0((void *)off);
1742
-
1743
-                vis_ld64_2(ref, stride_8, TMP2);
1744
-                vis_mul8x16au(REF_S2, CONST_256, TMP16);
1745
-                vis_pmerge(ZERO,      REF_S2_1,  TMP18);
1746
-
1747
-                vis_ld64_2(ref, stride_16, TMP4);
1748
-                ref += stride;
1749
-                vis_mul8x16au(REF_S4, CONST_256, TMP20);
1750
-                vis_pmerge(ZERO,      REF_S4_1,  TMP22);
1751
-
1752
-                vis_ld64_2(ref, stride, TMP6);
1753
-                vis_mul8x16au(REF_S6, CONST_256, TMP24);
1754
-                vis_pmerge(ZERO,      REF_S6_1,  TMP26);
1755
-
1756
-                vis_ld64_2(ref, stride_8, TMP8);
1757
-                vis_faligndata(TMP0, TMP2, REF_0);
1758
-
1759
-                vis_ld64_2(ref, stride_16, TMP10);
1760
-                ref += stride;
1761
-                vis_faligndata(TMP2, TMP4, REF_4);
1762
-
1763
-                vis_ld64(dest[0], DST_0);
1764
-                vis_faligndata(TMP6, TMP8, REF_S0);
1765
-
1766
-                vis_ld64_2(dest, 8, DST_2);
1767
-                vis_faligndata(TMP8, TMP10, REF_S4);
1768
-
1769
-                if (off != 0x7) {
1770
-                        vis_alignaddr_g0((void *)off_plus_1);
1771
-                        vis_faligndata(TMP0, TMP2, REF_2);
1772
-                        vis_faligndata(TMP2, TMP4, REF_6);
1773
-                        vis_faligndata(TMP6, TMP8, REF_S2);
1774
-                        vis_faligndata(TMP8, TMP10, REF_S6);
1775
-                } else {
1776
-                        vis_src1(TMP2, REF_2);
1777
-                        vis_src1(TMP4, REF_6);
1778
-                        vis_src1(TMP8, REF_S2);
1779
-                        vis_src1(TMP10, REF_S6);
1780
-                }
1781
-
1782
-                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
1783
-                vis_pmerge(ZERO, REF_0, TMP0);
1784
-
1785
-                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
1786
-                vis_pmerge(ZERO,      REF_0_1,  TMP2);
1787
-
1788
-                vis_mul8x16au(REF_2, CONST_256, TMP4);
1789
-                vis_pmerge(ZERO,      REF_2_1,  TMP6);
1790
-
1791
-                vis_mul8x16al(DST_2,   CONST_1024, REF_0);
1792
-                vis_padd16(TMP0, CONST_6, TMP0);
1793
-
1794
-                vis_mul8x16al(DST_3,   CONST_1024, REF_2);
1795
-                vis_padd16(TMP2, CONST_6, TMP2);
1796
-
1797
-                vis_padd16(TMP0, TMP4, TMP0);
1798
-                vis_mul8x16au(REF_4, CONST_256, TMP4);
1799
-
1800
-                vis_padd16(TMP2, TMP6, TMP2);
1801
-                vis_mul8x16au(REF_4_1, CONST_256, TMP6);
1802
-
1803
-                vis_padd16(TMP12, TMP0, TMP12);
1804
-                vis_mul8x16au(REF_6, CONST_256, TMP8);
1805
-
1806
-                vis_padd16(TMP14, TMP2, TMP14);
1807
-                vis_mul8x16au(REF_6_1, CONST_256, TMP10);
1808
-
1809
-                vis_padd16(TMP12, TMP16, TMP12);
1810
-                vis_mul8x16au(REF_S0, CONST_256, REF_4);
1811
-
1812
-                vis_padd16(TMP14, TMP18, TMP14);
1813
-                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
1814
-
1815
-                vis_padd16(TMP12, TMP30, TMP12);
1816
-
1817
-                vis_padd16(TMP14, TMP32, TMP14);
1818
-                vis_pack16(TMP12, DST_0);
1819
-
1820
-                vis_pack16(TMP14, DST_1);
1821
-                vis_st64(DST_0, dest[0]);
1822
-                vis_padd16(TMP4, CONST_6, TMP4);
1823
-
1824
-                vis_ld64_2(dest, stride, DST_0);
1825
-                vis_padd16(TMP6, CONST_6, TMP6);
1826
-                vis_mul8x16au(REF_S2, CONST_256, TMP12);
1827
-
1828
-                vis_padd16(TMP4, TMP8, TMP4);
1829
-                vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
1830
-
1831
-                vis_padd16(TMP6, TMP10, TMP6);
1832
-
1833
-                vis_padd16(TMP20, TMP4, TMP20);
1834
-
1835
-                vis_padd16(TMP22, TMP6, TMP22);
1836
-
1837
-                vis_padd16(TMP20, TMP24, TMP20);
1838
-
1839
-                vis_padd16(TMP22, TMP26, TMP22);
1840
-
1841
-                vis_padd16(TMP20, REF_0, TMP20);
1842
-                vis_mul8x16au(REF_S4, CONST_256, REF_0);
1843
-
1844
-                vis_padd16(TMP22, REF_2, TMP22);
1845
-                vis_pack16(TMP20, DST_2);
1846
-
1847
-                vis_pack16(TMP22, DST_3);
1848
-                vis_st64_2(DST_2, dest, 8);
1849
-                dest += stride;
1850
-
1851
-                vis_ld64_2(dest, 8, DST_2);
1852
-                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
1853
-                vis_pmerge(ZERO,      REF_S4_1,  REF_2);
1854
-
1855
-                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
1856
-                vis_padd16(REF_4, TMP0, TMP8);
1857
-
1858
-                vis_mul8x16au(REF_S6, CONST_256, REF_4);
1859
-                vis_padd16(REF_6, TMP2, TMP10);
1860
-
1861
-                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
1862
-                vis_padd16(TMP8, TMP12, TMP8);
1863
-
1864
-                vis_padd16(TMP10, TMP14, TMP10);
1865
-
1866
-                vis_padd16(TMP8, TMP30, TMP8);
1867
-
1868
-                vis_padd16(TMP10, TMP32, TMP10);
1869
-                vis_pack16(TMP8, DST_0);
1870
-
1871
-                vis_pack16(TMP10, DST_1);
1872
-                vis_st64(DST_0, dest[0]);
1873
-
1874
-                vis_padd16(REF_0, TMP4, REF_0);
1875
-
1876
-                vis_mul8x16al(DST_2,   CONST_1024, TMP30);
1877
-                vis_padd16(REF_2, TMP6, REF_2);
1878
-
1879
-                vis_mul8x16al(DST_3,   CONST_1024, TMP32);
1880
-                vis_padd16(REF_0, REF_4, REF_0);
1881
-
1882
-                vis_padd16(REF_2, REF_6, REF_2);
1883
-
1884
-                vis_padd16(REF_0, TMP30, REF_0);
1885
-
1886
-                /* stall */
1887
-
1888
-                vis_padd16(REF_2, TMP32, REF_2);
1889
-                vis_pack16(REF_0, DST_2);
1890
-
1891
-                vis_pack16(REF_2, DST_3);
1892
-                vis_st64_2(DST_2, dest, 8);
1893
-                dest += stride;
1894
-        } while (--height);
1895
-}
1896
-
1897
-static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
1898
-                             const ptrdiff_t stride, int height)
1899
-{
1900
-        unsigned long off = (unsigned long) ref & 0x7;
1901
-        unsigned long off_plus_1 = off + 1;
1902
-        int stride_8 = stride + 8;
1903
-
1904
-        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
1905
-
1906
-        ref = vis_alignaddr(ref);
1907
-
1908
-        vis_ld64(ref[0], TMP0);
1909
-        vis_fzero(ZERO);
1910
-
1911
-        vis_ld64_2(ref, 8, TMP2);
1912
-
1913
-        vis_ld64(constants6[0], CONST_6);
1914
-
1915
-        vis_ld64(constants256_1024[0], CONST_256);
1916
-        vis_faligndata(TMP0, TMP2, REF_S0);
1917
-
1918
-        if (off != 0x7) {
1919
-                vis_alignaddr_g0((void *)off_plus_1);
1920
-                vis_faligndata(TMP0, TMP2, REF_S2);
1921
-        } else {
1922
-                vis_src1(TMP2, REF_S2);
1923
-        }
1924
-
1925
-        height >>= 1;
1926
-        do {    /* 31 cycles */
1927
-                vis_ld64_2(ref, stride, TMP0);
1928
-                vis_mul8x16au(REF_S0, CONST_256, TMP8);
1929
-                vis_pmerge(ZERO,      REF_S0_1,  TMP10);
1930
-
1931
-                vis_ld64_2(ref, stride_8, TMP2);
1932
-                ref += stride;
1933
-                vis_mul8x16au(REF_S2, CONST_256, TMP12);
1934
-                vis_pmerge(ZERO,      REF_S2_1,  TMP14);
1935
-
1936
-                vis_alignaddr_g0((void *)off);
1937
-
1938
-                vis_ld64_2(ref, stride, TMP4);
1939
-                vis_faligndata(TMP0, TMP2, REF_S4);
1940
-
1941
-                vis_ld64_2(ref, stride_8, TMP6);
1942
-                ref += stride;
1943
-
1944
-                vis_ld64(dest[0], DST_0);
1945
-                vis_faligndata(TMP4, TMP6, REF_S0);
1946
-
1947
-                vis_ld64_2(dest, stride, DST_2);
1948
-
1949
-                if (off != 0x7) {
1950
-                        vis_alignaddr_g0((void *)off_plus_1);
1951
-                        vis_faligndata(TMP0, TMP2, REF_S6);
1952
-                        vis_faligndata(TMP4, TMP6, REF_S2);
1953
-                } else {
1954
-                        vis_src1(TMP2, REF_S6);
1955
-                        vis_src1(TMP6, REF_S2);
1956
-                }
1957
-
1958
-                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
1959
-                vis_pmerge(ZERO, REF_S4, TMP22);
1960
-
1961
-                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
1962
-                vis_pmerge(ZERO,      REF_S4_1,  TMP24);
1963
-
1964
-                vis_mul8x16au(REF_S6, CONST_256, TMP26);
1965
-                vis_pmerge(ZERO,      REF_S6_1,  TMP28);
1966
-
1967
-                vis_mul8x16au(REF_S0, CONST_256, REF_S4);
1968
-                vis_padd16(TMP22, CONST_6, TMP22);
1969
-
1970
-                vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
1971
-                vis_padd16(TMP24, CONST_6, TMP24);
1972
-
1973
-                vis_mul8x16al(DST_2,   CONST_1024, REF_0);
1974
-                vis_padd16(TMP22, TMP26, TMP22);
1975
-
1976
-                vis_mul8x16al(DST_3,   CONST_1024, REF_2);
1977
-                vis_padd16(TMP24, TMP28, TMP24);
1978
-
1979
-                vis_mul8x16au(REF_S2, CONST_256, TMP26);
1980
-                vis_padd16(TMP8, TMP22, TMP8);
1981
-
1982
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
1983
-                vis_padd16(TMP10, TMP24, TMP10);
1984
-
1985
-                vis_padd16(TMP8, TMP12, TMP8);
1986
-
1987
-                vis_padd16(TMP10, TMP14, TMP10);
1988
-
1989
-                vis_padd16(TMP8, TMP30, TMP8);
1990
-
1991
-                vis_padd16(TMP10, TMP32, TMP10);
1992
-                vis_pack16(TMP8, DST_0);
1993
-
1994
-                vis_pack16(TMP10, DST_1);
1995
-                vis_st64(DST_0, dest[0]);
1996
-                dest += stride;
1997
-
1998
-                vis_padd16(REF_S4, TMP22, TMP12);
1999
-
2000
-                vis_padd16(REF_S6, TMP24, TMP14);
2001
-
2002
-                vis_padd16(TMP12, TMP26, TMP12);
2003
-
2004
-                vis_padd16(TMP14, TMP28, TMP14);
2005
-
2006
-                vis_padd16(TMP12, REF_0, TMP12);
2007
-
2008
-                vis_padd16(TMP14, REF_2, TMP14);
2009
-                vis_pack16(TMP12, DST_2);
2010
-
2011
-                vis_pack16(TMP14, DST_3);
2012
-                vis_st64(DST_2, dest[0]);
2013
-                dest += stride;
2014
-        } while (--height);
2015
-}
2016
-
2017
-/* End of rounding code */
2018
-
2019
-/* Start of no rounding code */
2020
-/* The trick used in some of this file is the formula from the MMX
2021
- * motion comp code, which is:
2022
- *
2023
- * (x+y)>>1 == (x&y)+((x^y)>>1)
2024
- *
2025
- * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
2026
- * We avoid overflows by masking before we do the shift, and we
2027
- * implement the shift by multiplying by 1/2 using mul8x16.  So in
2028
- * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
2029
- * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
2030
- * the value 0x80808080 is in f8):
2031
- *
2032
- *      fxor            f0,   f2, f10
2033
- *      fand            f10,  f4, f10
2034
- *      fmul8x16        f8,  f10, f10
2035
- *      fand            f10,  f6, f10
2036
- *      fand            f0,   f2, f12
2037
- *      fpadd16         f12, f10, f10
2038
- */
2039
-
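[Editor's note, not part of the removed file: a minimal scalar C sketch of the packed no-rounding average that the comment above describes. The name avg8_no_round is invented here for illustration; the deleted VIS code realizes the shift with fmul8x16 against the repeating 0x80 constant followed by a 0x7f mask, rather than the plain right shift used below.]

    #include <stdint.h>

    /* Floor-average eight packed bytes at once, mirroring the 64-bit FPU
     * registers used above:  (x + y) >> 1 == (x & y) + ((x ^ y) >> 1).
     * Masking x ^ y with 0xfe before the shift keeps each byte's low bit
     * from leaking into the neighboring lane; the final add cannot carry
     * across lanes because each per-byte result fits in 8 bits. */
    static uint64_t avg8_no_round(uint64_t x, uint64_t y)
    {
        uint64_t half = ((x ^ y) & 0xfefefefefefefefeULL) >> 1;
        return (x & y) + half;
    }

[This is what the six-instruction fxor/fand/fmul8x16/fand/fand/fpadd16 sequence quoted above computes for a whole 8-byte register per iteration.]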
2040
-static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
2041
-                                      const ptrdiff_t stride, int height)
2042
-{
2043
-        ref = vis_alignaddr(ref);
2044
-        do {    /* 5 cycles */
2045
-                vis_ld64(ref[0], TMP0);
2046
-
2047
-                vis_ld64_2(ref, 8, TMP2);
2048
-
2049
-                vis_ld64_2(ref, 16, TMP4);
2050
-                ref += stride;
2051
-
2052
-                vis_faligndata(TMP0, TMP2, REF_0);
2053
-                vis_st64(REF_0, dest[0]);
2054
-
2055
-                vis_faligndata(TMP2, TMP4, REF_2);
2056
-                vis_st64_2(REF_2, dest, 8);
2057
-                dest += stride;
2058
-        } while (--height);
2059
-}
2060
-
2061
-static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
2062
-                                     const ptrdiff_t stride, int height)
2063
-{
2064
-        ref = vis_alignaddr(ref);
2065
-        do {    /* 4 cycles */
2066
-                vis_ld64(ref[0], TMP0);
2067
-
2068
-                vis_ld64(ref[8], TMP2);
2069
-                ref += stride;
2070
-
2071
-                /* stall */
2072
-
2073
-                vis_faligndata(TMP0, TMP2, REF_0);
2074
-                vis_st64(REF_0, dest[0]);
2075
-                dest += stride;
2076
-        } while (--height);
2077
-}
2078
-
2079
-
2080
-static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
2081
-                                      const ptrdiff_t stride, int height)
2082
-{
2083
-        int stride_8 = stride + 8;
2084
-
2085
-        ref = vis_alignaddr(ref);
2086
-
2087
-        vis_ld64(ref[0], TMP0);
2088
-
2089
-        vis_ld64(ref[8], TMP2);
2090
-
2091
-        vis_ld64(ref[16], TMP4);
2092
-
2093
-        vis_ld64(dest[0], DST_0);
2094
-
2095
-        vis_ld64(dest[8], DST_2);
2096
-
2097
-        vis_ld64(constants_fe[0], MASK_fe);
2098
-        vis_faligndata(TMP0, TMP2, REF_0);
2099
-
2100
-        vis_ld64(constants_7f[0], MASK_7f);
2101
-        vis_faligndata(TMP2, TMP4, REF_2);
2102
-
2103
-        vis_ld64(constants128[0], CONST_128);
2104
-
2105
-        ref += stride;
2106
-        height = (height >> 1) - 1;
2107
-
2108
-        do {    /* 24 cycles */
2109
-                vis_ld64(ref[0], TMP0);
2110
-                vis_xor(DST_0, REF_0, TMP6);
2111
-
2112
-                vis_ld64_2(ref, 8, TMP2);
2113
-                vis_and(TMP6, MASK_fe, TMP6);
2114
-
2115
-                vis_ld64_2(ref, 16, TMP4);
2116
-                ref += stride;
2117
-                vis_mul8x16(CONST_128, TMP6, TMP6);
2118
-                vis_xor(DST_2, REF_2, TMP8);
2119
-
2120
-                vis_and(TMP8, MASK_fe, TMP8);
2121
-
2122
-                vis_and(DST_0, REF_0, TMP10);
2123
-                vis_ld64_2(dest, stride, DST_0);
2124
-                vis_mul8x16(CONST_128, TMP8, TMP8);
2125
-
2126
-                vis_and(DST_2, REF_2, TMP12);
2127
-                vis_ld64_2(dest, stride_8, DST_2);
2128
-
2129
-                vis_ld64(ref[0], TMP14);
2130
-                vis_and(TMP6, MASK_7f, TMP6);
2131
-
2132
-                vis_and(TMP8, MASK_7f, TMP8);
2133
-
2134
-                vis_padd16(TMP10, TMP6, TMP6);
2135
-                vis_st64(TMP6, dest[0]);
2136
-
2137
-                vis_padd16(TMP12, TMP8, TMP8);
2138
-                vis_st64_2(TMP8, dest, 8);
2139
-
2140
-                dest += stride;
2141
-                vis_ld64_2(ref, 8, TMP16);
2142
-                vis_faligndata(TMP0, TMP2, REF_0);
2143
-
2144
-                vis_ld64_2(ref, 16, TMP18);
2145
-                vis_faligndata(TMP2, TMP4, REF_2);
2146
-                ref += stride;
2147
-
2148
-                vis_xor(DST_0, REF_0, TMP20);
2149
-
2150
-                vis_and(TMP20, MASK_fe, TMP20);
2151
-
2152
-                vis_xor(DST_2, REF_2, TMP22);
2153
-                vis_mul8x16(CONST_128, TMP20, TMP20);
2154
-
2155
-                vis_and(TMP22, MASK_fe, TMP22);
2156
-
2157
-                vis_and(DST_0, REF_0, TMP24);
2158
-                vis_mul8x16(CONST_128, TMP22, TMP22);
2159
-
2160
-                vis_and(DST_2, REF_2, TMP26);
2161
-
2162
-                vis_ld64_2(dest, stride, DST_0);
2163
-                vis_faligndata(TMP14, TMP16, REF_0);
2164
-
2165
-                vis_ld64_2(dest, stride_8, DST_2);
2166
-                vis_faligndata(TMP16, TMP18, REF_2);
2167
-
2168
-                vis_and(TMP20, MASK_7f, TMP20);
2169
-
2170
-                vis_and(TMP22, MASK_7f, TMP22);
2171
-
2172
-                vis_padd16(TMP24, TMP20, TMP20);
2173
-                vis_st64(TMP20, dest[0]);
2174
-
2175
-                vis_padd16(TMP26, TMP22, TMP22);
2176
-                vis_st64_2(TMP22, dest, 8);
2177
-                dest += stride;
2178
-        } while (--height);
2179
-
2180
-        vis_ld64(ref[0], TMP0);
2181
-        vis_xor(DST_0, REF_0, TMP6);
2182
-
2183
-        vis_ld64_2(ref, 8, TMP2);
2184
-        vis_and(TMP6, MASK_fe, TMP6);
2185
-
2186
-        vis_ld64_2(ref, 16, TMP4);
2187
-        vis_mul8x16(CONST_128, TMP6, TMP6);
2188
-        vis_xor(DST_2, REF_2, TMP8);
2189
-
2190
-        vis_and(TMP8, MASK_fe, TMP8);
2191
-
2192
-        vis_and(DST_0, REF_0, TMP10);
2193
-        vis_ld64_2(dest, stride, DST_0);
2194
-        vis_mul8x16(CONST_128, TMP8, TMP8);
2195
-
2196
-        vis_and(DST_2, REF_2, TMP12);
2197
-        vis_ld64_2(dest, stride_8, DST_2);
2198
-
2199
-        vis_ld64(ref[0], TMP14);
2200
-        vis_and(TMP6, MASK_7f, TMP6);
2201
-
2202
-        vis_and(TMP8, MASK_7f, TMP8);
2203
-
2204
-        vis_padd16(TMP10, TMP6, TMP6);
2205
-        vis_st64(TMP6, dest[0]);
2206
-
2207
-        vis_padd16(TMP12, TMP8, TMP8);
2208
-        vis_st64_2(TMP8, dest, 8);
2209
-
2210
-        dest += stride;
2211
-        vis_faligndata(TMP0, TMP2, REF_0);
2212
-
2213
-        vis_faligndata(TMP2, TMP4, REF_2);
2214
-
2215
-        vis_xor(DST_0, REF_0, TMP20);
2216
-
2217
-        vis_and(TMP20, MASK_fe, TMP20);
2218
-
2219
-        vis_xor(DST_2, REF_2, TMP22);
2220
-        vis_mul8x16(CONST_128, TMP20, TMP20);
2221
-
2222
-        vis_and(TMP22, MASK_fe, TMP22);
2223
-
2224
-        vis_and(DST_0, REF_0, TMP24);
2225
-        vis_mul8x16(CONST_128, TMP22, TMP22);
2226
-
2227
-        vis_and(DST_2, REF_2, TMP26);
2228
-
2229
-        vis_and(TMP20, MASK_7f, TMP20);
2230
-
2231
-        vis_and(TMP22, MASK_7f, TMP22);
2232
-
2233
-        vis_padd16(TMP24, TMP20, TMP20);
2234
-        vis_st64(TMP20, dest[0]);
2235
-
2236
-        vis_padd16(TMP26, TMP22, TMP22);
2237
-        vis_st64_2(TMP22, dest, 8);
2238
-}
2239
-
2240
-static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
2241
-                                      const ptrdiff_t stride, int height)
2242
-{
2243
-        unsigned long off = (unsigned long) ref & 0x7;
2244
-        unsigned long off_plus_1 = off + 1;
2245
-
2246
-        ref = vis_alignaddr(ref);
2247
-
2248
-        vis_ld64(ref[0],    TMP0);
2249
-
2250
-        vis_ld64_2(ref, 8,  TMP2);
2251
-
2252
-        vis_ld64_2(ref, 16, TMP4);
2253
-
2254
-        vis_ld64(constants_fe[0], MASK_fe);
2255
-
2256
-        vis_ld64(constants_7f[0], MASK_7f);
2257
-        vis_faligndata(TMP0, TMP2, REF_0);
2258
-
2259
-        vis_ld64(constants128[0], CONST_128);
2260
-        vis_faligndata(TMP2, TMP4, REF_4);
2261
-
2262
-        if (off != 0x7) {
2263
-                vis_alignaddr_g0((void *)off_plus_1);
2264
-                vis_faligndata(TMP0, TMP2, REF_2);
2265
-                vis_faligndata(TMP2, TMP4, REF_6);
2266
-        } else {
2267
-                vis_src1(TMP2, REF_2);
2268
-                vis_src1(TMP4, REF_6);
2269
-        }
2270
-
2271
-        ref += stride;
2272
-        height = (height >> 1) - 1;
2273
-
2274
-        do {    /* 34 cycles */
2275
-                vis_ld64(ref[0],    TMP0);
2276
-                vis_xor(REF_0, REF_2, TMP6);
2277
-
2278
-                vis_ld64_2(ref, 8,  TMP2);
2279
-                vis_xor(REF_4, REF_6, TMP8);
2280
-
2281
-                vis_ld64_2(ref, 16, TMP4);
2282
-                vis_and(TMP6, MASK_fe, TMP6);
2283
-                ref += stride;
2284
-
2285
-                vis_ld64(ref[0],    TMP14);
2286
-                vis_mul8x16(CONST_128, TMP6, TMP6);
2287
-                vis_and(TMP8, MASK_fe, TMP8);
2288
-
2289
-                vis_ld64_2(ref, 8,  TMP16);
2290
-                vis_mul8x16(CONST_128, TMP8, TMP8);
2291
-                vis_and(REF_0, REF_2, TMP10);
2292
-
2293
-                vis_ld64_2(ref, 16, TMP18);
2294
-                ref += stride;
2295
-                vis_and(REF_4, REF_6, TMP12);
2296
-
2297
-                vis_alignaddr_g0((void *)off);
2298
-
2299
-                vis_faligndata(TMP0, TMP2, REF_0);
2300
-
2301
-                vis_faligndata(TMP2, TMP4, REF_4);
2302
-
2303
-                if (off != 0x7) {
2304
-                        vis_alignaddr_g0((void *)off_plus_1);
2305
-                        vis_faligndata(TMP0, TMP2, REF_2);
2306
-                        vis_faligndata(TMP2, TMP4, REF_6);
2307
-                } else {
2308
-                        vis_src1(TMP2, REF_2);
2309
-                        vis_src1(TMP4, REF_6);
2310
-                }
2311
-
2312
-                vis_and(TMP6, MASK_7f, TMP6);
2313
-
2314
-                vis_and(TMP8, MASK_7f, TMP8);
2315
-
2316
-                vis_padd16(TMP10, TMP6, TMP6);
2317
-                vis_st64(TMP6, dest[0]);
2318
-
2319
-                vis_padd16(TMP12, TMP8, TMP8);
2320
-                vis_st64_2(TMP8, dest, 8);
2321
-                dest += stride;
2322
-
2323
-                vis_xor(REF_0, REF_2, TMP6);
2324
-
2325
-                vis_xor(REF_4, REF_6, TMP8);
2326
-
2327
-                vis_and(TMP6, MASK_fe, TMP6);
2328
-
2329
-                vis_mul8x16(CONST_128, TMP6, TMP6);
2330
-                vis_and(TMP8, MASK_fe, TMP8);
2331
-
2332
-                vis_mul8x16(CONST_128, TMP8, TMP8);
2333
-                vis_and(REF_0, REF_2, TMP10);
2334
-
2335
-                vis_and(REF_4, REF_6, TMP12);
2336
-
2337
-                vis_alignaddr_g0((void *)off);
2338
-
2339
-                vis_faligndata(TMP14, TMP16, REF_0);
2340
-
2341
-                vis_faligndata(TMP16, TMP18, REF_4);
2342
-
2343
-                if (off != 0x7) {
2344
-                        vis_alignaddr_g0((void *)off_plus_1);
2345
-                        vis_faligndata(TMP14, TMP16, REF_2);
2346
-                        vis_faligndata(TMP16, TMP18, REF_6);
2347
-                } else {
2348
-                        vis_src1(TMP16, REF_2);
2349
-                        vis_src1(TMP18, REF_6);
2350
-                }
2351
-
2352
-                vis_and(TMP6, MASK_7f, TMP6);
2353
-
2354
-                vis_and(TMP8, MASK_7f, TMP8);
2355
-
2356
-                vis_padd16(TMP10, TMP6, TMP6);
2357
-                vis_st64(TMP6, dest[0]);
2358
-
2359
-                vis_padd16(TMP12, TMP8, TMP8);
2360
-                vis_st64_2(TMP8, dest, 8);
2361
-                dest += stride;
2362
-        } while (--height);
2363
-
2364
-        vis_ld64(ref[0],    TMP0);
2365
-        vis_xor(REF_0, REF_2, TMP6);
2366
-
2367
-        vis_ld64_2(ref, 8,  TMP2);
2368
-        vis_xor(REF_4, REF_6, TMP8);
2369
-
2370
-        vis_ld64_2(ref, 16, TMP4);
2371
-        vis_and(TMP6, MASK_fe, TMP6);
2372
-
2373
-        vis_mul8x16(CONST_128, TMP6, TMP6);
2374
-        vis_and(TMP8, MASK_fe, TMP8);
2375
-
2376
-        vis_mul8x16(CONST_128, TMP8, TMP8);
2377
-        vis_and(REF_0, REF_2, TMP10);
2378
-
2379
-        vis_and(REF_4, REF_6, TMP12);
2380
-
2381
-        vis_alignaddr_g0((void *)off);
2382
-
2383
-        vis_faligndata(TMP0, TMP2, REF_0);
2384
-
2385
-        vis_faligndata(TMP2, TMP4, REF_4);
2386
-
2387
-        if (off != 0x7) {
2388
-                vis_alignaddr_g0((void *)off_plus_1);
2389
-                vis_faligndata(TMP0, TMP2, REF_2);
2390
-                vis_faligndata(TMP2, TMP4, REF_6);
2391
-        } else {
2392
-                vis_src1(TMP2, REF_2);
2393
-                vis_src1(TMP4, REF_6);
2394
-        }
2395
-
2396
-        vis_and(TMP6, MASK_7f, TMP6);
2397
-
2398
-        vis_and(TMP8, MASK_7f, TMP8);
2399
-
2400
-        vis_padd16(TMP10, TMP6, TMP6);
2401
-        vis_st64(TMP6, dest[0]);
2402
-
2403
-        vis_padd16(TMP12, TMP8, TMP8);
2404
-        vis_st64_2(TMP8, dest, 8);
2405
-        dest += stride;
2406
-
2407
-        vis_xor(REF_0, REF_2, TMP6);
2408
-
2409
-        vis_xor(REF_4, REF_6, TMP8);
2410
-
2411
-        vis_and(TMP6, MASK_fe, TMP6);
2412
-
2413
-        vis_mul8x16(CONST_128, TMP6, TMP6);
2414
-        vis_and(TMP8, MASK_fe, TMP8);
2415
-
2416
-        vis_mul8x16(CONST_128, TMP8, TMP8);
2417
-        vis_and(REF_0, REF_2, TMP10);
2418
-
2419
-        vis_and(REF_4, REF_6, TMP12);
2420
-
2421
-        vis_and(TMP6, MASK_7f, TMP6);
2422
-
2423
-        vis_and(TMP8, MASK_7f, TMP8);
2424
-
2425
-        vis_padd16(TMP10, TMP6, TMP6);
2426
-        vis_st64(TMP6, dest[0]);
2427
-
2428
-        vis_padd16(TMP12, TMP8, TMP8);
2429
-        vis_st64_2(TMP8, dest, 8);
2430
-}
2431
-
2432
-static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
2433
-                                     const ptrdiff_t stride, int height)
2434
-{
2435
-        unsigned long off = (unsigned long) ref & 0x7;
2436
-        unsigned long off_plus_1 = off + 1;
2437
-
2438
-        ref = vis_alignaddr(ref);
2439
-
2440
-        vis_ld64(ref[0], TMP0);
2441
-
2442
-        vis_ld64(ref[8], TMP2);
2443
-
2444
-        vis_ld64(constants_fe[0], MASK_fe);
2445
-
2446
-        vis_ld64(constants_7f[0], MASK_7f);
2447
-
2448
-        vis_ld64(constants128[0], CONST_128);
2449
-        vis_faligndata(TMP0, TMP2, REF_0);
2450
-
2451
-        if (off != 0x7) {
2452
-                vis_alignaddr_g0((void *)off_plus_1);
2453
-                vis_faligndata(TMP0, TMP2, REF_2);
2454
-        } else {
2455
-                vis_src1(TMP2, REF_2);
2456
-        }
2457
-
2458
-        ref += stride;
2459
-        height = (height >> 1) - 1;
2460
-
2461
-        do {    /* 20 cycles */
2462
-                vis_ld64(ref[0], TMP0);
2463
-                vis_xor(REF_0, REF_2, TMP4);
2464
-
2465
-                vis_ld64_2(ref, 8, TMP2);
2466
-                vis_and(TMP4, MASK_fe, TMP4);
2467
-                ref += stride;
2468
-
2469
-                vis_ld64(ref[0], TMP8);
2470
-                vis_and(REF_0, REF_2, TMP6);
2471
-                vis_mul8x16(CONST_128, TMP4, TMP4);
2472
-
2473
-                vis_alignaddr_g0((void *)off);
2474
-
2475
-                vis_ld64_2(ref, 8, TMP10);
2476
-                ref += stride;
2477
-                vis_faligndata(TMP0, TMP2, REF_0);
2478
-
2479
-                if (off != 0x7) {
2480
-                        vis_alignaddr_g0((void *)off_plus_1);
2481
-                        vis_faligndata(TMP0, TMP2, REF_2);
2482
-                } else {
2483
-                        vis_src1(TMP2, REF_2);
2484
-                }
2485
-
2486
-                vis_and(TMP4, MASK_7f, TMP4);
2487
-
2488
-                vis_padd16(TMP6, TMP4, DST_0);
2489
-                vis_st64(DST_0, dest[0]);
2490
-                dest += stride;
2491
-
2492
-                vis_xor(REF_0, REF_2, TMP12);
2493
-
2494
-                vis_and(TMP12, MASK_fe, TMP12);
2495
-
2496
-                vis_and(REF_0, REF_2, TMP14);
2497
-                vis_mul8x16(CONST_128, TMP12, TMP12);
2498
-
2499
-                vis_alignaddr_g0((void *)off);
2500
-                vis_faligndata(TMP8, TMP10, REF_0);
2501
-                if (off != 0x7) {
2502
-                        vis_alignaddr_g0((void *)off_plus_1);
2503
-                        vis_faligndata(TMP8, TMP10, REF_2);
2504
-                } else {
2505
-                        vis_src1(TMP10, REF_2);
2506
-                }
2507
-
2508
-                vis_and(TMP12, MASK_7f, TMP12);
2509
-
2510
-                vis_padd16(TMP14, TMP12, DST_0);
2511
-                vis_st64(DST_0, dest[0]);
2512
-                dest += stride;
2513
-        } while (--height);
2514
-
2515
-        vis_ld64(ref[0], TMP0);
2516
-        vis_xor(REF_0, REF_2, TMP4);
2517
-
2518
-        vis_ld64_2(ref, 8, TMP2);
2519
-        vis_and(TMP4, MASK_fe, TMP4);
2520
-
2521
-        vis_and(REF_0, REF_2, TMP6);
2522
-        vis_mul8x16(CONST_128, TMP4, TMP4);
2523
-
2524
-        vis_alignaddr_g0((void *)off);
2525
-
2526
-        vis_faligndata(TMP0, TMP2, REF_0);
2527
-
2528
-        if (off != 0x7) {
2529
-                vis_alignaddr_g0((void *)off_plus_1);
2530
-                vis_faligndata(TMP0, TMP2, REF_2);
2531
-        } else {
2532
-                vis_src1(TMP2, REF_2);
2533
-        }
2534
-
2535
-        vis_and(TMP4, MASK_7f, TMP4);
2536
-
2537
-        vis_padd16(TMP6, TMP4, DST_0);
2538
-        vis_st64(DST_0, dest[0]);
2539
-        dest += stride;
2540
-
2541
-        vis_xor(REF_0, REF_2, TMP12);
2542
-
2543
-        vis_and(TMP12, MASK_fe, TMP12);
2544
-
2545
-        vis_and(REF_0, REF_2, TMP14);
2546
-        vis_mul8x16(CONST_128, TMP12, TMP12);
2547
-
2548
-        vis_and(TMP12, MASK_7f, TMP12);
2549
-
2550
-        vis_padd16(TMP14, TMP12, DST_0);
2551
-        vis_st64(DST_0, dest[0]);
2552
-        dest += stride;
2553
-}
2554
-
2555
-static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
2556
-                                      const ptrdiff_t stride, int height)
2557
-{
2558
-        unsigned long off = (unsigned long) ref & 0x7;
2559
-        unsigned long off_plus_1 = off + 1;
2560
-
2561
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
2562
-
2563
-        vis_ld64(constants3[0], CONST_3);
2564
-        vis_fzero(ZERO);
2565
-        vis_ld64(constants256_512[0], CONST_256);
2566
-
2567
-        ref = vis_alignaddr(ref);
2568
-        do {    /* 26 cycles */
2569
-                vis_ld64(ref[0], TMP0);
2570
-
2571
-                vis_ld64(ref[8], TMP2);
2572
-
2573
-                vis_alignaddr_g0((void *)off);
2574
-
2575
-                vis_ld64(ref[16], TMP4);
2576
-
2577
-                vis_ld64(dest[0], DST_0);
2578
-                vis_faligndata(TMP0, TMP2, REF_0);
2579
-
2580
-                vis_ld64(dest[8], DST_2);
2581
-                vis_faligndata(TMP2, TMP4, REF_4);
2582
-
2583
-                if (off != 0x7) {
2584
-                        vis_alignaddr_g0((void *)off_plus_1);
2585
-                        vis_faligndata(TMP0, TMP2, REF_2);
2586
-                        vis_faligndata(TMP2, TMP4, REF_6);
2587
-                } else {
2588
-                        vis_src1(TMP2, REF_2);
2589
-                        vis_src1(TMP4, REF_6);
2590
-                }
2591
-
2592
-                vis_mul8x16au(REF_0,   CONST_256, TMP0);
2593
-
2594
-                vis_pmerge(ZERO,     REF_2,     TMP4);
2595
-                vis_mul8x16au(REF_0_1, CONST_256, TMP2);
2596
-
2597
-                vis_pmerge(ZERO, REF_2_1, TMP6);
2598
-
2599
-                vis_padd16(TMP0, TMP4, TMP0);
2600
-
2601
-                vis_mul8x16al(DST_0,   CONST_512, TMP4);
2602
-                vis_padd16(TMP2, TMP6, TMP2);
2603
-
2604
-                vis_mul8x16al(DST_1,   CONST_512, TMP6);
2605
-
2606
-                vis_mul8x16au(REF_6,   CONST_256, TMP12);
2607
-
2608
-                vis_padd16(TMP0, TMP4, TMP0);
2609
-                vis_mul8x16au(REF_6_1, CONST_256, TMP14);
2610
-
2611
-                vis_padd16(TMP2, TMP6, TMP2);
2612
-                vis_mul8x16au(REF_4,   CONST_256, TMP16);
2613
-
2614
-                vis_padd16(TMP0, CONST_3, TMP8);
2615
-                vis_mul8x16au(REF_4_1, CONST_256, TMP18);
2616
-
2617
-                vis_padd16(TMP2, CONST_3, TMP10);
2618
-                vis_pack16(TMP8, DST_0);
2619
-
2620
-                vis_pack16(TMP10, DST_1);
2621
-                vis_padd16(TMP16, TMP12, TMP0);
2622
-
2623
-                vis_st64(DST_0, dest[0]);
2624
-                vis_mul8x16al(DST_2,   CONST_512, TMP4);
2625
-                vis_padd16(TMP18, TMP14, TMP2);
2626
-
2627
-                vis_mul8x16al(DST_3,   CONST_512, TMP6);
2628
-                vis_padd16(TMP0, CONST_3, TMP0);
2629
-
2630
-                vis_padd16(TMP2, CONST_3, TMP2);
2631
-
2632
-                vis_padd16(TMP0, TMP4, TMP0);
2633
-
2634
-                vis_padd16(TMP2, TMP6, TMP2);
2635
-                vis_pack16(TMP0, DST_2);
2636
-
2637
-                vis_pack16(TMP2, DST_3);
2638
-                vis_st64(DST_2, dest[8]);
2639
-
2640
-                ref += stride;
2641
-                dest += stride;
2642
-        } while (--height);
2643
-}
2644
-
2645
-static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
2646
-                                      const ptrdiff_t stride, int height)
2647
-{
2648
-        ref = vis_alignaddr(ref);
2649
-        vis_ld64(ref[0], TMP0);
2650
-
2651
-        vis_ld64_2(ref, 8, TMP2);
2652
-
2653
-        vis_ld64_2(ref, 16, TMP4);
2654
-        ref += stride;
2655
-
2656
-        vis_ld64(ref[0], TMP6);
2657
-        vis_faligndata(TMP0, TMP2, REF_0);
2658
-
2659
-        vis_ld64_2(ref, 8, TMP8);
2660
-        vis_faligndata(TMP2, TMP4, REF_4);
2661
-
2662
-        vis_ld64_2(ref, 16, TMP10);
2663
-        ref += stride;
2664
-
2665
-        vis_ld64(constants_fe[0], MASK_fe);
2666
-        vis_faligndata(TMP6, TMP8, REF_2);
2667
-
2668
-        vis_ld64(constants_7f[0], MASK_7f);
2669
-        vis_faligndata(TMP8, TMP10, REF_6);
2670
-
2671
-        vis_ld64(constants128[0], CONST_128);
2672
-        height = (height >> 1) - 1;
2673
-        do {    /* 24 cycles */
2674
-                vis_ld64(ref[0], TMP0);
2675
-                vis_xor(REF_0, REF_2, TMP12);
2676
-
2677
-                vis_ld64_2(ref, 8, TMP2);
2678
-                vis_xor(REF_4, REF_6, TMP16);
2679
-
2680
-                vis_ld64_2(ref, 16, TMP4);
2681
-                ref += stride;
2682
-                vis_and(REF_0, REF_2, TMP14);
2683
-
2684
-                vis_ld64(ref[0], TMP6);
2685
-                vis_and(REF_4, REF_6, TMP18);
2686
-
2687
-                vis_ld64_2(ref, 8, TMP8);
2688
-                vis_faligndata(TMP0, TMP2, REF_0);
2689
-
2690
-                vis_ld64_2(ref, 16, TMP10);
2691
-                ref += stride;
2692
-                vis_faligndata(TMP2, TMP4, REF_4);
2693
-
2694
-                vis_and(TMP12, MASK_fe, TMP12);
2695
-
2696
-                vis_and(TMP16, MASK_fe, TMP16);
2697
-                vis_mul8x16(CONST_128, TMP12, TMP12);
2698
-
2699
-                vis_mul8x16(CONST_128, TMP16, TMP16);
2700
-                vis_xor(REF_0, REF_2, TMP0);
2701
-
2702
-                vis_xor(REF_4, REF_6, TMP2);
2703
-
2704
-                vis_and(REF_0, REF_2, TMP20);
2705
-
2706
-                vis_and(TMP12, MASK_7f, TMP12);
2707
-
2708
-                vis_and(TMP16, MASK_7f, TMP16);
2709
-
2710
-                vis_padd16(TMP14, TMP12, TMP12);
2711
-                vis_st64(TMP12, dest[0]);
2712
-
2713
-                vis_padd16(TMP18, TMP16, TMP16);
2714
-                vis_st64_2(TMP16, dest, 8);
2715
-                dest += stride;
2716
-
2717
-                vis_and(REF_4, REF_6, TMP18);
2718
-
2719
-                vis_and(TMP0, MASK_fe, TMP0);
2720
-
2721
-                vis_and(TMP2, MASK_fe, TMP2);
2722
-                vis_mul8x16(CONST_128, TMP0, TMP0);
2723
-
2724
-                vis_faligndata(TMP6, TMP8, REF_2);
2725
-                vis_mul8x16(CONST_128, TMP2, TMP2);
2726
-
2727
-                vis_faligndata(TMP8, TMP10, REF_6);
2728
-
2729
-                vis_and(TMP0, MASK_7f, TMP0);
2730
-
2731
-                vis_and(TMP2, MASK_7f, TMP2);
2732
-
2733
-                vis_padd16(TMP20, TMP0, TMP0);
2734
-                vis_st64(TMP0, dest[0]);
2735
-
2736
-                vis_padd16(TMP18, TMP2, TMP2);
2737
-                vis_st64_2(TMP2, dest, 8);
2738
-                dest += stride;
2739
-        } while (--height);
2740
-
2741
-        vis_ld64(ref[0], TMP0);
2742
-        vis_xor(REF_0, REF_2, TMP12);
2743
-
2744
-        vis_ld64_2(ref, 8, TMP2);
2745
-        vis_xor(REF_4, REF_6, TMP16);
2746
-
2747
-        vis_ld64_2(ref, 16, TMP4);
2748
-        vis_and(REF_0, REF_2, TMP14);
2749
-
2750
-        vis_and(REF_4, REF_6, TMP18);
2751
-
2752
-        vis_faligndata(TMP0, TMP2, REF_0);
2753
-
2754
-        vis_faligndata(TMP2, TMP4, REF_4);
2755
-
2756
-        vis_and(TMP12, MASK_fe, TMP12);
2757
-
2758
-        vis_and(TMP16, MASK_fe, TMP16);
2759
-        vis_mul8x16(CONST_128, TMP12, TMP12);
2760
-
2761
-        vis_mul8x16(CONST_128, TMP16, TMP16);
2762
-        vis_xor(REF_0, REF_2, TMP0);
2763
-
2764
-        vis_xor(REF_4, REF_6, TMP2);
2765
-
2766
-        vis_and(REF_0, REF_2, TMP20);
2767
-
2768
-        vis_and(TMP12, MASK_7f, TMP12);
2769
-
2770
-        vis_and(TMP16, MASK_7f, TMP16);
2771
-
2772
-        vis_padd16(TMP14, TMP12, TMP12);
2773
-        vis_st64(TMP12, dest[0]);
2774
-
2775
-        vis_padd16(TMP18, TMP16, TMP16);
2776
-        vis_st64_2(TMP16, dest, 8);
2777
-        dest += stride;
2778
-
2779
-        vis_and(REF_4, REF_6, TMP18);
2780
-
2781
-        vis_and(TMP0, MASK_fe, TMP0);
2782
-
2783
-        vis_and(TMP2, MASK_fe, TMP2);
2784
-        vis_mul8x16(CONST_128, TMP0, TMP0);
2785
-
2786
-        vis_mul8x16(CONST_128, TMP2, TMP2);
2787
-
2788
-        vis_and(TMP0, MASK_7f, TMP0);
2789
-
2790
-        vis_and(TMP2, MASK_7f, TMP2);
2791
-
2792
-        vis_padd16(TMP20, TMP0, TMP0);
2793
-        vis_st64(TMP0, dest[0]);
2794
-
2795
-        vis_padd16(TMP18, TMP2, TMP2);
2796
-        vis_st64_2(TMP2, dest, 8);
2797
-}
2798
-
2799
-static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * ref,
2800
-                                     const ptrdiff_t stride, int height)
2801
-{
2802
-        ref = vis_alignaddr(ref);
2803
-        vis_ld64(ref[0], TMP0);
2804
-
2805
-        vis_ld64_2(ref, 8, TMP2);
2806
-        ref += stride;
2807
-
2808
-        vis_ld64(ref[0], TMP4);
2809
-
2810
-        vis_ld64_2(ref, 8, TMP6);
2811
-        ref += stride;
2812
-
2813
-        vis_ld64(constants_fe[0], MASK_fe);
2814
-        vis_faligndata(TMP0, TMP2, REF_0);
2815
-
2816
-        vis_ld64(constants_7f[0], MASK_7f);
2817
-        vis_faligndata(TMP4, TMP6, REF_2);
2818
-
2819
-        vis_ld64(constants128[0], CONST_128);
2820
-        height = (height >> 1) - 1;
2821
-        do {    /* 12 cycles */
2822
-                vis_ld64(ref[0], TMP0);
2823
-                vis_xor(REF_0, REF_2, TMP4);
2824
-
2825
-                vis_ld64_2(ref, 8, TMP2);
2826
-                ref += stride;
2827
-                vis_and(TMP4, MASK_fe, TMP4);
2828
-
2829
-                vis_and(REF_0, REF_2, TMP6);
2830
-                vis_mul8x16(CONST_128, TMP4, TMP4);
2831
-
2832
-                vis_faligndata(TMP0, TMP2, REF_0);
2833
-                vis_ld64(ref[0], TMP0);
2834
-
2835
-                vis_ld64_2(ref, 8, TMP2);
2836
-                ref += stride;
2837
-                vis_xor(REF_0, REF_2, TMP12);
2838
-
2839
-                vis_and(TMP4, MASK_7f, TMP4);
2840
-
2841
-                vis_and(TMP12, MASK_fe, TMP12);
2842
-
2843
-                vis_mul8x16(CONST_128, TMP12, TMP12);
2844
-                vis_and(REF_0, REF_2, TMP14);
2845
-
2846
-                vis_padd16(TMP6, TMP4, DST_0);
2847
-                vis_st64(DST_0, dest[0]);
2848
-                dest += stride;
2849
-
2850
-                vis_faligndata(TMP0, TMP2, REF_2);
2851
-
2852
-                vis_and(TMP12, MASK_7f, TMP12);
2853
-
2854
-                vis_padd16(TMP14, TMP12, DST_0);
2855
-                vis_st64(DST_0, dest[0]);
2856
-                dest += stride;
2857
-        } while (--height);
2858
-
2859
-        vis_ld64(ref[0], TMP0);
2860
-        vis_xor(REF_0, REF_2, TMP4);
2861
-
2862
-        vis_ld64_2(ref, 8, TMP2);
2863
-        vis_and(TMP4, MASK_fe, TMP4);
2864
-
2865
-        vis_and(REF_0, REF_2, TMP6);
2866
-        vis_mul8x16(CONST_128, TMP4, TMP4);
2867
-
2868
-        vis_faligndata(TMP0, TMP2, REF_0);
2869
-
2870
-        vis_xor(REF_0, REF_2, TMP12);
2871
-
2872
-        vis_and(TMP4, MASK_7f, TMP4);
2873
-
2874
-        vis_and(TMP12, MASK_fe, TMP12);
2875
-
2876
-        vis_mul8x16(CONST_128, TMP12, TMP12);
2877
-        vis_and(REF_0, REF_2, TMP14);
2878
-
2879
-        vis_padd16(TMP6, TMP4, DST_0);
2880
-        vis_st64(DST_0, dest[0]);
2881
-        dest += stride;
2882
-
2883
-        vis_and(TMP12, MASK_7f, TMP12);
2884
-
2885
-        vis_padd16(TMP14, TMP12, DST_0);
2886
-        vis_st64(DST_0, dest[0]);
2887
-}
2888
-
2889
-static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
2890
-                                      const ptrdiff_t stride, int height)
2891
-{
2892
-        int stride_8 = stride + 8;
2893
-        int stride_16 = stride + 16;
2894
-
2895
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
2896
-
2897
-        ref = vis_alignaddr(ref);
2898
-
2899
-        vis_ld64(ref[ 0], TMP0);
2900
-        vis_fzero(ZERO);
2901
-
2902
-        vis_ld64(ref[ 8], TMP2);
2903
-
2904
-        vis_ld64(ref[16], TMP4);
2905
-
2906
-        vis_ld64(constants3[0], CONST_3);
2907
-        vis_faligndata(TMP0, TMP2, REF_2);
2908
-
2909
-        vis_ld64(constants256_512[0], CONST_256);
2910
-        vis_faligndata(TMP2, TMP4, REF_6);
2911
-        height >>= 1;
2912
-
2913
-        do {    /* 31 cycles */
2914
-                vis_ld64_2(ref, stride, TMP0);
2915
-                vis_pmerge(ZERO,       REF_2,     TMP12);
2916
-                vis_mul8x16au(REF_2_1, CONST_256, TMP14);
2917
-
2918
-                vis_ld64_2(ref, stride_8, TMP2);
2919
-                vis_pmerge(ZERO,       REF_6,     TMP16);
2920
-                vis_mul8x16au(REF_6_1, CONST_256, TMP18);
2921
-
2922
-                vis_ld64_2(ref, stride_16, TMP4);
2923
-                ref += stride;
2924
-
2925
-                vis_ld64(dest[0], DST_0);
2926
-                vis_faligndata(TMP0, TMP2, REF_0);
2927
-
2928
-                vis_ld64_2(dest, 8, DST_2);
2929
-                vis_faligndata(TMP2, TMP4, REF_4);
2930
-
2931
-                vis_ld64_2(ref, stride, TMP6);
2932
-                vis_pmerge(ZERO,     REF_0,     TMP0);
2933
-                vis_mul8x16au(REF_0_1, CONST_256, TMP2);
2934
-
2935
-                vis_ld64_2(ref, stride_8, TMP8);
2936
-                vis_pmerge(ZERO,     REF_4,     TMP4);
2937
-
2938
-                vis_ld64_2(ref, stride_16, TMP10);
2939
-                ref += stride;
2940
-
2941
-                vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
2942
-                vis_faligndata(TMP6, TMP8, REF_2);
2943
-                vis_mul8x16au(REF_4_1, CONST_256, TMP6);
2944
-
2945
-                vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
2946
-                vis_faligndata(TMP8, TMP10, REF_6);
2947
-                vis_mul8x16al(DST_0,   CONST_512, TMP20);
2948
-
2949
-                vis_padd16(TMP0, CONST_3, TMP0);
2950
-                vis_mul8x16al(DST_1,   CONST_512, TMP22);
2951
-
2952
-                vis_padd16(TMP2, CONST_3, TMP2);
2953
-                vis_mul8x16al(DST_2,   CONST_512, TMP24);
2954
-
2955
-                vis_padd16(TMP4, CONST_3, TMP4);
2956
-                vis_mul8x16al(DST_3,   CONST_512, TMP26);
2957
-
2958
-                vis_padd16(TMP6, CONST_3, TMP6);
2959
-
2960
-                vis_padd16(TMP12, TMP20, TMP12);
2961
-                vis_mul8x16al(REF_S0,   CONST_512, TMP20);
2962
-
2963
-                vis_padd16(TMP14, TMP22, TMP14);
2964
-                vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
2965
-
2966
-                vis_padd16(TMP16, TMP24, TMP16);
2967
-                vis_mul8x16al(REF_S2,   CONST_512, TMP24);
2968
-
2969
-                vis_padd16(TMP18, TMP26, TMP18);
2970
-                vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
2971
-
2972
-                vis_padd16(TMP12, TMP0, TMP12);
2973
-                vis_mul8x16au(REF_2,   CONST_256, TMP28);
2974
-
2975
-                vis_padd16(TMP14, TMP2, TMP14);
2976
-                vis_mul8x16au(REF_2_1, CONST_256, TMP30);
2977
-
2978
-                vis_padd16(TMP16, TMP4, TMP16);
2979
-                vis_mul8x16au(REF_6,   CONST_256, REF_S4);
2980
-
2981
-                vis_padd16(TMP18, TMP6, TMP18);
2982
-                vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
2983
-
2984
-                vis_pack16(TMP12, DST_0);
2985
-                vis_padd16(TMP28, TMP0, TMP12);
2986
-
2987
-                vis_pack16(TMP14, DST_1);
2988
-                vis_st64(DST_0, dest[0]);
2989
-                vis_padd16(TMP30, TMP2, TMP14);
2990
-
2991
-                vis_pack16(TMP16, DST_2);
2992
-                vis_padd16(REF_S4, TMP4, TMP16);
2993
-
2994
-                vis_pack16(TMP18, DST_3);
2995
-                vis_st64_2(DST_2, dest, 8);
2996
-                dest += stride;
2997
-                vis_padd16(REF_S6, TMP6, TMP18);
2998
-
2999
-                vis_padd16(TMP12, TMP20, TMP12);
3000
-
3001
-                vis_padd16(TMP14, TMP22, TMP14);
3002
-                vis_pack16(TMP12, DST_0);
3003
-
3004
-                vis_padd16(TMP16, TMP24, TMP16);
3005
-                vis_pack16(TMP14, DST_1);
3006
-                vis_st64(DST_0, dest[0]);
3007
-
3008
-                vis_padd16(TMP18, TMP26, TMP18);
3009
-                vis_pack16(TMP16, DST_2);
3010
-
3011
-                vis_pack16(TMP18, DST_3);
3012
-                vis_st64_2(DST_2, dest, 8);
3013
-                dest += stride;
3014
-        } while (--height);
3015
-}
3016
-
3017
-static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
3018
-                                       const ptrdiff_t stride, int height)
3019
-{
3020
-        unsigned long off = (unsigned long) ref & 0x7;
3021
-        unsigned long off_plus_1 = off + 1;
3022
-        int stride_8 = stride + 8;
3023
-        int stride_16 = stride + 16;
3024
-
3025
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
3026
-
3027
-        ref = vis_alignaddr(ref);
3028
-
3029
-        vis_ld64(ref[ 0], TMP0);
3030
-        vis_fzero(ZERO);
3031
-
3032
-        vis_ld64(ref[ 8], TMP2);
3033
-
3034
-        vis_ld64(ref[16], TMP4);
3035
-
3036
-        vis_ld64(constants1[0], CONST_1);
3037
-        vis_faligndata(TMP0, TMP2, REF_S0);
3038
-
3039
-        vis_ld64(constants256_512[0], CONST_256);
3040
-        vis_faligndata(TMP2, TMP4, REF_S4);
3041
-
3042
-        if (off != 0x7) {
3043
-                vis_alignaddr_g0((void *)off_plus_1);
3044
-                vis_faligndata(TMP0, TMP2, REF_S2);
3045
-                vis_faligndata(TMP2, TMP4, REF_S6);
3046
-        } else {
3047
-                vis_src1(TMP2, REF_S2);
3048
-                vis_src1(TMP4, REF_S6);
3049
-        }
3050
-
3051
-        height >>= 1;
3052
-        do {
3053
-                vis_ld64_2(ref, stride, TMP0);
3054
-                vis_mul8x16au(REF_S0, CONST_256, TMP12);
3055
-                vis_pmerge(ZERO,      REF_S0_1,  TMP14);
3056
-
3057
-                vis_alignaddr_g0((void *)off);
3058
-
3059
-                vis_ld64_2(ref, stride_8, TMP2);
3060
-                vis_mul8x16au(REF_S2, CONST_256, TMP16);
3061
-                vis_pmerge(ZERO,      REF_S2_1,  TMP18);
3062
-
3063
-                vis_ld64_2(ref, stride_16, TMP4);
3064
-                ref += stride;
3065
-                vis_mul8x16au(REF_S4, CONST_256, TMP20);
3066
-                vis_pmerge(ZERO,      REF_S4_1,  TMP22);
3067
-
3068
-                vis_ld64_2(ref, stride, TMP6);
3069
-                vis_mul8x16au(REF_S6, CONST_256, TMP24);
3070
-                vis_pmerge(ZERO,      REF_S6_1,  TMP26);
3071
-
3072
-                vis_ld64_2(ref, stride_8, TMP8);
3073
-                vis_faligndata(TMP0, TMP2, REF_0);
3074
-
3075
-                vis_ld64_2(ref, stride_16, TMP10);
3076
-                ref += stride;
3077
-                vis_faligndata(TMP2, TMP4, REF_4);
3078
-
3079
-                vis_faligndata(TMP6, TMP8, REF_S0);
3080
-
3081
-                vis_faligndata(TMP8, TMP10, REF_S4);
3082
-
3083
-                if (off != 0x7) {
3084
-                        vis_alignaddr_g0((void *)off_plus_1);
3085
-                        vis_faligndata(TMP0, TMP2, REF_2);
3086
-                        vis_faligndata(TMP2, TMP4, REF_6);
3087
-                        vis_faligndata(TMP6, TMP8, REF_S2);
3088
-                        vis_faligndata(TMP8, TMP10, REF_S6);
3089
-                } else {
3090
-                        vis_src1(TMP2, REF_2);
3091
-                        vis_src1(TMP4, REF_6);
3092
-                        vis_src1(TMP8, REF_S2);
3093
-                        vis_src1(TMP10, REF_S6);
3094
-                }
3095
-
3096
-                vis_mul8x16au(REF_0, CONST_256, TMP0);
3097
-                vis_pmerge(ZERO,      REF_0_1,  TMP2);
3098
-
3099
-                vis_mul8x16au(REF_2, CONST_256, TMP4);
3100
-                vis_pmerge(ZERO,      REF_2_1,  TMP6);
3101
-
3102
-                vis_padd16(TMP0, CONST_2, TMP8);
3103
-                vis_mul8x16au(REF_4, CONST_256, TMP0);
3104
-
3105
-                vis_padd16(TMP2, CONST_1, TMP10);
3106
-                vis_mul8x16au(REF_4_1, CONST_256, TMP2);
3107
-
3108
-                vis_padd16(TMP8, TMP4, TMP8);
3109
-                vis_mul8x16au(REF_6, CONST_256, TMP4);
3110
-
3111
-                vis_padd16(TMP10, TMP6, TMP10);
3112
-                vis_mul8x16au(REF_6_1, CONST_256, TMP6);
3113
-
3114
-                vis_padd16(TMP12, TMP8, TMP12);
3115
-
3116
-                vis_padd16(TMP14, TMP10, TMP14);
3117
-
3118
-                vis_padd16(TMP12, TMP16, TMP12);
3119
-
3120
-                vis_padd16(TMP14, TMP18, TMP14);
3121
-                vis_pack16(TMP12, DST_0);
3122
-
3123
-                vis_pack16(TMP14, DST_1);
3124
-                vis_st64(DST_0, dest[0]);
3125
-                vis_padd16(TMP0, CONST_1, TMP12);
3126
-
3127
-                vis_mul8x16au(REF_S0, CONST_256, TMP0);
3128
-                vis_padd16(TMP2, CONST_1, TMP14);
3129
-
3130
-                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
3131
-                vis_padd16(TMP12, TMP4, TMP12);
3132
-
3133
-                vis_mul8x16au(REF_S2, CONST_256, TMP4);
3134
-                vis_padd16(TMP14, TMP6, TMP14);
3135
-
3136
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
3137
-                vis_padd16(TMP20, TMP12, TMP20);
3138
-
3139
-                vis_padd16(TMP22, TMP14, TMP22);
3140
-
3141
-                vis_padd16(TMP20, TMP24, TMP20);
3142
-
3143
-                vis_padd16(TMP22, TMP26, TMP22);
3144
-                vis_pack16(TMP20, DST_2);
3145
-
3146
-                vis_pack16(TMP22, DST_3);
3147
-                vis_st64_2(DST_2, dest, 8);
3148
-                dest += stride;
3149
-                vis_padd16(TMP0, TMP4, TMP24);
3150
-
3151
-                vis_mul8x16au(REF_S4, CONST_256, TMP0);
3152
-                vis_padd16(TMP2, TMP6, TMP26);
3153
-
3154
-                vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
3155
-                vis_padd16(TMP24, TMP8, TMP24);
3156
-
3157
-                vis_padd16(TMP26, TMP10, TMP26);
3158
-                vis_pack16(TMP24, DST_0);
3159
-
3160
-                vis_pack16(TMP26, DST_1);
3161
-                vis_st64(DST_0, dest[0]);
3162
-                vis_pmerge(ZERO, REF_S6, TMP4);
3163
-
3164
-                vis_pmerge(ZERO,      REF_S6_1,  TMP6);
3165
-
3166
-                vis_padd16(TMP0, TMP4, TMP0);
3167
-
3168
-                vis_padd16(TMP2, TMP6, TMP2);
3169
-
3170
-                vis_padd16(TMP0, TMP12, TMP0);
3171
-
3172
-                vis_padd16(TMP2, TMP14, TMP2);
3173
-                vis_pack16(TMP0, DST_2);
3174
-
3175
-                vis_pack16(TMP2, DST_3);
3176
-                vis_st64_2(DST_2, dest, 8);
3177
-                dest += stride;
3178
-        } while (--height);
3179
-}
3180
-
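The off / off_plus_1 bookkeeping in these xy functions extracts both the
unaligned reference row and its one-byte-shifted neighbour (needed for the
half-pel interpolation) from the same pair of aligned loads. A scalar sketch
of the alignaddr/faligndata idiom, assuming big-endian byte order as on SPARC:

    #include <stdint.h>
    #include <string.h>

    /* Read 8 bytes from an arbitrary address using only aligned 64-bit loads:
     * alignaddr computes the rounded-down base and the offset, faligndata
     * funnel-shifts the two aligned halves together. */
    static uint64_t load64_faligned(const uint8_t *p)
    {
        const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)7);
        unsigned off = (uintptr_t)p & 7;
        uint64_t lo, hi;
        memcpy(&lo, base, 8);       /* first aligned chunk  */
        memcpy(&hi, base + 8, 8);   /* second aligned chunk */
        return off ? (lo << (8 * off)) | (hi >> (8 * (8 - off))) : lo;
    }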
3181
-static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * ref,
3182
-                                      const ptrdiff_t stride, int height)
3183
-{
3184
-        unsigned long off = (unsigned long) ref & 0x7;
3185
-        unsigned long off_plus_1 = off + 1;
3186
-        int stride_8 = stride + 8;
3187
-
3188
-        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
3189
-
3190
-        ref = vis_alignaddr(ref);
3191
-
3192
-        vis_ld64(ref[ 0], TMP0);
3193
-        vis_fzero(ZERO);
3194
-
3195
-        vis_ld64(ref[ 8], TMP2);
3196
-
3197
-        vis_ld64(constants1[0], CONST_1);
3198
-
3199
-        vis_ld64(constants256_512[0], CONST_256);
3200
-        vis_faligndata(TMP0, TMP2, REF_S0);
3201
-
3202
-        if (off != 0x7) {
3203
-                vis_alignaddr_g0((void *)off_plus_1);
3204
-                vis_faligndata(TMP0, TMP2, REF_S2);
3205
-        } else {
3206
-                vis_src1(TMP2, REF_S2);
3207
-        }
3208
-
3209
-        height >>= 1;
3210
-        do {    /* 26 cycles */
3211
-                vis_ld64_2(ref, stride, TMP0);
3212
-                vis_mul8x16au(REF_S0,   CONST_256, TMP8);
3213
-                vis_pmerge(ZERO,        REF_S2,    TMP12);
3214
-
3215
-                vis_alignaddr_g0((void *)off);
3216
-
3217
-                vis_ld64_2(ref, stride_8, TMP2);
3218
-                ref += stride;
3219
-                vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
3220
-                vis_pmerge(ZERO,        REF_S2_1,  TMP14);
3221
-
3222
-                vis_ld64_2(ref, stride, TMP4);
3223
-
3224
-                vis_ld64_2(ref, stride_8, TMP6);
3225
-                ref += stride;
3226
-                vis_faligndata(TMP0, TMP2, REF_S4);
3227
-
3228
-                vis_pmerge(ZERO, REF_S4, TMP18);
3229
-
3230
-                vis_pmerge(ZERO, REF_S4_1, TMP20);
3231
-
3232
-                vis_faligndata(TMP4, TMP6, REF_S0);
3233
-
3234
-                if (off != 0x7) {
3235
-                        vis_alignaddr_g0((void *)off_plus_1);
3236
-                        vis_faligndata(TMP0, TMP2, REF_S6);
3237
-                        vis_faligndata(TMP4, TMP6, REF_S2);
3238
-                } else {
3239
-                        vis_src1(TMP2, REF_S6);
3240
-                        vis_src1(TMP6, REF_S2);
3241
-                }
3242
-
3243
-                vis_padd16(TMP18, CONST_1, TMP18);
3244
-                vis_mul8x16au(REF_S6,   CONST_256, TMP22);
3245
-
3246
-                vis_padd16(TMP20, CONST_1, TMP20);
3247
-                vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
3248
-
3249
-                vis_mul8x16au(REF_S0,   CONST_256, TMP26);
3250
-                vis_pmerge(ZERO, REF_S0_1, TMP28);
3251
-
3252
-                vis_mul8x16au(REF_S2,   CONST_256, TMP30);
3253
-                vis_padd16(TMP18, TMP22, TMP18);
3254
-
3255
-                vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
3256
-                vis_padd16(TMP20, TMP24, TMP20);
3257
-
3258
-                vis_padd16(TMP8,  TMP18, TMP8);
3259
-
3260
-                vis_padd16(TMP10, TMP20, TMP10);
3261
-
3262
-                vis_padd16(TMP8,  TMP12, TMP8);
3263
-
3264
-                vis_padd16(TMP10, TMP14, TMP10);
3265
-                vis_pack16(TMP8,  DST_0);
3266
-
3267
-                vis_pack16(TMP10, DST_1);
3268
-                vis_st64(DST_0, dest[0]);
3269
-                dest += stride;
3270
-                vis_padd16(TMP18, TMP26, TMP18);
3271
-
3272
-                vis_padd16(TMP20, TMP28, TMP20);
3273
-
3274
-                vis_padd16(TMP18, TMP30, TMP18);
3275
-
3276
-                vis_padd16(TMP20, TMP32, TMP20);
3277
-                vis_pack16(TMP18, DST_2);
3278
-
3279
-                vis_pack16(TMP20, DST_3);
3280
-                vis_st64(DST_2, dest[0]);
3281
-                dest += stride;
3282
-        } while (--height);
3283
-}
3284
-
3285
-static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
3286
-                                       const ptrdiff_t stride, int height)
3287
-{
3288
-        unsigned long off = (unsigned long) ref & 0x7;
3289
-        unsigned long off_plus_1 = off + 1;
3290
-        int stride_8 = stride + 8;
3291
-        int stride_16 = stride + 16;
3292
-
3293
-        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
3294
-
3295
-        ref = vis_alignaddr(ref);
3296
-
3297
-        vis_ld64(ref[ 0], TMP0);
3298
-        vis_fzero(ZERO);
3299
-
3300
-        vis_ld64(ref[ 8], TMP2);
3301
-
3302
-        vis_ld64(ref[16], TMP4);
3303
-
3304
-        vis_ld64(constants6[0], CONST_6);
3305
-        vis_faligndata(TMP0, TMP2, REF_S0);
3306
-
3307
-        vis_ld64(constants256_1024[0], CONST_256);
3308
-        vis_faligndata(TMP2, TMP4, REF_S4);
3309
-
3310
-        if (off != 0x7) {
3311
-                vis_alignaddr_g0((void *)off_plus_1);
3312
-                vis_faligndata(TMP0, TMP2, REF_S2);
3313
-                vis_faligndata(TMP2, TMP4, REF_S6);
3314
-        } else {
3315
-                vis_src1(TMP2, REF_S2);
3316
-                vis_src1(TMP4, REF_S6);
3317
-        }
3318
-
3319
-        height >>= 1;
3320
-        do {    /* 55 cycles */
3321
-                vis_ld64_2(ref, stride, TMP0);
3322
-                vis_mul8x16au(REF_S0, CONST_256, TMP12);
3323
-                vis_pmerge(ZERO,      REF_S0_1,  TMP14);
3324
-
3325
-                vis_alignaddr_g0((void *)off);
3326
-
3327
-                vis_ld64_2(ref, stride_8, TMP2);
3328
-                vis_mul8x16au(REF_S2, CONST_256, TMP16);
3329
-                vis_pmerge(ZERO,      REF_S2_1,  TMP18);
3330
-
3331
-                vis_ld64_2(ref, stride_16, TMP4);
3332
-                ref += stride;
3333
-                vis_mul8x16au(REF_S4, CONST_256, TMP20);
3334
-                vis_pmerge(ZERO,      REF_S4_1,  TMP22);
3335
-
3336
-                vis_ld64_2(ref, stride, TMP6);
3337
-                vis_mul8x16au(REF_S6, CONST_256, TMP24);
3338
-                vis_pmerge(ZERO,      REF_S6_1,  TMP26);
3339
-
3340
-                vis_ld64_2(ref, stride_8, TMP8);
3341
-                vis_faligndata(TMP0, TMP2, REF_0);
3342
-
3343
-                vis_ld64_2(ref, stride_16, TMP10);
3344
-                ref += stride;
3345
-                vis_faligndata(TMP2, TMP4, REF_4);
3346
-
3347
-                vis_ld64(dest[0], DST_0);
3348
-                vis_faligndata(TMP6, TMP8, REF_S0);
3349
-
3350
-                vis_ld64_2(dest, 8, DST_2);
3351
-                vis_faligndata(TMP8, TMP10, REF_S4);
3352
-
3353
-                if (off != 0x7) {
3354
-                        vis_alignaddr_g0((void *)off_plus_1);
3355
-                        vis_faligndata(TMP0, TMP2, REF_2);
3356
-                        vis_faligndata(TMP2, TMP4, REF_6);
3357
-                        vis_faligndata(TMP6, TMP8, REF_S2);
3358
-                        vis_faligndata(TMP8, TMP10, REF_S6);
3359
-                } else {
3360
-                        vis_src1(TMP2, REF_2);
3361
-                        vis_src1(TMP4, REF_6);
3362
-                        vis_src1(TMP8, REF_S2);
3363
-                        vis_src1(TMP10, REF_S6);
3364
-                }
3365
-
3366
-                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
3367
-                vis_pmerge(ZERO, REF_0, TMP0);
3368
-
3369
-                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
3370
-                vis_pmerge(ZERO,      REF_0_1,  TMP2);
3371
-
3372
-                vis_mul8x16au(REF_2, CONST_256, TMP4);
3373
-                vis_pmerge(ZERO,      REF_2_1,  TMP6);
3374
-
3375
-                vis_mul8x16al(DST_2,   CONST_1024, REF_0);
3376
-                vis_padd16(TMP0, CONST_6, TMP0);
3377
-
3378
-                vis_mul8x16al(DST_3,   CONST_1024, REF_2);
3379
-                vis_padd16(TMP2, CONST_6, TMP2);
3380
-
3381
-                vis_padd16(TMP0, TMP4, TMP0);
3382
-                vis_mul8x16au(REF_4, CONST_256, TMP4);
3383
-
3384
-                vis_padd16(TMP2, TMP6, TMP2);
3385
-                vis_mul8x16au(REF_4_1, CONST_256, TMP6);
3386
-
3387
-                vis_padd16(TMP12, TMP0, TMP12);
3388
-                vis_mul8x16au(REF_6, CONST_256, TMP8);
3389
-
3390
-                vis_padd16(TMP14, TMP2, TMP14);
3391
-                vis_mul8x16au(REF_6_1, CONST_256, TMP10);
3392
-
3393
-                vis_padd16(TMP12, TMP16, TMP12);
3394
-                vis_mul8x16au(REF_S0, CONST_256, REF_4);
3395
-
3396
-                vis_padd16(TMP14, TMP18, TMP14);
3397
-                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
3398
-
3399
-                vis_padd16(TMP12, TMP30, TMP12);
3400
-
3401
-                vis_padd16(TMP14, TMP32, TMP14);
3402
-                vis_pack16(TMP12, DST_0);
3403
-
3404
-                vis_pack16(TMP14, DST_1);
3405
-                vis_st64(DST_0, dest[0]);
3406
-                vis_padd16(TMP4, CONST_6, TMP4);
3407
-
3408
-                vis_ld64_2(dest, stride, DST_0);
3409
-                vis_padd16(TMP6, CONST_6, TMP6);
3410
-                vis_mul8x16au(REF_S2, CONST_256, TMP12);
3411
-
3412
-                vis_padd16(TMP4, TMP8, TMP4);
3413
-                vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
3414
-
3415
-                vis_padd16(TMP6, TMP10, TMP6);
3416
-
3417
-                vis_padd16(TMP20, TMP4, TMP20);
3418
-
3419
-                vis_padd16(TMP22, TMP6, TMP22);
3420
-
3421
-                vis_padd16(TMP20, TMP24, TMP20);
3422
-
3423
-                vis_padd16(TMP22, TMP26, TMP22);
3424
-
3425
-                vis_padd16(TMP20, REF_0, TMP20);
3426
-                vis_mul8x16au(REF_S4, CONST_256, REF_0);
3427
-
3428
-                vis_padd16(TMP22, REF_2, TMP22);
3429
-                vis_pack16(TMP20, DST_2);
3430
-
3431
-                vis_pack16(TMP22, DST_3);
3432
-                vis_st64_2(DST_2, dest, 8);
3433
-                dest += stride;
3434
-
3435
-                vis_ld64_2(dest, 8, DST_2);
3436
-                vis_mul8x16al(DST_0,   CONST_1024, TMP30);
3437
-                vis_pmerge(ZERO,      REF_S4_1,  REF_2);
3438
-
3439
-                vis_mul8x16al(DST_1,   CONST_1024, TMP32);
3440
-                vis_padd16(REF_4, TMP0, TMP8);
3441
-
3442
-                vis_mul8x16au(REF_S6, CONST_256, REF_4);
3443
-                vis_padd16(REF_6, TMP2, TMP10);
3444
-
3445
-                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
3446
-                vis_padd16(TMP8, TMP12, TMP8);
3447
-
3448
-                vis_padd16(TMP10, TMP14, TMP10);
3449
-
3450
-                vis_padd16(TMP8, TMP30, TMP8);
3451
-
3452
-                vis_padd16(TMP10, TMP32, TMP10);
3453
-                vis_pack16(TMP8, DST_0);
3454
-
3455
-                vis_pack16(TMP10, DST_1);
3456
-                vis_st64(DST_0, dest[0]);
3457
-
3458
-                vis_padd16(REF_0, TMP4, REF_0);
3459
-
3460
-                vis_mul8x16al(DST_2,   CONST_1024, TMP30);
3461
-                vis_padd16(REF_2, TMP6, REF_2);
3462
-
3463
-                vis_mul8x16al(DST_3,   CONST_1024, TMP32);
3464
-                vis_padd16(REF_0, REF_4, REF_0);
3465
-
3466
-                vis_padd16(REF_2, REF_6, REF_2);
3467
-
3468
-                vis_padd16(REF_0, TMP30, REF_0);
3469
-
3470
-                /* stall */
3471
-
3472
-                vis_padd16(REF_2, TMP32, REF_2);
3473
-                vis_pack16(REF_0, DST_2);
3474
-
3475
-                vis_pack16(REF_2, DST_3);
3476
-                vis_st64_2(DST_2, dest, 8);
3477
-                dest += stride;
3478
-        } while (--height);
3479
-}
3480
-
3481
-/* End of no rounding code */
3482
-
3483
-av_cold void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags)
3484
-{
3485
-  /* VIS-specific optimizations */
3486
-  int accel = vis_level ();
3487
-
3488
-  if (accel & ACCEL_SPARC_VIS) {
3489
-      c->put_pixels_tab[0][0] = MC_put_o_16_vis;
3490
-      c->put_pixels_tab[0][1] = MC_put_x_16_vis;
3491
-      c->put_pixels_tab[0][2] = MC_put_y_16_vis;
3492
-      c->put_pixels_tab[0][3] = MC_put_xy_16_vis;
3493
-
3494
-      c->put_pixels_tab[1][0] = MC_put_o_8_vis;
3495
-      c->put_pixels_tab[1][1] = MC_put_x_8_vis;
3496
-      c->put_pixels_tab[1][2] = MC_put_y_8_vis;
3497
-      c->put_pixels_tab[1][3] = MC_put_xy_8_vis;
3498
-
3499
-      c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
3500
-      c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
3501
-      c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
3502
-      c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;
3503
-
3504
-      c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
3505
-      c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
3506
-      c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
3507
-      c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;
3508
-
3509
-      c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
3510
-      c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
3511
-      c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
3512
-      c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;
3513
-
3514
-      c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
3515
-      c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
3516
-      c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
3517
-      c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;
3518
-
3519
-      c->avg_no_rnd_pixels_tab[0] = MC_avg_no_round_o_16_vis;
3520
-      c->avg_no_rnd_pixels_tab[1] = MC_avg_no_round_x_16_vis;
3521
-      c->avg_no_rnd_pixels_tab[2] = MC_avg_no_round_y_16_vis;
3522
-      c->avg_no_rnd_pixels_tab[3] = MC_avg_no_round_xy_16_vis;
3523
-  }
3524
-}
3525 1
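The init routine above is the usual runtime-dispatch pattern: probe the CPU
once, then overwrite entries of a function-pointer table with the accelerated
versions. Schematically (illustrative names, not the real HpelDSPContext
layout):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef void (*op_pixels_func)(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int h);

    /* Portable fallback, always valid. */
    static void put_pixels16_c(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h)
    {
        while (h--) {
            memcpy(dst, src, 16);
            dst += stride;
            src += stride;
        }
    }

    static void init_put_table(op_pixels_func *tab, int accel,
                               op_pixels_func vis_version)
    {
        tab[0] = put_pixels16_c;   /* install the C version first      */
        if (accel & 1)             /* e.g. accel & ACCEL_SPARC_VIS     */
            tab[0] = vis_version;  /* then swap in the accelerated one */
    }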
deleted file mode 100644
... ...
@@ -1,531 +0,0 @@
1
-/*
2
- * SPARC VIS optimized inverse DCT
3
- * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu >
4
- *
5
- * I did consult the following fine web page about the DCT:
6
- * http://www.geocities.com/ssavekar/dct.htm
7
- *
8
- * This file is part of Libav.
9
- *
10
- * Libav is free software; you can redistribute it and/or
11
- * modify it under the terms of the GNU Lesser General Public
12
- * License as published by the Free Software Foundation; either
13
- * version 2.1 of the License, or (at your option) any later version.
14
- *
15
- * Libav is distributed in the hope that it will be useful,
16
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
- * Lesser General Public License for more details.
19
- *
20
- * You should have received a copy of the GNU Lesser General Public
21
- * License along with Libav; if not, write to the Free Software
22
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
- */
24
-
25
-#include <stdint.h>
26
-
27
-#include "dsputil_vis.h"
28
-#include "libavutil/mem.h"
29
-
30
-static const DECLARE_ALIGNED(8, int16_t, coeffs)[28] = {
31
-    - 1259,- 1259,- 1259,- 1259,
32
-    - 4989,- 4989,- 4989,- 4989,
33
-    -11045,-11045,-11045,-11045,
34
-    -19195,-19195,-19195,-19195,
35
-    -29126,-29126,-29126,-29126,
36
-     25080, 25080, 25080, 25080,
37
-     12785, 12785, 12785, 12785
38
-};
39
-static const DECLARE_ALIGNED(8, uint16_t, scale)[4] = {
40
-    65536>>6, 65536>>6, 65536>>6, 65536>>6
41
-};
42
-static const DECLARE_ALIGNED(8, uint16_t, rounder)[4] = {
43
-    1<<5, 1<<5, 1<<5, 1<<5
44
-};
45
-static const DECLARE_ALIGNED(8, uint16_t, expand)[4] = {
46
-    1<<14, 1<<14, 1<<14, 1<<14
47
-};
48
-
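The coeffs table decodes as cosine constants in 16-bit fixed point (a reading
of the values, not documented in the original): rows 0-4 hold
(cos(k*pi/16) - 1) * 65536 for k = 1..5, and the last two rows hold
cos(6*pi/16) * 65536 and cos(7*pi/16) * 65536. Storing cos - 1 for the large
coefficients lets the high-half fractional multiply below be followed by a
plain add of the unscaled input to recover x*cos. A quick check that
reproduces the rows to within +/-1 of rounding:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = 3.14159265358979323846;
        for (int k = 1; k <= 7; k++) {
            double c = cos(k * pi / 16);
            printf("k=%d: %6.0f\n", k, (k <= 5 ? c - 1.0 : c) * 65536);
        }
        return 0;
    }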
49
-#define INIT_IDCT \
50
-        "ldd [%1], %%f32         \n\t"\
51
-        "ldd [%1+8], %%f34       \n\t"\
52
-        "ldd [%1+16], %%f36      \n\t"\
53
-        "ldd [%1+24], %%f38      \n\t"\
54
-        "ldd [%1+32], %%f40      \n\t"\
55
-        "ldd [%1+40], %%f42      \n\t"\
56
-        "ldd [%1+48], %%f44      \n\t"\
57
-        "ldd [%0], %%f46         \n\t"\
58
-        "fzero %%f62             \n\t"\
59
-
60
-#define LOADSCALE(in) \
61
-        "ldd [" in "], %%f0          \n\t"\
62
-        "ldd [" in "+16], %%f2       \n\t"\
63
-        "ldd [" in "+32], %%f4       \n\t"\
64
-        "ldd [" in "+48], %%f6       \n\t"\
65
-        "ldd [" in "+64], %%f8       \n\t"\
66
-        "ldd [" in "+80], %%f10      \n\t"\
67
-        "ldd [" in "+96], %%f12      \n\t"\
68
-        "ldd [" in "+112], %%f14     \n\t"\
69
-        "fpadd16 %%f0, %%f0, %%f0    \n\t"\
70
-        "fpadd16 %%f2, %%f2, %%f2    \n\t"\
71
-        "fpadd16 %%f4, %%f4, %%f4    \n\t"\
72
-        "fpadd16 %%f6, %%f6, %%f6    \n\t"\
73
-        "fpadd16 %%f8, %%f8, %%f8    \n\t"\
74
-        "fpadd16 %%f10, %%f10, %%f10 \n\t"\
75
-        "fpadd16 %%f12, %%f12, %%f12 \n\t"\
76
-        "fpadd16 %%f14, %%f14, %%f14 \n\t"\
77
-\
78
-        "fpadd16 %%f0, %%f0, %%f0    \n\t"\
79
-        "fpadd16 %%f2, %%f2, %%f2    \n\t"\
80
-        "fpadd16 %%f4, %%f4, %%f4    \n\t"\
81
-        "fpadd16 %%f6, %%f6, %%f6    \n\t"\
82
-        "fpadd16 %%f8, %%f8, %%f8    \n\t"\
83
-        "fpadd16 %%f10, %%f10, %%f10 \n\t"\
84
-        "fpadd16 %%f12, %%f12, %%f12 \n\t"\
85
-        "fpadd16 %%f14, %%f14, %%f14 \n\t"\
86
-\
87
-        "fpadd16 %%f0, %%f0, %%f0    \n\t"\
88
-        "fpadd16 %%f2, %%f2, %%f2    \n\t"\
89
-        "fpadd16 %%f4, %%f4, %%f4    \n\t"\
90
-        "fpadd16 %%f6, %%f6, %%f6    \n\t"\
91
-        "fpadd16 %%f8, %%f8, %%f8    \n\t"\
92
-        "fpadd16 %%f10, %%f10, %%f10 \n\t"\
93
-        "fpadd16 %%f12, %%f12, %%f12 \n\t"\
94
-        "fpadd16 %%f14, %%f14, %%f14 \n\t"\
95
-\
96
-        "fpadd16 %%f0, %%f0, %%f0    \n\t"\
97
-        "fpadd16 %%f2, %%f2, %%f2    \n\t"\
98
-        "fpadd16 %%f4, %%f4, %%f4    \n\t"\
99
-        "fpadd16 %%f6, %%f6, %%f6    \n\t"\
100
-        "fpadd16 %%f8, %%f8, %%f8    \n\t"\
101
-        "fpadd16 %%f10, %%f10, %%f10 \n\t"\
102
-        "fpadd16 %%f12, %%f12, %%f12 \n\t"\
103
-        "fpadd16 %%f14, %%f14, %%f14 \n\t"\
104
-
105
-#define LOAD(in) \
106
-        "ldd [" in "], %%f16         \n\t"\
107
-        "ldd [" in "+8], %%f18       \n\t"\
108
-        "ldd [" in "+16], %%f20      \n\t"\
109
-        "ldd [" in "+24], %%f22      \n\t"\
110
-        "ldd [" in "+32], %%f24      \n\t"\
111
-        "ldd [" in "+40], %%f26      \n\t"\
112
-        "ldd [" in "+48], %%f28      \n\t"\
113
-        "ldd [" in "+56], %%f30      \n\t"\
114
-
115
-#define TRANSPOSE \
116
-        "fpmerge %%f16, %%f24, %%f0  \n\t"\
117
-        "fpmerge %%f20, %%f28, %%f2  \n\t"\
118
-        "fpmerge %%f17, %%f25, %%f4  \n\t"\
119
-        "fpmerge %%f21, %%f29, %%f6  \n\t"\
120
-        "fpmerge %%f18, %%f26, %%f8  \n\t"\
121
-        "fpmerge %%f22, %%f30, %%f10 \n\t"\
122
-        "fpmerge %%f19, %%f27, %%f12 \n\t"\
123
-        "fpmerge %%f23, %%f31, %%f14 \n\t"\
124
-\
125
-        "fpmerge %%f0, %%f2, %%f16   \n\t"\
126
-        "fpmerge %%f1, %%f3, %%f18   \n\t"\
127
-        "fpmerge %%f4, %%f6, %%f20   \n\t"\
128
-        "fpmerge %%f5, %%f7, %%f22   \n\t"\
129
-        "fpmerge %%f8, %%f10, %%f24  \n\t"\
130
-        "fpmerge %%f9, %%f11, %%f26  \n\t"\
131
-        "fpmerge %%f12, %%f14, %%f28 \n\t"\
132
-        "fpmerge %%f13, %%f15, %%f30 \n\t"\
133
-\
134
-        "fpmerge %%f16, %%f17, %%f0  \n\t"\
135
-        "fpmerge %%f18, %%f19, %%f2  \n\t"\
136
-        "fpmerge %%f20, %%f21, %%f4  \n\t"\
137
-        "fpmerge %%f22, %%f23, %%f6  \n\t"\
138
-        "fpmerge %%f24, %%f25, %%f8  \n\t"\
139
-        "fpmerge %%f26, %%f27, %%f10 \n\t"\
140
-        "fpmerge %%f28, %%f29, %%f12 \n\t"\
141
-        "fpmerge %%f30, %%f31, %%f14 \n\t"\
142
-
143
-#define IDCT4ROWS \
144
-    /* 1. column */\
145
-        "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\
146
-        "for %%f4, %%f6, %%f60         \n\t"\
147
-        "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\
148
-        "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\
149
-        "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\
150
-        "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\
151
-\
152
-        ADDROUNDER\
153
-\
154
-        "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\
155
-        "fcmpd %%fcc0, %%f62, %%f60    \n\t"\
156
-        "for %%f8, %%f10, %%f60        \n\t"\
157
-        "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\
158
-        "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\
159
-        "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\
160
-        "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\
161
-\
162
-        "fpadd16 %%f48, %%f28, %%f28 \n\t"\
163
-        "fcmpd %%fcc1, %%f62, %%f60  \n\t"\
164
-        "for %%f12, %%f14, %%f60     \n\t"\
165
-        "fpadd16 %%f50, %%f18, %%f18 \n\t"\
166
-        "fpadd16 %%f52, %%f22, %%f22 \n\t"\
167
-        "fpadd16 %%f54, %%f26, %%f26 \n\t"\
168
-        "fpadd16 %%f56, %%f30, %%f30 \n\t"\
169
-\
170
-        "fpadd16 %%f28, %%f0, %%f16  \n\t"\
171
-        "fcmpd %%fcc2, %%f62, %%f60  \n\t"\
172
-        "fpadd16 %%f28, %%f0, %%f20  \n\t"\
173
-        "fpadd16 %%f28, %%f0, %%f24  \n\t"\
174
-        "fpadd16 %%f28, %%f0, %%f28  \n\t"\
175
-        "fpadd16 %%f18, %%f2, %%f18  \n\t"\
176
-        "fpadd16 %%f22, %%f2, %%f22  \n\t"\
177
-    /* 2. column */\
178
-        "fbe %%fcc0, 3f                \n\t"\
179
-        "fpadd16 %%f26, %%f2, %%f26    \n\t"\
180
-        "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\
181
-        "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\
182
-        "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\
183
-        "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\
184
-        "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\
185
-        "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\
186
-\
187
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
188
-        "fpadd16 %%f20, %%f50, %%f20 \n\t"\
189
-        "fpsub16 %%f24, %%f50, %%f24 \n\t"\
190
-        "fpsub16 %%f28, %%f48, %%f28 \n\t"\
191
-        "fpadd16 %%f18, %%f52, %%f18 \n\t"\
192
-        "fpsub16 %%f22, %%f54, %%f22 \n\t"\
193
-        "fpsub16 %%f26, %%f56, %%f26 \n\t"\
194
-        "fpsub16 %%f30, %%f58, %%f30 \n\t"\
195
-\
196
-        "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\
197
-        "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\
198
-        "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\
199
-        "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\
200
-        "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\
201
-        "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\
202
-\
203
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
204
-        "fpadd16 %%f20, %%f50, %%f20 \n\t"\
205
-        "fpsub16 %%f24, %%f50, %%f24 \n\t"\
206
-        "fpsub16 %%f28, %%f48, %%f28 \n\t"\
207
-        "fpadd16 %%f18, %%f52, %%f18 \n\t"\
208
-        "fpsub16 %%f22, %%f54, %%f22 \n\t"\
209
-        "fpsub16 %%f26, %%f56, %%f26 \n\t"\
210
-        "fpsub16 %%f30, %%f58, %%f30 \n\t"\
211
-\
212
-        "fpadd16 %%f16, %%f4, %%f16  \n\t"\
213
-        "fpsub16 %%f28, %%f4, %%f28  \n\t"\
214
-        "fpadd16 %%f18, %%f6, %%f18  \n\t"\
215
-        "fpsub16 %%f26, %%f6, %%f26  \n\t"\
216
-    /* 3. column */\
217
-        "3:                             \n\t"\
218
-        "fbe %%fcc1, 4f                 \n\t"\
219
-        "fpsub16 %%f30, %%f6, %%f30     \n\t"\
220
-        "fmul8ulx16 %%f8, %%f38, %%f48  \n\t"\
221
-        "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\
222
-        "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\
223
-        "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\
224
-        "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\
225
-\
226
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
227
-        "fpsub16 %%f20, %%f48, %%f20 \n\t"\
228
-        "fpsub16 %%f24, %%f48, %%f24 \n\t"\
229
-        "fpadd16 %%f28, %%f48, %%f28 \n\t"\
230
-        "fpadd16 %%f18, %%f50, %%f18 \n\t"\
231
-        "fpsub16 %%f22, %%f52, %%f22 \n\t"\
232
-        "fpadd16 %%f26, %%f54, %%f26 \n\t"\
233
-        "fpadd16 %%f30, %%f56, %%f30 \n\t"\
234
-\
235
-        "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\
236
-        "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\
237
-        "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\
238
-        "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\
239
-        "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\
240
-\
241
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
242
-        "fpsub16 %%f20, %%f48, %%f20 \n\t"\
243
-        "fpsub16 %%f24, %%f48, %%f24 \n\t"\
244
-        "fpadd16 %%f28, %%f48, %%f28 \n\t"\
245
-        "fpadd16 %%f18, %%f50, %%f18 \n\t"\
246
-        "fpsub16 %%f22, %%f52, %%f22 \n\t"\
247
-        "fpadd16 %%f26, %%f54, %%f26 \n\t"\
248
-        "fpadd16 %%f30, %%f56, %%f30 \n\t"\
249
-\
250
-        "fpadd16 %%f16, %%f8, %%f16  \n\t"\
251
-        "fpsub16 %%f20, %%f8, %%f20  \n\t"\
252
-        "fpsub16 %%f24, %%f8, %%f24  \n\t"\
253
-        "fpadd16 %%f28, %%f8, %%f28  \n\t"\
254
-        "fpadd16 %%f18, %%f10, %%f18 \n\t"\
255
-        "fpsub16 %%f22, %%f10, %%f22 \n\t"\
256
-    /* 4. column */\
257
-        "4:                             \n\t"\
258
-        "fbe %%fcc2, 5f                 \n\t"\
259
-        "fpadd16 %%f30, %%f10, %%f30    \n\t"\
260
-        "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\
261
-        "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\
262
-        "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\
263
-        "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\
264
-        "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\
265
-        "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\
266
-\
267
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
268
-        "fpsub16 %%f20, %%f50, %%f20 \n\t"\
269
-        "fpadd16 %%f24, %%f50, %%f24 \n\t"\
270
-        "fpsub16 %%f28, %%f48, %%f28 \n\t"\
271
-        "fpadd16 %%f18, %%f52, %%f18 \n\t"\
272
-        "fpsub16 %%f22, %%f54, %%f22 \n\t"\
273
-        "fpadd16 %%f26, %%f56, %%f26 \n\t"\
274
-        "fpsub16 %%f30, %%f58, %%f30 \n\t"\
275
-\
276
-        "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\
277
-        "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\
278
-        "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\
279
-        "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\
280
-        "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\
281
-        "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\
282
-\
283
-        "fpadd16 %%f16, %%f48, %%f16 \n\t"\
284
-        "fpsub16 %%f20, %%f50, %%f20 \n\t"\
285
-        "fpadd16 %%f24, %%f50, %%f24 \n\t"\
286
-        "fpsub16 %%f28, %%f48, %%f28 \n\t"\
287
-        "fpadd16 %%f18, %%f52, %%f18 \n\t"\
288
-        "fpsub16 %%f22, %%f54, %%f22 \n\t"\
289
-        "fpadd16 %%f26, %%f56, %%f26 \n\t"\
290
-        "fpsub16 %%f30, %%f58, %%f30 \n\t"\
291
-\
292
-        "fpsub16 %%f20, %%f12, %%f20 \n\t"\
293
-        "fpadd16 %%f24, %%f12, %%f24 \n\t"\
294
-        "fpsub16 %%f22, %%f14, %%f22 \n\t"\
295
-        "fpadd16 %%f26, %%f14, %%f26 \n\t"\
296
-        "fpsub16 %%f30, %%f14, %%f30 \n\t"\
297
-    /* final butterfly */\
298
-        "5:                          \n\t"\
299
-        "fpsub16 %%f16, %%f18, %%f48 \n\t"\
300
-        "fpsub16 %%f20, %%f22, %%f50 \n\t"\
301
-        "fpsub16 %%f24, %%f26, %%f52 \n\t"\
302
-        "fpsub16 %%f28, %%f30, %%f54 \n\t"\
303
-        "fpadd16 %%f16, %%f18, %%f16 \n\t"\
304
-        "fpadd16 %%f20, %%f22, %%f20 \n\t"\
305
-        "fpadd16 %%f24, %%f26, %%f24 \n\t"\
306
-        "fpadd16 %%f28, %%f30, %%f28 \n\t"\
307
-
308
-#define STOREROWS(out) \
309
-        "std %%f48, [" out "+112]          \n\t"\
310
-        "std %%f50, [" out "+96]           \n\t"\
311
-        "std %%f52, [" out "+80]           \n\t"\
312
-        "std %%f54, [" out "+64]           \n\t"\
313
-        "std %%f16, [" out "]              \n\t"\
314
-        "std %%f20, [" out "+16]           \n\t"\
315
-        "std %%f24, [" out "+32]           \n\t"\
316
-        "std %%f28, [" out "+48]           \n\t"\
317
-
318
-#define SCALEROWS \
319
-        "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\
320
-        "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\
321
-        "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\
322
-        "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\
323
-        "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\
324
-        "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\
325
-        "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\
326
-        "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\
327
-
328
-#define PUTPIXELSCLAMPED(dest) \
329
-        "fpack16 %%f48, %%f14 \n\t"\
330
-        "fpack16 %%f50, %%f12 \n\t"\
331
-        "fpack16 %%f16, %%f0  \n\t"\
332
-        "fpack16 %%f20, %%f2  \n\t"\
333
-        "fpack16 %%f24, %%f4  \n\t"\
334
-        "fpack16 %%f28, %%f6  \n\t"\
335
-        "fpack16 %%f54, %%f8  \n\t"\
336
-        "fpack16 %%f52, %%f10 \n\t"\
337
-        "st %%f0, [%3+" dest "]   \n\t"\
338
-        "st %%f2, [%5+" dest "]   \n\t"\
339
-        "st %%f4, [%6+" dest "]   \n\t"\
340
-        "st %%f6, [%7+" dest "]   \n\t"\
341
-        "st %%f8, [%8+" dest "]   \n\t"\
342
-        "st %%f10, [%9+" dest "]  \n\t"\
343
-        "st %%f12, [%10+" dest "] \n\t"\
344
-        "st %%f14, [%11+" dest "] \n\t"\
345
-
346
-#define ADDPIXELSCLAMPED(dest) \
347
-        "ldd [%5], %%f18         \n\t"\
348
-        "ld [%3+" dest"], %%f0   \n\t"\
349
-        "ld [%6+" dest"], %%f2   \n\t"\
350
-        "ld [%7+" dest"], %%f4   \n\t"\
351
-        "ld [%8+" dest"], %%f6   \n\t"\
352
-        "ld [%9+" dest"], %%f8   \n\t"\
353
-        "ld [%10+" dest"], %%f10 \n\t"\
354
-        "ld [%11+" dest"], %%f12 \n\t"\
355
-        "ld [%12+" dest"], %%f14 \n\t"\
356
-        "fmul8x16 %%f0, %%f18, %%f0   \n\t"\
357
-        "fmul8x16 %%f2, %%f18, %%f2   \n\t"\
358
-        "fmul8x16 %%f4, %%f18, %%f4   \n\t"\
359
-        "fmul8x16 %%f6, %%f18, %%f6   \n\t"\
360
-        "fmul8x16 %%f8, %%f18, %%f8   \n\t"\
361
-        "fmul8x16 %%f10, %%f18, %%f10 \n\t"\
362
-        "fmul8x16 %%f12, %%f18, %%f12 \n\t"\
363
-        "fmul8x16 %%f14, %%f18, %%f14 \n\t"\
364
-        "fpadd16 %%f0, %%f16, %%f0    \n\t"\
365
-        "fpadd16 %%f2, %%f20, %%f2    \n\t"\
366
-        "fpadd16 %%f4, %%f24, %%f4    \n\t"\
367
-        "fpadd16 %%f6, %%f28, %%f6    \n\t"\
368
-        "fpadd16 %%f8, %%f54, %%f8    \n\t"\
369
-        "fpadd16 %%f10, %%f52, %%f10  \n\t"\
370
-        "fpadd16 %%f12, %%f50, %%f12  \n\t"\
371
-        "fpadd16 %%f14, %%f48, %%f14  \n\t"\
372
-        "fpack16 %%f0, %%f0   \n\t"\
373
-        "fpack16 %%f2, %%f2   \n\t"\
374
-        "fpack16 %%f4, %%f4   \n\t"\
375
-        "fpack16 %%f6, %%f6   \n\t"\
376
-        "fpack16 %%f8, %%f8   \n\t"\
377
-        "fpack16 %%f10, %%f10 \n\t"\
378
-        "fpack16 %%f12, %%f12 \n\t"\
379
-        "fpack16 %%f14, %%f14 \n\t"\
380
-        "st %%f0, [%3+" dest "]   \n\t"\
381
-        "st %%f2, [%6+" dest "]   \n\t"\
382
-        "st %%f4, [%7+" dest "]   \n\t"\
383
-        "st %%f6, [%8+" dest "]   \n\t"\
384
-        "st %%f8, [%9+" dest "]   \n\t"\
385
-        "st %%f10, [%10+" dest "] \n\t"\
386
-        "st %%f12, [%11+" dest "] \n\t"\
387
-        "st %%f14, [%12+" dest "] \n\t"\
388
-
389
-
390
-void ff_simple_idct_vis(int16_t *data) {
391
-    int out1, out2, out3, out4;
392
-    DECLARE_ALIGNED(8, int16_t, temp)[8*8];
393
-
394
-    __asm__ volatile(
395
-        INIT_IDCT
396
-
397
-#define ADDROUNDER
398
-
399
-        // shift right 16-4=12
400
-        LOADSCALE("%2+8")
401
-        IDCT4ROWS
402
-        STOREROWS("%3+8")
403
-        LOADSCALE("%2+0")
404
-        IDCT4ROWS
405
-        "std %%f48, [%3+112] \n\t"
406
-        "std %%f50, [%3+96]  \n\t"
407
-        "std %%f52, [%3+80]  \n\t"
408
-        "std %%f54, [%3+64]  \n\t"
409
-
410
-        // shift right 16+4
411
-        "ldd [%3+8], %%f18  \n\t"
412
-        "ldd [%3+24], %%f22 \n\t"
413
-        "ldd [%3+40], %%f26 \n\t"
414
-        "ldd [%3+56], %%f30 \n\t"
415
-        TRANSPOSE
416
-        IDCT4ROWS
417
-        SCALEROWS
418
-        STOREROWS("%2+0")
419
-        LOAD("%3+64")
420
-        TRANSPOSE
421
-        IDCT4ROWS
422
-        SCALEROWS
423
-        STOREROWS("%2+8")
424
-
425
-        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4)
426
-        : "0" (scale), "1" (coeffs), "2" (data), "3" (temp)
427
-    );
428
-}
429
-
430
-void ff_simple_idct_put_vis(uint8_t *dest, int line_size, int16_t *data) {
431
-    int out1, out2, out3, out4, out5;
432
-    int r1, r2, r3, r4, r5, r6, r7;
433
-
434
-    __asm__ volatile(
435
-        "wr %%g0, 0x8, %%gsr \n\t"
436
-
437
-        INIT_IDCT
438
-
439
-        "add %3, %4, %5   \n\t"
440
-        "add %5, %4, %6   \n\t"
441
-        "add %6, %4, %7   \n\t"
442
-        "add %7, %4, %8   \n\t"
443
-        "add %8, %4, %9   \n\t"
444
-        "add %9, %4, %10  \n\t"
445
-        "add %10, %4, %11 \n\t"
446
-
447
-        // shift right 16-4=12
448
-        LOADSCALE("%2+8")
449
-        IDCT4ROWS
450
-        STOREROWS("%2+8")
451
-        LOADSCALE("%2+0")
452
-        IDCT4ROWS
453
-        "std %%f48, [%2+112] \n\t"
454
-        "std %%f50, [%2+96]  \n\t"
455
-        "std %%f52, [%2+80]  \n\t"
456
-        "std %%f54, [%2+64]  \n\t"
457
-
458
-#undef ADDROUNDER
459
-#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
460
-
461
-        // shift right 16+4
462
-        "ldd [%2+8], %%f18  \n\t"
463
-        "ldd [%2+24], %%f22 \n\t"
464
-        "ldd [%2+40], %%f26 \n\t"
465
-        "ldd [%2+56], %%f30 \n\t"
466
-        TRANSPOSE
467
-        IDCT4ROWS
468
-        PUTPIXELSCLAMPED("0")
469
-        LOAD("%2+64")
470
-        TRANSPOSE
471
-        IDCT4ROWS
472
-        PUTPIXELSCLAMPED("4")
473
-
474
-        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5),
475
-          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
476
-        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size)
477
-    );
478
-}
479
-
480
-void ff_simple_idct_add_vis(uint8_t *dest, int line_size, int16_t *data) {
481
-    int out1, out2, out3, out4, out5, out6;
482
-    int r1, r2, r3, r4, r5, r6, r7;
483
-
484
-    __asm__ volatile(
485
-        "wr %%g0, 0x8, %%gsr \n\t"
486
-
487
-        INIT_IDCT
488
-
489
-        "add %3, %4, %6   \n\t"
490
-        "add %6, %4, %7   \n\t"
491
-        "add %7, %4, %8   \n\t"
492
-        "add %8, %4, %9   \n\t"
493
-        "add %9, %4, %10  \n\t"
494
-        "add %10, %4, %11 \n\t"
495
-        "add %11, %4, %12 \n\t"
496
-
497
-#undef ADDROUNDER
498
-#define ADDROUNDER
499
-
500
-        // shift right 16-4=12
501
-        LOADSCALE("%2+8")
502
-        IDCT4ROWS
503
-        STOREROWS("%2+8")
504
-        LOADSCALE("%2+0")
505
-        IDCT4ROWS
506
-        "std %%f48, [%2+112] \n\t"
507
-        "std %%f50, [%2+96]  \n\t"
508
-        "std %%f52, [%2+80]  \n\t"
509
-        "std %%f54, [%2+64]  \n\t"
510
-
511
-#undef ADDROUNDER
512
-#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
513
-
514
-        // shift right 16+4
515
-        "ldd [%2+8], %%f18  \n\t"
516
-        "ldd [%2+24], %%f22 \n\t"
517
-        "ldd [%2+40], %%f26 \n\t"
518
-        "ldd [%2+56], %%f30 \n\t"
519
-        TRANSPOSE
520
-        IDCT4ROWS
521
-        ADDPIXELSCLAMPED("0")
522
-        LOAD("%2+64")
523
-        TRANSPOSE
524
-        IDCT4ROWS
525
-        ADDPIXELSCLAMPED("4")
526
-
527
-        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6),
528
-          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
529
-        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand)
530
-    );
531
-}
532 1
deleted file mode 100644
... ...
@@ -1,264 +0,0 @@
1
-/*
2
- * Copyright (C) 2003 David S. Miller <davem@redhat.com>
3
- *
4
- * This file is part of Libav.
5
- *
6
- * Libav is free software; you can redistribute it and/or
7
- * modify it under the terms of the GNU Lesser General Public
8
- * License as published by the Free Software Foundation; either
9
- * version 2.1 of the License, or (at your option) any later version.
10
- *
11
- * Libav is distributed in the hope that it will be useful,
12
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
- * Lesser General Public License for more details.
15
- *
16
- * You should have received a copy of the GNU Lesser General Public
17
- * License along with Libav; if not, write to the Free Software
18
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
- */
20
-
21
-/* You may be asking why I hard-code the instruction opcodes and don't
22
- * use the normal VIS assembler mnemonics for the VIS instructions.
23
- *
24
- * The reason is that Sun, in their infinite wisdom, decided that a binary
25
- * using a VIS instruction will cause it to be marked (in the ELF headers)
26
- * as doing so, and this prevents the OS from loading such binaries if the
27
- * current cpu doesn't have VIS.  There is no way to easily override this
28
- * behavior of the assembler that I am aware of.
29
- *
30
- * This totally defeats what libmpeg2 is trying to do, which is to allow a
31
- * single binary to be created, and then detect the availability of VIS
32
- * at runtime.
33
- *
34
- * I'm not saying that tainting the binary by default is bad, rather I'm
35
- * saying that not providing an easy way to override this unnecessarily
36
- * ties people's hands.
37
- *
38
- * Thus, we do the opcode encoding by hand and output 32-bit words in
39
- * the assembler to keep the binary from becoming tainted.
40
- */
41
-
42
-#ifndef AVCODEC_SPARC_VIS_H
43
-#define AVCODEC_SPARC_VIS_H
44
-
45
-#define ACCEL_SPARC_VIS 1
46
-#define ACCEL_SPARC_VIS2 2
47
-
48
-static inline int vis_level(void)
49
-{
50
-    int accel = 0;
51
-    accel |= ACCEL_SPARC_VIS;
52
-    accel |= ACCEL_SPARC_VIS2;
53
-    return accel;
54
-}
55
-
56
-#define vis_opc_base    ((0x1 << 31) | (0x36 << 19))
57
-#define vis_opf(X)      ((X) << 5)
58
-#define vis_sreg(X)     (X)
59
-#define vis_dreg(X)     (((X)&0x1f)|((X)>>5))
60
-#define vis_rs1_s(X)    (vis_sreg(X) << 14)
61
-#define vis_rs1_d(X)    (vis_dreg(X) << 14)
62
-#define vis_rs2_s(X)    (vis_sreg(X) << 0)
63
-#define vis_rs2_d(X)    (vis_dreg(X) << 0)
64
-#define vis_rd_s(X)     (vis_sreg(X) << 25)
65
-#define vis_rd_d(X)     (vis_dreg(X) << 25)
66
-
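A worked example of the hand-encoding, using the field layout above:
fpadd16 %f0, %f2, %f4 has opf 0x50 (see vis_padd16 further down), so the
emitted word is the OR of the base opcode (op = 2, op3 = 0x36, the IMPDEP1
space where VIS lives), the opf field, and the three register fields. A
standalone sketch that reproduces the layout and prints the word:

    #include <stdint.h>
    #include <stdio.h>

    #define OPC_BASE ((0x1u << 31) | (0x36u << 19))  /* op = 2, op3 = 0x36  */
    #define OPF(x)   ((uint32_t)(x) << 5)
    #define DREG(x)  (((x) & 0x1f) | ((x) >> 5))     /* double-reg encoding */

    int main(void)
    {
        /* fpadd16 %f0, %f2, %f4  ->  .word 0x89b00a02 */
        uint32_t word = OPC_BASE | OPF(0x50) |
                        ((uint32_t)DREG(0) << 14) | DREG(2) |
                        ((uint32_t)DREG(4) << 25);
        printf(".word 0x%08x\n", word);
        return 0;
    }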
67
-#define vis_ss2s(opf,rs1,rs2,rd) \
68
-        __asm__ volatile (".word %0" \
69
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
70
-                                       vis_rs1_s(rs1) | \
71
-                                       vis_rs2_s(rs2) | \
72
-                                       vis_rd_s(rd)))
73
-
74
-#define vis_dd2d(opf,rs1,rs2,rd) \
75
-        __asm__ volatile (".word %0" \
76
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
77
-                                       vis_rs1_d(rs1) | \
78
-                                       vis_rs2_d(rs2) | \
79
-                                       vis_rd_d(rd)))
80
-
81
-#define vis_ss2d(opf,rs1,rs2,rd) \
82
-        __asm__ volatile (".word %0" \
83
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
84
-                                       vis_rs1_s(rs1) | \
85
-                                       vis_rs2_s(rs2) | \
86
-                                       vis_rd_d(rd)))
87
-
88
-#define vis_sd2d(opf,rs1,rs2,rd) \
89
-        __asm__ volatile (".word %0" \
90
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
91
-                                       vis_rs1_s(rs1) | \
92
-                                       vis_rs2_d(rs2) | \
93
-                                       vis_rd_d(rd)))
94
-
95
-#define vis_d2s(opf,rs2,rd) \
96
-        __asm__ volatile (".word %0" \
97
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
98
-                                       vis_rs2_d(rs2) | \
99
-                                       vis_rd_s(rd)))
100
-
101
-#define vis_s2d(opf,rs2,rd) \
102
-        __asm__ volatile (".word %0" \
103
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
104
-                                       vis_rs2_s(rs2) | \
105
-                                       vis_rd_d(rd)))
106
-
107
-#define vis_d12d(opf,rs1,rd) \
108
-        __asm__ volatile (".word %0" \
109
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
110
-                                       vis_rs1_d(rs1) | \
111
-                                       vis_rd_d(rd)))
112
-
113
-#define vis_d22d(opf,rs2,rd) \
114
-        __asm__ volatile (".word %0" \
115
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
116
-                                       vis_rs2_d(rs2) | \
117
-                                       vis_rd_d(rd)))
118
-
119
-#define vis_s12s(opf,rs1,rd) \
120
-        __asm__ volatile (".word %0" \
121
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
122
-                                       vis_rs1_s(rs1) | \
123
-                                       vis_rd_s(rd)))
124
-
125
-#define vis_s22s(opf,rs2,rd) \
126
-        __asm__ volatile (".word %0" \
127
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
128
-                                       vis_rs2_s(rs2) | \
129
-                                       vis_rd_s(rd)))
130
-
131
-#define vis_s(opf,rd) \
132
-        __asm__ volatile (".word %0" \
133
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
134
-                                       vis_rd_s(rd)))
135
-
136
-#define vis_d(opf,rd) \
137
-        __asm__ volatile (".word %0" \
138
-                              : : "i" (vis_opc_base | vis_opf(opf) | \
139
-                                       vis_rd_d(rd)))
140
-
141
-#define vis_r2m(op,rd,mem) \
142
-        __asm__ volatile (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )
143
-
144
-#define vis_r2m_2(op,rd,mem1,mem2) \
145
-        __asm__ volatile (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )
146
-
147
-#define vis_m2r(op,mem,rd) \
148
-        __asm__ volatile (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )
149
-
150
-#define vis_m2r_2(op,mem1,mem2,rd) \
151
-        __asm__ volatile (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )
152
-
153
-static inline void vis_set_gsr(unsigned int val)
154
-{
155
-        __asm__ volatile("mov %0,%%asr19"
156
-                             : : "r" (val));
157
-}
158
-
159
-#define VIS_GSR_ALIGNADDR_MASK          0x0000007
160
-#define VIS_GSR_ALIGNADDR_SHIFT         0
161
-#define VIS_GSR_SCALEFACT_MASK          0x0000078
162
-#define VIS_GSR_SCALEFACT_SHIFT         3
163
-
164
-#define vis_ld32(mem,rs1)               vis_m2r(ld, mem, rs1)
165
-#define vis_ld32_2(mem1,mem2,rs1)       vis_m2r_2(ld, mem1, mem2, rs1)
166
-#define vis_st32(rs1,mem)               vis_r2m(st, rs1, mem)
167
-#define vis_st32_2(rs1,mem1,mem2)       vis_r2m_2(st, rs1, mem1, mem2)
168
-#define vis_ld64(mem,rs1)               vis_m2r(ldd, mem, rs1)
169
-#define vis_ld64_2(mem1,mem2,rs1)       vis_m2r_2(ldd, mem1, mem2, rs1)
170
-#define vis_st64(rs1,mem)               vis_r2m(std, rs1, mem)
171
-#define vis_st64_2(rs1,mem1,mem2)       vis_r2m_2(std, rs1, mem1, mem2)
172
-
173
-/* 16 and 32 bit partitioned addition and subtraction.  The normal
174
- * versions perform 4 16-bit or 2 32-bit additions or subtractions.
175
- * The 's' versions perform 2 16-bit or 1 32-bit additions or
176
- * subtractions.
177
- */
178
-
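In scalar terms, each lane is added with wrap-around and no carry propagates
between lanes; a reference model of the double-word padd16, assuming the usual
SWAR semantics:

    #include <stdint.h>

    /* Four independent 16-bit additions inside one 64-bit value. */
    static uint64_t padd16_ref(uint64_t a, uint64_t b)
    {
        uint64_t r = 0;
        for (int i = 0; i < 64; i += 16) {
            uint16_t lane = (uint16_t)((uint16_t)(a >> i) + (uint16_t)(b >> i));
            r |= (uint64_t)lane << i;
        }
        return r;
    }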
179
-#define vis_padd16(rs1,rs2,rd)          vis_dd2d(0x50, rs1, rs2, rd)
180
-#define vis_padd16s(rs1,rs2,rd)         vis_ss2s(0x51, rs1, rs2, rd)
181
-#define vis_padd32(rs1,rs2,rd)          vis_dd2d(0x52, rs1, rs2, rd)
182
-#define vis_padd32s(rs1,rs2,rd)         vis_ss2s(0x53, rs1, rs2, rd)
183
-#define vis_psub16(rs1,rs2,rd)          vis_dd2d(0x54, rs1, rs2, rd)
184
-#define vis_psub16s(rs1,rs2,rd)         vis_ss2s(0x55, rs1, rs2, rd)
185
-#define vis_psub32(rs1,rs2,rd)          vis_dd2d(0x56, rs1, rs2, rd)
186
-#define vis_psub32s(rs1,rs2,rd)         vis_ss2s(0x57, rs1, rs2, rd)
187
-
188
-/* Pixel formatting instructions.  */
189
-
190
-#define vis_pack16(rs2,rd)              vis_d2s( 0x3b,      rs2, rd)
191
-#define vis_pack32(rs1,rs2,rd)          vis_dd2d(0x3a, rs1, rs2, rd)
192
-#define vis_packfix(rs2,rd)             vis_d2s( 0x3d,      rs2, rd)
193
-#define vis_expand(rs2,rd)              vis_s2d( 0x4d,      rs2, rd)
194
-#define vis_pmerge(rs1,rs2,rd)          vis_ss2d(0x4b, rs1, rs2, rd)
195
-
196
-/* Partitioned multiply instructions.  */
197
-
198
-#define vis_mul8x16(rs1,rs2,rd)         vis_sd2d(0x31, rs1, rs2, rd)
199
-#define vis_mul8x16au(rs1,rs2,rd)       vis_ss2d(0x33, rs1, rs2, rd)
200
-#define vis_mul8x16al(rs1,rs2,rd)       vis_ss2d(0x35, rs1, rs2, rd)
201
-#define vis_mul8sux16(rs1,rs2,rd)       vis_dd2d(0x36, rs1, rs2, rd)
202
-#define vis_mul8ulx16(rs1,rs2,rd)       vis_dd2d(0x37, rs1, rs2, rd)
203
-#define vis_muld8sux16(rs1,rs2,rd)      vis_ss2d(0x38, rs1, rs2, rd)
204
-#define vis_muld8ulx16(rs1,rs2,rd)      vis_ss2d(0x39, rs1, rs2, rd)
205
-
206
-/* Alignment instructions.  */
207
-
208
-static inline const void *vis_alignaddr(const void *ptr)
209
-{
210
-        __asm__ volatile("alignaddr %0, %%g0, %0"
211
-                             : "=&r" (ptr)
212
-                             : "0" (ptr));
213
-
214
-        return ptr;
215
-}
216
-
217
-static inline void vis_alignaddr_g0(void *ptr)
218
-{
219
-        __asm__ volatile("alignaddr %0, %%g0, %%g0"
220
-                             : : "r" (ptr));
221
-}
222
-
223
-#define vis_faligndata(rs1,rs2,rd)        vis_dd2d(0x48, rs1, rs2, rd)
224
-
225
-/* Logical operate instructions.  */
226
-
227
-#define vis_fzero(rd)                   vis_d(   0x60,           rd)
228
-#define vis_fzeros(rd)                  vis_s(   0x61,           rd)
229
-#define vis_fone(rd)                    vis_d(   0x7e,           rd)
230
-#define vis_fones(rd)                   vis_s(   0x7f,           rd)
231
-#define vis_src1(rs1,rd)                vis_d12d(0x74, rs1,      rd)
232
-#define vis_src1s(rs1,rd)               vis_s12s(0x75, rs1,      rd)
233
-#define vis_src2(rs2,rd)                vis_d22d(0x78,      rs2, rd)
234
-#define vis_src2s(rs2,rd)               vis_s22s(0x79,      rs2, rd)
235
-#define vis_not1(rs1,rd)                vis_d12d(0x6a, rs1,      rd)
236
-#define vis_not1s(rs1,rd)               vis_s12s(0x6b, rs1,      rd)
237
-#define vis_not2(rs2,rd)                vis_d22d(0x66,      rs2, rd)
238
-#define vis_not2s(rs2,rd)               vis_s22s(0x67,      rs2, rd)
239
-#define vis_or(rs1,rs2,rd)              vis_dd2d(0x7c, rs1, rs2, rd)
240
-#define vis_ors(rs1,rs2,rd)             vis_ss2s(0x7d, rs1, rs2, rd)
241
-#define vis_nor(rs1,rs2,rd)             vis_dd2d(0x62, rs1, rs2, rd)
242
-#define vis_nors(rs1,rs2,rd)            vis_ss2s(0x63, rs1, rs2, rd)
243
-#define vis_and(rs1,rs2,rd)             vis_dd2d(0x70, rs1, rs2, rd)
244
-#define vis_ands(rs1,rs2,rd)            vis_ss2s(0x71, rs1, rs2, rd)
245
-#define vis_nand(rs1,rs2,rd)            vis_dd2d(0x6e, rs1, rs2, rd)
246
-#define vis_nands(rs1,rs2,rd)           vis_ss2s(0x6f, rs1, rs2, rd)
247
-#define vis_xor(rs1,rs2,rd)             vis_dd2d(0x6c, rs1, rs2, rd)
248
-#define vis_xors(rs1,rs2,rd)            vis_ss2s(0x6d, rs1, rs2, rd)
249
-#define vis_xnor(rs1,rs2,rd)            vis_dd2d(0x72, rs1, rs2, rd)
250
-#define vis_xnors(rs1,rs2,rd)           vis_ss2s(0x73, rs1, rs2, rd)
251
-#define vis_ornot1(rs1,rs2,rd)          vis_dd2d(0x7a, rs1, rs2, rd)
252
-#define vis_ornot1s(rs1,rs2,rd)         vis_ss2s(0x7b, rs1, rs2, rd)
253
-#define vis_ornot2(rs1,rs2,rd)          vis_dd2d(0x76, rs1, rs2, rd)
254
-#define vis_ornot2s(rs1,rs2,rd)         vis_ss2s(0x77, rs1, rs2, rd)
255
-#define vis_andnot1(rs1,rs2,rd)         vis_dd2d(0x68, rs1, rs2, rd)
256
-#define vis_andnot1s(rs1,rs2,rd)        vis_ss2s(0x69, rs1, rs2, rd)
257
-#define vis_andnot2(rs1,rs2,rd)         vis_dd2d(0x64, rs1, rs2, rd)
258
-#define vis_andnot2s(rs1,rs2,rd)        vis_ss2s(0x65, rs1, rs2, rd)
259
-
260
-/* Pixel component distance.  */
261
-
262
-#define vis_pdist(rs1,rs2,rd)           vis_dd2d(0x3e, rs1, rs2, rd)
263
-
264
-#endif /* AVCODEC_SPARC_VIS_H */
... ...
@@ -126,5 +126,8 @@
126 126
 #ifndef FF_API_ARCH_SH4
127 127
 #define FF_API_ARCH_SH4          (LIBAVCODEC_VERSION_MAJOR < 56)
128 128
 #endif
129
+#ifndef FF_API_ARCH_SPARC
130
+#define FF_API_ARCH_SPARC        (LIBAVCODEC_VERSION_MAJOR < 56)
131
+#endif
129 132
 
130 133
 #endif /* AVCODEC_VERSION_H */
131 134
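FF_API_* guards of this kind are the tree's deprecation convention: code kept
only for backwards compatibility is fenced so that it drops out automatically
at the next major bump. A schematic use (the declaration is hypothetical, not
quoted from the headers):

    #if FF_API_ARCH_SPARC
    /* hypothetical legacy SPARC-only entry point, compiled only while
     * LIBAVCODEC_VERSION_MAJOR < 56 */
    attribute_deprecated void avcodec_sparc_helper(void);
    #endif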
deleted file mode 100644
... ...
@@ -1 +0,0 @@
1
-VIS-OBJS += sparc/yuv2rgb_vis.o                                         \
2 1
deleted file mode 100644
... ...
@@ -1,212 +0,0 @@
1
-/*
2
- * VIS optimized software YUV to RGB converter
3
- * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
4
- *
5
- * This file is part of Libav.
6
- *
7
- * Libav is free software; you can redistribute it and/or
8
- * modify it under the terms of the GNU Lesser General Public
9
- * License as published by the Free Software Foundation; either
10
- * version 2.1 of the License, or (at your option) any later version.
11
- *
12
- * Libav is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
- * Lesser General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU Lesser General Public
18
- * License along with Libav; if not, write to the Free Software
19
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
- */
21
-
22
-#include <inttypes.h>
23
-#include <stdlib.h>
24
-
25
-#include "libavutil/attributes.h"
26
-#include "libswscale/swscale.h"
27
-#include "libswscale/swscale_internal.h"
28
-
29
-#define YUV2RGB_INIT                               \
30
-    "wr %%g0, 0x10, %%gsr \n\t"                    \
31
-    "ldd [%5],      %%f32 \n\t"                    \
32
-    "ldd [%5 +  8], %%f34 \n\t"                    \
33
-    "ldd [%5 + 16], %%f36 \n\t"                    \
34
-    "ldd [%5 + 24], %%f38 \n\t"                    \
35
-    "ldd [%5 + 32], %%f40 \n\t"                    \
36
-    "ldd [%5 + 40], %%f42 \n\t"                    \
37
-    "ldd [%5 + 48], %%f44 \n\t"                    \
38
-    "ldd [%5 + 56], %%f46 \n\t"                    \
39
-    "ldd [%5 + 64], %%f48 \n\t"                    \
40
-    "ldd [%5 + 72], %%f50 \n\t"
41
-
42
-#define YUV2RGB_KERNEL                             \
43
-    /* ^^^^ f0=Y f3=u f5=v */                      \
44
-    "fmul8x16 %%f3,  %%f48,  %%f6 \n\t"            \
45
-    "fmul8x16 %%f19, %%f48, %%f22 \n\t"            \
46
-    "fmul8x16 %%f5,  %%f44,  %%f8 \n\t"            \
47
-    "fmul8x16 %%f21, %%f44, %%f24 \n\t"            \
48
-    "fmul8x16 %%f0,  %%f42,  %%f0 \n\t"            \
49
-    "fmul8x16 %%f16, %%f42, %%f16 \n\t"            \
50
-    "fmul8x16 %%f3,  %%f50,  %%f2 \n\t"            \
51
-    "fmul8x16 %%f19, %%f50, %%f18 \n\t"            \
52
-    "fmul8x16 %%f5,  %%f46,  %%f4 \n\t"            \
53
-    "fmul8x16 %%f21, %%f46, %%f20 \n\t"            \
54
-                                                   \
55
-    "fpsub16 %%f6,  %%f34,  %%f6 \n\t" /* 1 */     \
56
-    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */     \
57
-    "fpsub16 %%f8,  %%f38,  %%f8 \n\t" /* 3 */     \
58
-    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */     \
59
-    "fpsub16 %%f0,  %%f32,  %%f0 \n\t" /* 0 */     \
60
-    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */     \
61
-    "fpsub16 %%f2,  %%f36,  %%f2 \n\t" /* 2 */     \
62
-    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */     \
63
-    "fpsub16 %%f4,  %%f40,  %%f4 \n\t" /* 4 */     \
64
-    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */     \
65
-                                                   \
66
-    "fpadd16 %%f0,  %%f8,  %%f8  \n\t" /* Gt */    \
67
-    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */    \
68
-    "fpadd16 %%f0,  %%f4,  %%f4  \n\t" /* R */     \
69
-    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */     \
70
-    "fpadd16 %%f0,  %%f6,  %%f6  \n\t" /* B */     \
71
-    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */     \
72
-    "fpadd16 %%f8,  %%f2,  %%f2  \n\t" /* G */     \
73
-    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */     \
74
-                                                   \
75
-    "fpack16 %%f4,  %%f4  \n\t"                    \
76
-    "fpack16 %%f20, %%f20 \n\t"                    \
77
-    "fpack16 %%f6,  %%f6  \n\t"                    \
78
-    "fpack16 %%f22, %%f22 \n\t"                    \
79
-    "fpack16 %%f2,  %%f2  \n\t"                    \
80
-    "fpack16 %%f18, %%f18 \n\t"
81
-
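YUV2RGB_KERNEL evaluates the affine YUV-to-RGB transform in parallel 16-bit lanes: fmul8x16 multiplies the 8-bit samples by fixed-point coefficients, fpsub16 folds in the precomputed offset products, fpadd16 combines the luma and chroma terms, and fpack16 scales and clamps to 8 bits. A scalar sketch (not from the removed file) of what one lane computes; the coefficient and offset names mirror the SwsContext fields used in ff_yuv2rgb_init_vis() below, and SHIFT is an assumption standing in for the fpack16 scale factor set via the GSR write in YUV2RGB_INIT:

    #include <stdint.h>

    static uint8_t clip8(int x) { return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x; }

    /* Illustrative scalar equivalent of one pixel. */
    static void yuv2rgb_lane(int y, int u, int v,
                             int yc, int ub, int ug, int vg, int vr, /* coefficients */
                             int yo, int uo, int vo,                 /* offsets      */
                             uint8_t *r, uint8_t *g, uint8_t *b)
    {
        enum { SHIFT = 13 };                 /* assumed fixed-point scale */
        int luma = (y - yo) * yc;            /* fmul8x16 + fpsub16        */
        *r = clip8((luma + (v - vo) * vr) >> SHIFT);   /* fpadd16, fpack16 */
        *g = clip8((luma + (u - uo) * ug + (v - vo) * vg) >> SHIFT);
        *b = clip8((luma + (u - uo) * ub) >> SHIFT);
    }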
82
-// FIXME: must be changed to set alpha to 255 instead of 0
83
-static int vis_420P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
84
-                           int srcSliceY, int srcSliceH,
85
-                           uint8_t *dst[], int dstStride[])
86
-{
87
-    int y, out1, out2, out3, out4, out5, out6;
88
-
89
-    for (y = 0; y < srcSliceH; ++y)
90
-        __asm__ volatile (
91
-            YUV2RGB_INIT
92
-            "wr %%g0, 0xd2, %%asi        \n\t"  /* ASI_FL16_P */
93
-            "1:                          \n\t"
94
-            "ldda [%1]     %%asi, %%f2   \n\t"
95
-            "ldda [%1 + 2] %%asi, %%f18  \n\t"
96
-            "ldda [%2]     %%asi, %%f4   \n\t"
97
-            "ldda [%2 + 2] %%asi, %%f20  \n\t"
98
-            "ld [%0], %%f0               \n\t"
99
-            "ld [%0+4], %%f16            \n\t"
100
-            "fpmerge %%f3,  %%f3,  %%f2  \n\t"
101
-            "fpmerge %%f19, %%f19, %%f18 \n\t"
102
-            "fpmerge %%f5,  %%f5,  %%f4  \n\t"
103
-            "fpmerge %%f21, %%f21, %%f20 \n\t"
104
-            YUV2RGB_KERNEL
105
-            "fzero %%f0                  \n\t"
106
-            "fpmerge %%f4,  %%f6,  %%f8  \n\t"  // r, b, t1
107
-            "fpmerge %%f20, %%f22, %%f24 \n\t"  // r, b, t1
108
-            "fpmerge %%f0,  %%f2,  %%f10 \n\t"  // 0, g, t2
109
-            "fpmerge %%f0,  %%f18, %%f26 \n\t"  // 0, g, t2
110
-            "fpmerge %%f10, %%f8,  %%f4  \n\t"  // t2, t1, msb
111
-            "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2, t1, msb
112
-            "fpmerge %%f11, %%f9,  %%f6  \n\t"  // t2, t1, lsb
113
-            "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2, t1, lsb
114
-            "std %%f4,  [%3]             \n\t"
115
-            "std %%f20, [%3 + 16]        \n\t"
116
-            "std %%f6,  [%3 +  8]        \n\t"
117
-            "std %%f22, [%3 + 24]        \n\t"
118
-
119
-            "add %0, 8, %0   \n\t"
120
-            "add %1, 4, %1   \n\t"
121
-            "add %2, 4, %2   \n\t"
122
-            "subcc %4, 8, %4 \n\t"
123
-            "bne 1b          \n\t"
124
-            "add %3, 32, %3  \n\t"              // delay slot
125
-            : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
126
-            : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + ((y + srcSliceY) >> 1) * srcStride[1]),
127
-            "2" (src[2] + ((y + srcSliceY) >> 1) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
128
-            "4" (c->dstW),
129
-            "5" (c->sparc_coeffs)
130
-            );
131
-
132
-    return srcSliceH;
133
-}
134
-
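The "delay slot" comments in these loops refer to a SPARC pipeline property worth spelling out: the instruction immediately following a branch executes before the branch takes effect, so the loop hoists its final pointer increment into that slot instead of wasting it. A standalone sketch of the subcc/bne idiom, assuming a SPARC toolchain and n >= 1:

    /* Counts n down to zero; the add in the delay slot runs on every
     * iteration, including the last, so acc ends up equal to n.
     * Sketch only, not from the removed file. */
    static int delay_slot_demo(int n)
    {
        int acc = 0;
        __asm__ volatile("1: subcc %1, 1, %1 \n\t"
                         "bne 1b             \n\t"
                         " add %0, 1, %0     \n\t"  /* delay slot */
                         : "+r"(acc), "+r"(n)
                         :
                         : "cc");
        return acc;
    }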
135
-// FIXME: must be changed to set alpha to 255 instead of 0
136
-static int vis_422P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
137
-                           int srcSliceY, int srcSliceH,
138
-                           uint8_t *dst[], int dstStride[])
139
-{
140
-    int y, out1, out2, out3, out4, out5, out6;
141
-
142
-    for (y = 0; y < srcSliceH; ++y)
143
-        __asm__ volatile (
144
-            YUV2RGB_INIT
145
-            "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
146
-            "1:                          \n\t"
147
-            "ldda [%1]     %%asi, %%f2   \n\t"
148
-            "ldda [%1 + 2] %%asi, %%f18  \n\t"
149
-            "ldda [%2]     %%asi, %%f4   \n\t"
150
-            "ldda [%2 + 2] %%asi, %%f20  \n\t"
151
-            "ld [%0],     %%f0           \n\t"
152
-            "ld [%0 + 4], %%f16          \n\t"
153
-            "fpmerge %%f3,  %%f3,  %%f2  \n\t"
154
-            "fpmerge %%f19, %%f19, %%f18 \n\t"
155
-            "fpmerge %%f5,  %%f5,  %%f4  \n\t"
156
-            "fpmerge %%f21, %%f21, %%f20 \n\t"
157
-            YUV2RGB_KERNEL
158
-            "fzero %%f0 \n\t"
159
-            "fpmerge %%f4,  %%f6,  %%f8  \n\t"  // r,b,t1
160
-            "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
161
-            "fpmerge %%f0,  %%f2,  %%f10 \n\t"  // 0,g,t2
162
-            "fpmerge %%f0,  %%f18, %%f26 \n\t"  // 0,g,t2
163
-            "fpmerge %%f10, %%f8,  %%f4  \n\t"  // t2,t1,msb
164
-            "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
165
-            "fpmerge %%f11, %%f9,  %%f6  \n\t"  // t2,t1,lsb
166
-            "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
167
-            "std %%f4,  [%3]             \n\t"
168
-            "std %%f20, [%3 + 16]        \n\t"
169
-            "std %%f6,  [%3 + 8]         \n\t"
170
-            "std %%f22, [%3 + 24]        \n\t"
171
-
172
-            "add %0, 8, %0   \n\t"
173
-            "add %1, 4, %1   \n\t"
174
-            "add %2, 4, %2   \n\t"
175
-            "subcc %4, 8, %4 \n\t"
176
-            "bne 1b          \n\t"
177
-            "add %3, 32, %3  \n\t" //delay slot
178
-            : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
179
-            : "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + (y + srcSliceY) * srcStride[1]),
180
-            "2" (src[2] + (y + srcSliceY) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
181
-            "4" (c->dstW),
182
-            "5" (c->sparc_coeffs)
183
-            );
184
-
185
-    return srcSliceH;
186
-}
187
-
188
-av_cold SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
189
-{
190
-    c->sparc_coeffs[5] = c->yCoeff;
191
-    c->sparc_coeffs[6] = c->vgCoeff;
192
-    c->sparc_coeffs[7] = c->vrCoeff;
193
-    c->sparc_coeffs[8] = c->ubCoeff;
194
-    c->sparc_coeffs[9] = c->ugCoeff;
195
-
196
-    c->sparc_coeffs[0] = (((int16_t)c->yOffset * (int16_t)c->yCoeff  >> 11) & 0xffff) * 0x0001000100010001ULL;
197
-    c->sparc_coeffs[1] = (((int16_t)c->uOffset * (int16_t)c->ubCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
198
-    c->sparc_coeffs[2] = (((int16_t)c->uOffset * (int16_t)c->ugCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
199
-    c->sparc_coeffs[3] = (((int16_t)c->vOffset * (int16_t)c->vgCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
200
-    c->sparc_coeffs[4] = (((int16_t)c->vOffset * (int16_t)c->vrCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
201
-
202
-    if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV422P && (c->dstW & 7) == 0) {
203
-        av_log(c, AV_LOG_INFO,
204
-               "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
205
-        return vis_422P_ARGB32;
206
-    } else if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV420P && (c->dstW & 7) == 0) {
207
-        av_log(c, AV_LOG_INFO,
208
-               "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
209
-        return vis_420P_ARGB32;
210
-    }
211
-    return NULL;
212
-}
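Two details of this init function deserve a note. The table entries sparc_coeffs[0..4] use a lane-broadcast idiom: masking a 16-bit product and multiplying by 0x0001000100010001ULL replicates it into all four 16-bit lanes of a 64-bit word, exactly the layout fpsub16 expects. A tiny self-contained demonstration:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        /* replicate a 16-bit value into four 16-bit lanes */
        uint64_t lanes = (0x1234u & 0xffff) * 0x0001000100010001ULL;
        printf("%016" PRIx64 "\n", lanes);   /* prints 1234123412341234 */
        return 0;
    }

Second, the (c->dstW & 7) == 0 checks mean the fast paths were only selected for widths divisible by 8: the inner loops consume eight pixels per iteration and have no tail handling.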
... ...
@@ -450,10 +450,6 @@ typedef struct SwsContext {
450 450
     DECLARE_ALIGNED(4, uint32_t, gmask);
451 451
 #endif
452 452
 
453
-#if HAVE_VIS
454
-    DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
455
-#endif
456
-
457 453
     /* function pointers for swscale() */
458 454
     yuv2planar1_fn yuv2plane1;
459 455
     yuv2planarX_fn yuv2planeX;
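The removed sparc_coeffs field needed libavutil's DECLARE_ALIGNED because SPARC's ldd instruction traps on anything not doubleword aligned, and YUV2RGB_INIT loaded the table with ldd; the 8-byte alignment was a correctness requirement, not a tuning knob. On gcc the macro expands to roughly the following (sketch of the gcc branch only):

    /* what DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10] boils down to */
    uint64_t __attribute__((aligned(8))) sparc_coeffs[10];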
... ...
@@ -571,7 +567,6 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
571 571
                            int lastInLumBuf, int lastInChrBuf);
572 572
 
573 573
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
574
-SwsFunc ff_yuv2rgb_init_vis(SwsContext *c);
575 574
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
576 575
 SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c);
577 576
 
... ...
@@ -564,8 +564,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
564 564
         t = ff_yuv2rgb_init_bfin(c);
565 565
     if (ARCH_PPC)
566 566
         t = ff_yuv2rgb_init_ppc(c);
567
-    if (HAVE_VIS)
568
-        t = ff_yuv2rgb_init_vis(c);
569 567
     if (ARCH_X86)
570 568
         t = ff_yuv2rgb_init_x86(c);
571 569
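This dispatch shows the project's compile-time selection idiom: config.h always defines the ARCH_ and HAVE_ symbols to 0 or 1, so disabled branches are eliminated by constant folding while still being parsed and type-checked, and with HAVE_VIS gone the SPARC branch simply drops out. A self-contained sketch of the pattern, with made-up init functions:

    #include <stdio.h>

    #define ARCH_PPC 0            /* config.h-style: always 0 or 1 */
    #define ARCH_X86 1

    typedef const char *(*SwsFunc)(void);

    static const char *convert_x86(void) { return "x86 path"; }
    static SwsFunc init_ppc(void) { return NULL; }
    static SwsFunc init_x86(void) { return convert_x86; }

    int main(void)
    {
        SwsFunc t = NULL;
        if (ARCH_PPC) t = init_ppc();  /* folded away, but still compiled */
        if (ARCH_X86) t = init_x86();
        puts(t ? t() : "C fallback");
        return 0;
    }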