* qatar/master:
dxa: remove useless code
lavf: don't select an attached picture as default stream for seeking.
avconv: remove pointless checks.
avconv: check for get_filtered_frame() failure.
avconv: remove a pointless check.
swscale: convert hscale() to use named arguments.
x86inc: add *mp named argument support to DEFINE_ARGS.
swscale: convert hscale to cpuflags().
Conflicts:
ffmpeg.c
libswscale/x86/scale.asm
Merged-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -2178,13 +2178,13 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int |
2178 | 2178 |
} |
2179 | 2179 |
if (!ist->filtered_frame && !(ist->filtered_frame = avcodec_alloc_frame())) { |
2180 | 2180 |
ret = AVERROR(ENOMEM); |
2181 |
- goto end; |
|
2181 |
+ goto fail; |
|
2182 | 2182 |
} |
2183 | 2183 |
filtered_frame = ist->filtered_frame; |
2184 | 2184 |
*filtered_frame= *decoded_frame; //for me_threshold |
2185 | 2185 |
avfilter_fill_frame_from_video_buffer_ref(filtered_frame, ost->picref); |
2186 | 2186 |
filtered_frame->pts = av_rescale_q(ost->picref->pts, ist_pts_tb, AV_TIME_BASE_Q); |
2187 |
- if (ost->picref->video && !ost->frame_aspect_ratio) |
|
2187 |
+ if (!ost->frame_aspect_ratio) |
|
2188 | 2188 |
ost->st->codec->sample_aspect_ratio = ost->picref->video->sample_aspect_ratio; |
2189 | 2189 |
do_video_out(output_files[ost->file_index].ctx, ost, ist, filtered_frame); |
2190 | 2190 |
cont: |
... | ... |
@@ -2195,7 +2195,7 @@ static int transcode_video(InputStream *ist, AVPacket *pkt, int *got_output, int |
2195 | 2195 |
#endif |
2196 | 2196 |
} |
2197 | 2197 |
|
2198 |
-end: |
|
2198 |
+fail: |
|
2199 | 2199 |
av_free(buffer_to_free); |
2200 | 2200 |
return ret; |
2201 | 2201 |
} |
... | ... |
@@ -36,7 +36,6 @@ |
36 | 36 |
* Decoder context |
37 | 37 |
*/ |
38 | 38 |
typedef struct DxaDecContext { |
39 |
- AVCodecContext *avctx; |
|
40 | 39 |
AVFrame pic, prev; |
41 | 40 |
|
42 | 41 |
int dsize; |
... | ... |
@@ -292,7 +291,6 @@ static av_cold int decode_init(AVCodecContext *avctx) |
292 | 292 |
{ |
293 | 293 |
DxaDecContext * const c = avctx->priv_data; |
294 | 294 |
|
295 |
- c->avctx = avctx; |
|
296 | 295 |
avctx->pix_fmt = PIX_FMT_PAL8; |
297 | 296 |
|
298 | 297 |
avcodec_get_frame_defaults(&c->pic); |
... | ... |
@@ -1399,7 +1399,8 @@ int av_find_default_stream_index(AVFormatContext *s) |
1399 | 1399 |
return -1; |
1400 | 1400 |
for(i = 0; i < s->nb_streams; i++) { |
1401 | 1401 |
st = s->streams[i]; |
1402 |
- if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO) { |
|
1402 |
+ if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO && |
|
1403 |
+ !(st->disposition & AV_DISPOSITION_ATTACHED_PIC)) { |
|
1403 | 1404 |
return i; |
1404 | 1405 |
} |
1405 | 1406 |
if (first_audio_index < 0 && st->codec->codec_type == AVMEDIA_TYPE_AUDIO) |
... | ... |
@@ -246,6 +246,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 |
246 | 246 |
CAT_UNDEF arg_name %+ %%i, w |
247 | 247 |
CAT_UNDEF arg_name %+ %%i, b |
248 | 248 |
CAT_UNDEF arg_name %+ %%i, m |
249 |
+ CAT_UNDEF arg_name %+ %%i, mp |
|
249 | 250 |
CAT_UNDEF arg_name, %%i |
250 | 251 |
%assign %%i %%i+1 |
251 | 252 |
%endrep |
... | ... |
@@ -260,6 +261,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 |
260 | 260 |
%xdefine %1w r %+ %%i %+ w |
261 | 261 |
%xdefine %1b r %+ %%i %+ b |
262 | 262 |
%xdefine %1m r %+ %%i %+ m |
263 |
+ %xdefine %1mp r %+ %%i %+ mp |
|
263 | 264 |
CAT_XDEFINE arg_name, %%i, %1 |
264 | 265 |
%assign %%i %%i+1 |
265 | 266 |
%rotate 1 |
... | ... |
@@ -48,11 +48,15 @@ SECTION .text |
48 | 48 |
; the first pixel is given in filterPos[nOutputPixel]. |
49 | 49 |
;----------------------------------------------------------------------------- |
50 | 50 |
|
51 |
-; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm |
|
52 |
-%macro SCALE_FUNC 7 |
|
53 |
-cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
|
51 |
+; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm |
|
52 |
+%macro SCALE_FUNC 6 |
|
53 |
+%ifnidn %3, X |
|
54 |
+cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1 |
|
55 |
+%else |
|
56 |
+cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize |
|
57 |
+%endif |
|
54 | 58 |
%if ARCH_X86_64 |
55 |
- movsxd r2, r2d |
|
59 |
+ movsxd wq, wd |
|
56 | 60 |
%define mov32 movsxd |
57 | 61 |
%else ; x86-32 |
58 | 62 |
%define mov32 mov |
... | ... |
@@ -60,7 +64,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
60 | 60 |
%if %2 == 19 |
61 | 61 |
%if mmsize == 8 ; mmx |
62 | 62 |
mova m2, [max_19bit_int] |
63 |
-%elifidn %5, sse4 |
|
63 |
+%elif cpuflag(sse4) |
|
64 | 64 |
mova m2, [max_19bit_int] |
65 | 65 |
%else ; ssse3/sse2 |
66 | 66 |
mova m2, [max_19bit_flt] |
... | ... |
@@ -87,48 +91,48 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
87 | 87 |
|
88 | 88 |
; setup loop |
89 | 89 |
%if %3 == 8 |
90 |
- shl r2, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter |
|
91 |
-%define r2shr 1 |
|
90 |
+ shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter |
|
91 |
+%define wshr 1 |
|
92 | 92 |
%else ; %3 == 4 |
93 |
-%define r2shr 0 |
|
93 |
+%define wshr 0 |
|
94 | 94 |
%endif ; %3 == 8 |
95 |
- lea r4, [r4+r2*8] |
|
95 |
+ lea filterq, [filterq+wq*8] |
|
96 | 96 |
%if %2 == 15 |
97 |
- lea r1, [r1+r2*(2>>r2shr)] |
|
97 |
+ lea dstq, [dstq+wq*(2>>wshr)] |
|
98 | 98 |
%else ; %2 == 19 |
99 |
- lea r1, [r1+r2*(4>>r2shr)] |
|
99 |
+ lea dstq, [dstq+wq*(4>>wshr)] |
|
100 | 100 |
%endif ; %2 == 15/19 |
101 |
- lea r5, [r5+r2*(4>>r2shr)] |
|
102 |
- neg r2 |
|
101 |
+ lea fltposq, [fltposq+wq*(4>>wshr)] |
|
102 |
+ neg wq |
|
103 | 103 |
|
104 | 104 |
.loop: |
105 | 105 |
%if %3 == 4 ; filterSize == 4 scaling |
106 | 106 |
; load 2x4 or 4x4 source pixels into m0/m1 |
107 |
- mov32 r0, dword [r5+r2*4+0] ; filterPos[0] |
|
108 |
- mov32 r6, dword [r5+r2*4+4] ; filterPos[1] |
|
109 |
- movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] |
|
107 |
+ mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0] |
|
108 |
+ mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1] |
|
109 |
+ movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}] |
|
110 | 110 |
%if mmsize == 8 |
111 |
- movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
111 |
+ movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
112 | 112 |
%else ; mmsize == 16 |
113 | 113 |
%if %1 > 8 |
114 |
- movhps m0, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
114 |
+ movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
115 | 115 |
%else ; %1 == 8 |
116 |
- movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
116 |
+ movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
117 | 117 |
%endif |
118 |
- mov32 r0, dword [r5+r2*4+8] ; filterPos[2] |
|
119 |
- mov32 r6, dword [r5+r2*4+12] ; filterPos[3] |
|
120 |
- movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] |
|
118 |
+ mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2] |
|
119 |
+ mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3] |
|
120 |
+ movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}] |
|
121 | 121 |
%if %1 > 8 |
122 |
- movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] |
|
122 |
+ movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] |
|
123 | 123 |
%else ; %1 == 8 |
124 |
- movd m5, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] |
|
124 |
+ movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}] |
|
125 | 125 |
punpckldq m0, m4 |
126 | 126 |
punpckldq m1, m5 |
127 |
-%endif ; %1 == 8 && %5 <= ssse |
|
127 |
+%endif ; %1 == 8 |
|
128 | 128 |
%endif ; mmsize == 8/16 |
129 | 129 |
%if %1 == 8 |
130 |
- punpcklbw m0, m3 ; byte -> word |
|
131 |
- punpcklbw m1, m3 ; byte -> word |
|
130 |
+ punpcklbw m0, m3 ; byte -> word |
|
131 |
+ punpcklbw m1, m3 ; byte -> word |
|
132 | 132 |
%endif ; %1 == 8 |
133 | 133 |
|
134 | 134 |
; multiply with filter coefficients |
... | ... |
@@ -137,8 +141,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
137 | 137 |
psubw m0, m6 |
138 | 138 |
psubw m1, m6 |
139 | 139 |
%endif ; %1 == 16 |
140 |
- pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] |
|
141 |
- pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] |
|
140 |
+ pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] |
|
141 |
+ pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] |
|
142 | 142 |
|
143 | 143 |
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) |
144 | 144 |
%if mmsize == 8 ; mmx |
... | ... |
@@ -146,38 +150,38 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
146 | 146 |
punpckldq m0, m1 |
147 | 147 |
punpckhdq m4, m1 |
148 | 148 |
paddd m0, m4 |
149 |
-%elifidn %5, sse2 |
|
149 |
+%elif notcpuflag(ssse3) ; sse2 |
|
150 | 150 |
mova m4, m0 |
151 | 151 |
shufps m0, m1, 10001000b |
152 | 152 |
shufps m4, m1, 11011101b |
153 | 153 |
paddd m0, m4 |
154 | 154 |
%else ; ssse3/sse4 |
155 |
- phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], |
|
156 |
- ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], |
|
157 |
- ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], |
|
158 |
- ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] |
|
155 |
+ phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], |
|
156 |
+ ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], |
|
157 |
+ ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], |
|
158 |
+ ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] |
|
159 | 159 |
%endif ; mmx/sse2/ssse3/sse4 |
160 | 160 |
%else ; %3 == 8, i.e. filterSize == 8 scaling |
161 | 161 |
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 |
162 |
- mov32 r0, dword [r5+r2*2+0] ; filterPos[0] |
|
163 |
- mov32 r6, dword [r5+r2*2+4] ; filterPos[1] |
|
164 |
- movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] |
|
162 |
+ mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0] |
|
163 |
+ mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1] |
|
164 |
+ movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] |
|
165 | 165 |
%if mmsize == 8 |
166 |
- movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] |
|
167 |
- movbh m4, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
168 |
- movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] |
|
166 |
+ movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] |
|
167 |
+ movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}] |
|
168 |
+ movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] |
|
169 | 169 |
%else ; mmsize == 16 |
170 |
- movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] |
|
171 |
- mov32 r0, dword [r5+r2*2+8] ; filterPos[2] |
|
172 |
- mov32 r6, dword [r5+r2*2+12] ; filterPos[3] |
|
173 |
- movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] |
|
174 |
- movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] |
|
170 |
+ movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] |
|
171 |
+ mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2] |
|
172 |
+ mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3] |
|
173 |
+ movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] |
|
174 |
+ movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] |
|
175 | 175 |
%endif ; mmsize == 8/16 |
176 | 176 |
%if %1 == 8 |
177 |
- punpcklbw m0, m3 ; byte -> word |
|
178 |
- punpcklbw m1, m3 ; byte -> word |
|
179 |
- punpcklbw m4, m3 ; byte -> word |
|
180 |
- punpcklbw m5, m3 ; byte -> word |
|
177 |
+ punpcklbw m0, m3 ; byte -> word |
|
178 |
+ punpcklbw m1, m3 ; byte -> word |
|
179 |
+ punpcklbw m4, m3 ; byte -> word |
|
180 |
+ punpcklbw m5, m3 ; byte -> word |
|
181 | 181 |
%endif ; %1 == 8 |
182 | 182 |
|
183 | 183 |
; multiply |
... | ... |
@@ -188,10 +192,10 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
188 | 188 |
psubw m4, m6 |
189 | 189 |
psubw m5, m6 |
190 | 190 |
%endif ; %1 == 16 |
191 |
- pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] |
|
192 |
- pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] |
|
193 |
- pmaddwd m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}] |
|
194 |
- pmaddwd m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}] |
|
191 |
+ pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}] |
|
192 |
+ pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] |
|
193 |
+ pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}] |
|
194 |
+ pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}] |
|
195 | 195 |
|
196 | 196 |
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) |
197 | 197 |
%if mmsize == 8 |
... | ... |
@@ -201,7 +205,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
201 | 201 |
punpckldq m0, m4 |
202 | 202 |
punpckhdq m1, m4 |
203 | 203 |
paddd m0, m1 |
204 |
-%elifidn %5, sse2 |
|
204 |
+%elif notcpuflag(ssse3) ; sse2 |
|
205 | 205 |
%if %1 == 8 |
206 | 206 |
%define mex m6 |
207 | 207 |
%else |
... | ... |
@@ -226,55 +230,55 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
226 | 226 |
; of 3 x phaddd here, faster on older cpus |
227 | 227 |
phaddd m0, m1 |
228 | 228 |
phaddd m4, m5 |
229 |
- phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], |
|
230 |
- ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], |
|
231 |
- ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], |
|
232 |
- ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] |
|
229 |
+ phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], |
|
230 |
+ ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], |
|
231 |
+ ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], |
|
232 |
+ ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] |
|
233 | 233 |
%endif ; mmx/sse2/ssse3/sse4 |
234 | 234 |
%endif ; %3 == 4/8 |
235 | 235 |
|
236 | 236 |
%else ; %3 == X, i.e. any filterSize scaling |
237 | 237 |
|
238 | 238 |
%ifidn %4, X4 |
239 |
-%define r6sub 4 |
|
239 |
+%define dlt 4 |
|
240 | 240 |
%else ; %4 == X || %4 == X8 |
241 |
-%define r6sub 0 |
|
241 |
+%define dlt 0 |
|
242 | 242 |
%endif ; %4 ==/!= X4 |
243 | 243 |
%if ARCH_X86_64 |
244 | 244 |
push r12 |
245 |
- movsxd r6, r6d ; filterSize |
|
246 |
- lea r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] |
|
247 |
-%define src_reg r11 |
|
248 |
-%define r1x r10 |
|
249 |
-%define filter2 r12 |
|
245 |
+%define srcq r11 |
|
246 |
+%define pos1q r10 |
|
247 |
+%define srcendq r12 |
|
248 |
+ movsxd fltsizeq, fltsized ; filterSize |
|
249 |
+ lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] |
|
250 | 250 |
%else ; x86-32 |
251 |
- lea r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] |
|
252 |
- mov r6m, r0 |
|
253 |
-%define src_reg r3 |
|
254 |
-%define r1x r1 |
|
255 |
-%define filter2 r6m |
|
251 |
+%define srcq srcmemq |
|
252 |
+%define pos1q dstq |
|
253 |
+%define srcendq r6m |
|
254 |
+ lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4] |
|
255 |
+ mov srcendq, pos0q |
|
256 | 256 |
%endif ; x86-32/64 |
257 |
- lea r5, [r5+r2*4] |
|
257 |
+ lea fltposq, [fltposq+wq*4] |
|
258 | 258 |
%if %2 == 15 |
259 |
- lea r1, [r1+r2*2] |
|
259 |
+ lea dstq, [dstq+wq*2] |
|
260 | 260 |
%else ; %2 == 19 |
261 |
- lea r1, [r1+r2*4] |
|
261 |
+ lea dstq, [dstq+wq*4] |
|
262 | 262 |
%endif ; %2 == 15/19 |
263 |
- movifnidn r1mp, r1 |
|
264 |
- neg r2 |
|
263 |
+ movifnidn dstmp, dstq |
|
264 |
+ neg wq |
|
265 | 265 |
|
266 | 266 |
.loop: |
267 |
- mov32 r0, dword [r5+r2*4+0] ; filterPos[0] |
|
268 |
- mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] |
|
267 |
+ mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0] |
|
268 |
+ mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] |
|
269 | 269 |
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? |
270 | 270 |
pxor m4, m4 |
271 | 271 |
pxor m5, m5 |
272 |
- mov src_reg, r3mp |
|
272 |
+ mov srcq, srcmemmp |
|
273 | 273 |
|
274 | 274 |
.innerloop: |
275 | 275 |
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 |
276 |
- movbh m0, [src_reg+r0 *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] |
|
277 |
- movbh m1, [src_reg+(r1x+r6sub)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] |
|
276 |
+ movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] |
|
277 |
+ movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] |
|
278 | 278 |
%if %1 == 8 |
279 | 279 |
punpcklbw m0, m3 |
280 | 280 |
punpcklbw m1, m3 |
... | ... |
@@ -286,25 +290,25 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
286 | 286 |
psubw m0, m6 |
287 | 287 |
psubw m1, m6 |
288 | 288 |
%endif ; %1 == 16 |
289 |
- pmaddwd m0, [r4 ] ; filter[{0,1,2,3(,4,5,6,7)}] |
|
290 |
- pmaddwd m1, [r4+(r6+r6sub)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}] |
|
289 |
+ pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}] |
|
290 |
+ pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}] |
|
291 | 291 |
paddd m4, m0 |
292 | 292 |
paddd m5, m1 |
293 |
- add r4, mmsize |
|
294 |
- add src_reg, srcmul*mmsize/2 |
|
295 |
- cmp src_reg, filter2 ; while (src += 4) < &src[filterSize] |
|
293 |
+ add filterq, mmsize |
|
294 |
+ add srcq, srcmul*mmsize/2 |
|
295 |
+ cmp srcq, srcendq ; while (src += 4) < &src[filterSize] |
|
296 | 296 |
jl .innerloop |
297 | 297 |
|
298 | 298 |
%ifidn %4, X4 |
299 |
- mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] |
|
300 |
- movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] |
|
301 |
- sub r1x, r6 ; and first 4 srcpx of dstpx[1] |
|
299 |
+ mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1] |
|
300 |
+ movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0] |
|
301 |
+ sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1] |
|
302 | 302 |
%if %1 > 8 |
303 |
- movhps m0, [src_reg+(r1x+r6sub)*srcmul] |
|
303 |
+ movhps m0, [srcq+(pos1q+dlt)*srcmul] |
|
304 | 304 |
%else ; %1 == 8 |
305 |
- movd m1, [src_reg+(r1x+r6sub)*srcmul] |
|
305 |
+ movd m1, [srcq+(pos1q+dlt)*srcmul] |
|
306 | 306 |
punpckldq m0, m1 |
307 |
-%endif ; %1 == 8 && %5 <= ssse |
|
307 |
+%endif ; %1 == 8 |
|
308 | 308 |
%if %1 == 8 |
309 | 309 |
punpcklbw m0, m3 |
310 | 310 |
%endif ; %1 == 8 |
... | ... |
@@ -312,10 +316,10 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
312 | 312 |
; add back 0x8000 * sum(coeffs) after the horizontal add |
313 | 313 |
psubw m0, m6 |
314 | 314 |
%endif ; %1 == 16 |
315 |
- pmaddwd m0, [r4] |
|
315 |
+ pmaddwd m0, [filterq] |
|
316 | 316 |
%endif ; %4 == X4 |
317 | 317 |
|
318 |
- lea r4, [r4+(r6+r6sub)*2] |
|
318 |
+ lea filterq, [filterq+(fltsizeq+dlt)*2] |
|
319 | 319 |
|
320 | 320 |
%if mmsize == 8 ; mmx |
321 | 321 |
movq m0, m4 |
... | ... |
@@ -323,7 +327,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
323 | 323 |
punpckhdq m0, m5 |
324 | 324 |
paddd m0, m4 |
325 | 325 |
%else ; mmsize == 16 |
326 |
-%ifidn %5, sse2 |
|
326 |
+%if notcpuflag(ssse3) ; sse2 |
|
327 | 327 |
mova m1, m4 |
328 | 328 |
punpcklqdq m4, m5 |
329 | 329 |
punpckhqdq m1, m5 |
... | ... |
@@ -334,7 +338,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
334 | 334 |
%ifidn %4, X4 |
335 | 335 |
paddd m4, m0 |
336 | 336 |
%endif ; %3 == X4 |
337 |
-%ifidn %5, sse2 |
|
337 |
+%if notcpuflag(ssse3) ; sse2 |
|
338 | 338 |
pshufd m4, m4, 11011000b |
339 | 339 |
movhlps m0, m4 |
340 | 340 |
paddd m0, m4 |
... | ... |
@@ -352,19 +356,19 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
352 | 352 |
; clip, store |
353 | 353 |
psrad m0, 14 + %1 - %2 |
354 | 354 |
%ifidn %3, X |
355 |
- movifnidn r1, r1mp |
|
355 |
+ movifnidn dstq, dstmp |
|
356 | 356 |
%endif ; %3 == X |
357 | 357 |
%if %2 == 15 |
358 | 358 |
packssdw m0, m0 |
359 | 359 |
%ifnidn %3, X |
360 |
- movh [r1+r2*(2>>r2shr)], m0 |
|
360 |
+ movh [dstq+wq*(2>>wshr)], m0 |
|
361 | 361 |
%else ; %3 == X |
362 |
- movd [r1+r2*2], m0 |
|
362 |
+ movd [dstq+wq*2], m0 |
|
363 | 363 |
%endif ; %3 ==/!= X |
364 | 364 |
%else ; %2 == 19 |
365 | 365 |
%if mmsize == 8 |
366 | 366 |
PMINSD_MMX m0, m2, m4 |
367 |
-%elifidn %5, sse4 |
|
367 |
+%elif cpuflag(sse4) |
|
368 | 368 |
pminsd m0, m2 |
369 | 369 |
%else ; sse2/ssse3 |
370 | 370 |
cvtdq2ps m0, m0 |
... | ... |
@@ -372,16 +376,16 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
372 | 372 |
cvtps2dq m0, m0 |
373 | 373 |
%endif ; mmx/sse2/ssse3/sse4 |
374 | 374 |
%ifnidn %3, X |
375 |
- mova [r1+r2*(4>>r2shr)], m0 |
|
375 |
+ mova [dstq+wq*(4>>wshr)], m0 |
|
376 | 376 |
%else ; %3 == X |
377 |
- movq [r1+r2*4], m0 |
|
377 |
+ movq [dstq+wq*4], m0 |
|
378 | 378 |
%endif ; %3 ==/!= X |
379 | 379 |
%endif ; %2 == 15/19 |
380 | 380 |
%ifnidn %3, X |
381 |
- add r2, (mmsize<<r2shr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) |
|
382 |
- ; per iteration. see "shl r2,1" above as for why we do this |
|
381 |
+ add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) |
|
382 |
+ ; per iteration. see "shl wq,1" above as for why we do this |
|
383 | 383 |
%else ; %3 == X |
384 |
- add r2, 2 |
|
384 |
+ add wq, 2 |
|
385 | 385 |
%endif ; %3 ==/!= X |
386 | 386 |
jl .loop |
387 | 387 |
%ifnidn %3, X |
... | ... |
@@ -396,39 +400,41 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 |
396 | 396 |
%endif ; %3 ==/!= X |
397 | 397 |
%endmacro |
398 | 398 |
|
399 |
-; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm |
|
400 |
-%macro SCALE_FUNCS 4 |
|
401 |
-SCALE_FUNC %1, %2, 4, 4, %3, 6, %4 |
|
402 |
-SCALE_FUNC %1, %2, 8, 8, %3, 6, %4 |
|
399 |
+; SCALE_FUNCS source_width, intermediate_nbits, n_xmm |
|
400 |
+%macro SCALE_FUNCS 3 |
|
401 |
+SCALE_FUNC %1, %2, 4, 4, 6, %3 |
|
402 |
+SCALE_FUNC %1, %2, 8, 8, 6, %3 |
|
403 | 403 |
%if mmsize == 8 |
404 |
-SCALE_FUNC %1, %2, X, X, %3, 7, %4 |
|
404 |
+SCALE_FUNC %1, %2, X, X, 7, %3 |
|
405 | 405 |
%else |
406 |
-SCALE_FUNC %1, %2, X, X4, %3, 7, %4 |
|
407 |
-SCALE_FUNC %1, %2, X, X8, %3, 7, %4 |
|
406 |
+SCALE_FUNC %1, %2, X, X4, 7, %3 |
|
407 |
+SCALE_FUNC %1, %2, X, X8, 7, %3 |
|
408 | 408 |
%endif |
409 | 409 |
%endmacro |
410 | 410 |
|
411 |
-; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args |
|
412 |
-%macro SCALE_FUNCS2 4 |
|
413 |
-%ifnidn %1, sse4 |
|
414 |
-SCALE_FUNCS 8, 15, %1, %2 |
|
415 |
-SCALE_FUNCS 9, 15, %1, %3 |
|
416 |
-SCALE_FUNCS 10, 15, %1, %3 |
|
417 |
-SCALE_FUNCS 14, 15, %1, %3 |
|
418 |
-SCALE_FUNCS 16, 15, %1, %4 |
|
411 |
+; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args |
|
412 |
+%macro SCALE_FUNCS2 3 |
|
413 |
+%if notcpuflag(sse4) |
|
414 |
+SCALE_FUNCS 8, 15, %1 |
|
415 |
+SCALE_FUNCS 9, 15, %2 |
|
416 |
+SCALE_FUNCS 10, 15, %2 |
|
417 |
+SCALE_FUNCS 14, 15, %2 |
|
418 |
+SCALE_FUNCS 16, 15, %3 |
|
419 | 419 |
%endif ; !sse4 |
420 |
-SCALE_FUNCS 8, 19, %1, %2 |
|
421 |
-SCALE_FUNCS 9, 19, %1, %3 |
|
422 |
-SCALE_FUNCS 10, 19, %1, %3 |
|
423 |
-SCALE_FUNCS 14, 19, %1, %3 |
|
424 |
-SCALE_FUNCS 16, 19, %1, %4 |
|
420 |
+SCALE_FUNCS 8, 19, %1 |
|
421 |
+SCALE_FUNCS 9, 19, %2 |
|
422 |
+SCALE_FUNCS 10, 19, %2 |
|
423 |
+SCALE_FUNCS 14, 19, %2 |
|
424 |
+SCALE_FUNCS 16, 19, %3 |
|
425 | 425 |
%endmacro |
426 | 426 |
|
427 | 427 |
%if ARCH_X86_32 |
428 |
-INIT_MMX |
|
429 |
-SCALE_FUNCS2 mmx, 0, 0, 0 |
|
428 |
+INIT_MMX mmx |
|
429 |
+SCALE_FUNCS2 0, 0, 0 |
|
430 | 430 |
%endif |
431 |
-INIT_XMM |
|
432 |
-SCALE_FUNCS2 sse2, 6, 7, 8 |
|
433 |
-SCALE_FUNCS2 ssse3, 6, 6, 8 |
|
434 |
-SCALE_FUNCS2 sse4, 6, 6, 8 |
|
431 |
+INIT_XMM sse2 |
|
432 |
+SCALE_FUNCS2 6, 7, 8 |
|
433 |
+INIT_XMM ssse3 |
|
434 |
+SCALE_FUNCS2 6, 6, 8 |
|
435 |
+INIT_XMM sse4 |
|
436 |
+SCALE_FUNCS2 6, 6, 8 |