Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org>
Yogender Gupta authored on 2017/05/11 02:18:16... | ... |
@@ -10,6 +10,7 @@ version <next>: |
10 | 10 |
- config.log and other configuration files moved into ffbuild/ directory |
11 | 11 |
- update cuvid/nvenc headers to Video Codec SDK 8.0.14 |
12 | 12 |
- afir audio filter |
13 |
+- scale_cuda CUDA based video scale filter |
|
13 | 14 |
|
14 | 15 |
version 3.3: |
15 | 16 |
- CrystalHD decoder moved to new decode API |
... | ... |
@@ -267,6 +267,7 @@ OBJS-$(CONFIG_REVERSE_FILTER) += f_reverse.o |
267 | 267 |
OBJS-$(CONFIG_ROTATE_FILTER) += vf_rotate.o |
268 | 268 |
OBJS-$(CONFIG_SAB_FILTER) += vf_sab.o |
269 | 269 |
OBJS-$(CONFIG_SCALE_FILTER) += vf_scale.o scale.o |
270 |
+OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o vf_scale_cuda.ptx.o |
|
270 | 271 |
OBJS-$(CONFIG_SCALE_NPP_FILTER) += vf_scale_npp.o scale.o |
271 | 272 |
OBJS-$(CONFIG_SCALE_QSV_FILTER) += vf_scale_qsv.o |
272 | 273 |
OBJS-$(CONFIG_SCALE_VAAPI_FILTER) += vf_scale_vaapi.o scale.o |
... | ... |
@@ -278,6 +278,7 @@ static void register_all(void) |
278 | 278 |
REGISTER_FILTER(ROTATE, rotate, vf); |
279 | 279 |
REGISTER_FILTER(SAB, sab, vf); |
280 | 280 |
REGISTER_FILTER(SCALE, scale, vf); |
281 |
+ REGISTER_FILTER(SCALE_CUDA, scale_cuda, vf); |
|
281 | 282 |
REGISTER_FILTER(SCALE_NPP, scale_npp, vf); |
282 | 283 |
REGISTER_FILTER(SCALE_QSV, scale_qsv, vf); |
283 | 284 |
REGISTER_FILTER(SCALE_VAAPI, scale_vaapi, vf); |
... | ... |
@@ -31,7 +31,7 @@ |
31 | 31 |
|
32 | 32 |
#define LIBAVFILTER_VERSION_MAJOR 6 |
33 | 33 |
#define LIBAVFILTER_VERSION_MINOR 89 |
34 |
-#define LIBAVFILTER_VERSION_MICRO 100 |
|
34 |
+#define LIBAVFILTER_VERSION_MICRO 101 |
|
35 | 35 |
|
36 | 36 |
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ |
37 | 37 |
LIBAVFILTER_VERSION_MINOR, \ |
38 | 38 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,556 @@ |
0 |
+/* |
|
1 |
+* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. |
|
2 |
+* |
|
3 |
+* Permission is hereby granted, free of charge, to any person obtaining a |
|
4 |
+* copy of this software and associated documentation files (the "Software"), |
|
5 |
+* to deal in the Software without restriction, including without limitation |
|
6 |
+* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
7 |
+* and/or sell copies of the Software, and to permit persons to whom the |
|
8 |
+* Software is furnished to do so, subject to the following conditions: |
|
9 |
+* |
|
10 |
+* The above copyright notice and this permission notice shall be included in |
|
11 |
+* all copies or substantial portions of the Software. |
|
12 |
+* |
|
13 |
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
14 |
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
15 |
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
16 |
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
17 |
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
18 |
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
19 |
+* DEALINGS IN THE SOFTWARE. |
|
20 |
+*/ |
|
21 |
+ |
|
22 |
+#include <cuda.h> |
|
23 |
+#include <stdio.h> |
|
24 |
+#include <string.h> |
|
25 |
+ |
|
26 |
+#include "libavutil/avstring.h" |
|
27 |
+#include "libavutil/common.h" |
|
28 |
+#include "libavutil/hwcontext.h" |
|
29 |
+#include "libavutil/hwcontext_cuda_internal.h" |
|
30 |
+#include "libavutil/internal.h" |
|
31 |
+#include "libavutil/opt.h" |
|
32 |
+#include "libavutil/pixdesc.h" |
|
33 |
+ |
|
34 |
+#include "avfilter.h" |
|
35 |
+#include "formats.h" |
|
36 |
+#include "internal.h" |
|
37 |
+#include "scale.h" |
|
38 |
+#include "video.h" |
|
39 |
+ |
|
40 |
/* Software pixel formats accepted by format_is_supported() for both the
 * input and the output side of the scaler. */
static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016
};
|
47 |
+ |
|
48 |
/* Integer division of a by b, rounded up. */
#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
/* Round a up to a multiple of b; the mask form only works when b is a
 * non-zero power of two (b == 0 yields 0). */
#define ALIGN_UP(a, b) (((a) + (b) - 1) & ~((b) - 1))
/* Not referenced anywhere in this file. */
#define NUM_BUFFERS 2
/* Thread-block dimensions used by call_resize_kernel() when launching. */
#define BLOCKX 32
#define BLOCKY 16
|
53 |
+ |
|
54 |
/* Private state of the scale_cuda filter. */
typedef struct CUDAScaleContext {
    const AVClass *class;
    /* Software formats of the input/output frames; set in init_processing_chain(). */
    enum AVPixelFormat in_fmt;
    enum AVPixelFormat out_fmt;

    /* Per-plane dimensions. Index 0 is set by init_processing_chain(); the
     * remaining entries are derived from it in init_stage() using the chroma
     * subsampling shifts of the respective format. */
    struct {
        int width;
        int height;
    } planes_in[3], planes_out[3];

    /* Output hardware frames context created in init_stage(). */
    AVBufferRef *frames_ctx;
    /* Pre-allocated output frame that the next resize writes into. */
    AVFrame     *frame;

    /* Spare frame used by cudascale_scale() to refill `frame`. */
    AVFrame     *tmp_frame;
    /* Set to 1 when input and output dimensions match; not read in this file. */
    int passthrough;

    /**
     * Output sw format. AV_PIX_FMT_NONE for no conversion.
     */
    enum AVPixelFormat format;

    char *w_expr;               ///< width expression string
    char *h_expr;               ///< height expression string

    /* cu_ctx and cu_event are not assigned anywhere in this file. */
    CUcontext   cu_ctx;
    CUevent     cu_event;
    /* Module loaded from the embedded PTX in cudascale_config_props(). */
    CUmodule    cu_module;
    /* Kernel handles, one per element type (Subsample_Bilinear_<type>). */
    CUfunction  cu_func_uchar;
    CUfunction  cu_func_uchar2;
    CUfunction  cu_func_uchar4;
    CUfunction  cu_func_ushort;
    CUfunction  cu_func_ushort2;
    CUfunction  cu_func_ushort4;
    /* Texture references matching the kernels above (<type>_tex). */
    CUtexref    cu_tex_uchar;
    CUtexref    cu_tex_uchar2;
    CUtexref    cu_tex_uchar4;
    CUtexref    cu_tex_ushort;
    CUtexref    cu_tex_ushort2;
    CUtexref    cu_tex_ushort4;

    /* srcBuffer and dstBuffer are not referenced anywhere in this file. */
    CUdeviceptr srcBuffer;
    CUdeviceptr dstBuffer;
    /* NOTE(review): no assignment to tex_alignment is visible in this file. */
    int         tex_alignment;
} CUDAScaleContext;
|
98 |
+ |
|
99 |
+static av_cold int cudascale_init(AVFilterContext *ctx) |
|
100 |
+{ |
|
101 |
+ CUDAScaleContext *s = ctx->priv; |
|
102 |
+ |
|
103 |
+ s->format = AV_PIX_FMT_NONE; |
|
104 |
+ s->frame = av_frame_alloc(); |
|
105 |
+ if (!s->frame) |
|
106 |
+ return AVERROR(ENOMEM); |
|
107 |
+ |
|
108 |
+ s->tmp_frame = av_frame_alloc(); |
|
109 |
+ if (!s->tmp_frame) |
|
110 |
+ return AVERROR(ENOMEM); |
|
111 |
+ |
|
112 |
+ return 0; |
|
113 |
+} |
|
114 |
+ |
|
115 |
+static av_cold void cudascale_uninit(AVFilterContext *ctx) |
|
116 |
+{ |
|
117 |
+ CUDAScaleContext *s = ctx->priv; |
|
118 |
+ |
|
119 |
+ av_frame_free(&s->frame); |
|
120 |
+ av_buffer_unref(&s->frames_ctx); |
|
121 |
+ av_frame_free(&s->tmp_frame); |
|
122 |
+} |
|
123 |
+ |
|
124 |
+static int cudascale_query_formats(AVFilterContext *ctx) |
|
125 |
+{ |
|
126 |
+ static const enum AVPixelFormat pixel_formats[] = { |
|
127 |
+ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE, |
|
128 |
+ }; |
|
129 |
+ AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats); |
|
130 |
+ |
|
131 |
+ return ff_set_common_formats(ctx, pix_fmts); |
|
132 |
+} |
|
133 |
+ |
|
134 |
+static av_cold int init_stage(CUDAScaleContext *s, AVBufferRef *device_ctx) |
|
135 |
+{ |
|
136 |
+ AVBufferRef *out_ref = NULL; |
|
137 |
+ AVHWFramesContext *out_ctx; |
|
138 |
+ int in_sw, in_sh, out_sw, out_sh; |
|
139 |
+ int ret, i; |
|
140 |
+ |
|
141 |
+ av_pix_fmt_get_chroma_sub_sample(s->in_fmt, &in_sw, &in_sh); |
|
142 |
+ av_pix_fmt_get_chroma_sub_sample(s->out_fmt, &out_sw, &out_sh); |
|
143 |
+ if (!s->planes_out[0].width) { |
|
144 |
+ s->planes_out[0].width = s->planes_in[0].width; |
|
145 |
+ s->planes_out[0].height = s->planes_in[0].height; |
|
146 |
+ } |
|
147 |
+ |
|
148 |
+ for (i = 1; i < FF_ARRAY_ELEMS(s->planes_in); i++) { |
|
149 |
+ s->planes_in[i].width = s->planes_in[0].width >> in_sw; |
|
150 |
+ s->planes_in[i].height = s->planes_in[0].height >> in_sh; |
|
151 |
+ s->planes_out[i].width = s->planes_out[0].width >> out_sw; |
|
152 |
+ s->planes_out[i].height = s->planes_out[0].height >> out_sh; |
|
153 |
+ } |
|
154 |
+ |
|
155 |
+ out_ref = av_hwframe_ctx_alloc(device_ctx); |
|
156 |
+ if (!out_ref) |
|
157 |
+ return AVERROR(ENOMEM); |
|
158 |
+ out_ctx = (AVHWFramesContext*)out_ref->data; |
|
159 |
+ |
|
160 |
+ out_ctx->format = AV_PIX_FMT_CUDA; |
|
161 |
+ out_ctx->sw_format = s->out_fmt; |
|
162 |
+ out_ctx->width = FFALIGN(s->planes_out[0].width, 32); |
|
163 |
+ out_ctx->height = FFALIGN(s->planes_out[0].height, 32); |
|
164 |
+ |
|
165 |
+ ret = av_hwframe_ctx_init(out_ref); |
|
166 |
+ if (ret < 0) |
|
167 |
+ goto fail; |
|
168 |
+ |
|
169 |
+ av_frame_unref(s->frame); |
|
170 |
+ ret = av_hwframe_get_buffer(out_ref, s->frame, 0); |
|
171 |
+ if (ret < 0) |
|
172 |
+ goto fail; |
|
173 |
+ |
|
174 |
+ s->frame->width = s->planes_out[0].width; |
|
175 |
+ s->frame->height = s->planes_out[0].height; |
|
176 |
+ |
|
177 |
+ av_buffer_unref(&s->frames_ctx); |
|
178 |
+ s->frames_ctx = out_ref; |
|
179 |
+ |
|
180 |
+ return 0; |
|
181 |
+fail: |
|
182 |
+ av_buffer_unref(&out_ref); |
|
183 |
+ return ret; |
|
184 |
+} |
|
185 |
+ |
|
186 |
+static int format_is_supported(enum AVPixelFormat fmt) |
|
187 |
+{ |
|
188 |
+ int i; |
|
189 |
+ |
|
190 |
+ for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) |
|
191 |
+ if (supported_formats[i] == fmt) |
|
192 |
+ return 1; |
|
193 |
+ return 0; |
|
194 |
+} |
|
195 |
+ |
|
196 |
+static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height, |
|
197 |
+ int out_width, int out_height) |
|
198 |
+{ |
|
199 |
+ CUDAScaleContext *s = ctx->priv; |
|
200 |
+ |
|
201 |
+ AVHWFramesContext *in_frames_ctx; |
|
202 |
+ |
|
203 |
+ enum AVPixelFormat in_format; |
|
204 |
+ enum AVPixelFormat out_format; |
|
205 |
+ int ret; |
|
206 |
+ |
|
207 |
+ /* check that we have a hw context */ |
|
208 |
+ if (!ctx->inputs[0]->hw_frames_ctx) { |
|
209 |
+ av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); |
|
210 |
+ return AVERROR(EINVAL); |
|
211 |
+ } |
|
212 |
+ in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data; |
|
213 |
+ in_format = in_frames_ctx->sw_format; |
|
214 |
+ out_format = (s->format == AV_PIX_FMT_NONE) ? in_format : s->format; |
|
215 |
+ |
|
216 |
+ if (!format_is_supported(in_format)) { |
|
217 |
+ av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", |
|
218 |
+ av_get_pix_fmt_name(in_format)); |
|
219 |
+ return AVERROR(ENOSYS); |
|
220 |
+ } |
|
221 |
+ if (!format_is_supported(out_format)) { |
|
222 |
+ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", |
|
223 |
+ av_get_pix_fmt_name(out_format)); |
|
224 |
+ return AVERROR(ENOSYS); |
|
225 |
+ } |
|
226 |
+ |
|
227 |
+ if (in_width == out_width && in_height == out_height) |
|
228 |
+ s->passthrough = 1; |
|
229 |
+ |
|
230 |
+ s->in_fmt = in_format; |
|
231 |
+ s->out_fmt = out_format; |
|
232 |
+ |
|
233 |
+ s->planes_in[0].width = in_width; |
|
234 |
+ s->planes_in[0].height = in_height; |
|
235 |
+ s->planes_out[0].width = out_width; |
|
236 |
+ s->planes_out[0].height = out_height; |
|
237 |
+ |
|
238 |
+ ret = init_stage(s, in_frames_ctx->device_ref); |
|
239 |
+ if (ret < 0) |
|
240 |
+ return ret; |
|
241 |
+ |
|
242 |
+ ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx); |
|
243 |
+ if (!ctx->outputs[0]->hw_frames_ctx) |
|
244 |
+ return AVERROR(ENOMEM); |
|
245 |
+ |
|
246 |
+ return 0; |
|
247 |
+} |
|
248 |
+ |
|
249 |
+static av_cold int cudascale_config_props(AVFilterLink *outlink) |
|
250 |
+{ |
|
251 |
+ AVFilterContext *ctx = outlink->src; |
|
252 |
+ AVFilterLink *inlink = outlink->src->inputs[0]; |
|
253 |
+ CUDAScaleContext *s = ctx->priv; |
|
254 |
+ AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; |
|
255 |
+ AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; |
|
256 |
+ CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; |
|
257 |
+ CUresult err; |
|
258 |
+ int w, h; |
|
259 |
+ int ret; |
|
260 |
+ |
|
261 |
+ extern char vf_scale_cuda_ptx[]; |
|
262 |
+ |
|
263 |
+ err = cuCtxPushCurrent(cuda_ctx); |
|
264 |
+ if (err != CUDA_SUCCESS) { |
|
265 |
+ av_log(ctx, AV_LOG_ERROR, "Error pushing cuda context\n"); |
|
266 |
+ ret = AVERROR_UNKNOWN; |
|
267 |
+ goto fail; |
|
268 |
+ } |
|
269 |
+ |
|
270 |
+ err = cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx); |
|
271 |
+ if (err != CUDA_SUCCESS) { |
|
272 |
+ av_log(ctx, AV_LOG_ERROR, "Error loading module data\n"); |
|
273 |
+ ret = AVERROR_UNKNOWN; |
|
274 |
+ goto fail; |
|
275 |
+ } |
|
276 |
+ |
|
277 |
+ cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"); |
|
278 |
+ cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"); |
|
279 |
+ cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"); |
|
280 |
+ cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"); |
|
281 |
+ cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"); |
|
282 |
+ cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"); |
|
283 |
+ |
|
284 |
+ cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"); |
|
285 |
+ cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"); |
|
286 |
+ cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"); |
|
287 |
+ cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"); |
|
288 |
+ cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"); |
|
289 |
+ cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"); |
|
290 |
+ |
|
291 |
+ cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER); |
|
292 |
+ cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER); |
|
293 |
+ cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER); |
|
294 |
+ cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER); |
|
295 |
+ cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER); |
|
296 |
+ cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER); |
|
297 |
+ |
|
298 |
+ cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR); |
|
299 |
+ cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR); |
|
300 |
+ cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR); |
|
301 |
+ cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR); |
|
302 |
+ cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR); |
|
303 |
+ cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR); |
|
304 |
+ |
|
305 |
+ cuCtxPopCurrent(&dummy); |
|
306 |
+ |
|
307 |
+ if ((ret = ff_scale_eval_dimensions(s, |
|
308 |
+ s->w_expr, s->h_expr, |
|
309 |
+ inlink, outlink, |
|
310 |
+ &w, &h)) < 0) |
|
311 |
+ goto fail; |
|
312 |
+ |
|
313 |
+ if (((int64_t)h * inlink->w) > INT_MAX || |
|
314 |
+ ((int64_t)w * inlink->h) > INT_MAX) |
|
315 |
+ av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n"); |
|
316 |
+ |
|
317 |
+ outlink->w = w; |
|
318 |
+ outlink->h = h; |
|
319 |
+ |
|
320 |
+ ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h); |
|
321 |
+ if (ret < 0) |
|
322 |
+ return ret; |
|
323 |
+ |
|
324 |
+ av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n", |
|
325 |
+ inlink->w, inlink->h, outlink->w, outlink->h); |
|
326 |
+ |
|
327 |
+ if (inlink->sample_aspect_ratio.num) { |
|
328 |
+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w, |
|
329 |
+ outlink->w*inlink->h}, |
|
330 |
+ inlink->sample_aspect_ratio); |
|
331 |
+ } else { |
|
332 |
+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; |
|
333 |
+ } |
|
334 |
+ |
|
335 |
+ return 0; |
|
336 |
+ |
|
337 |
+fail: |
|
338 |
+ return ret; |
|
339 |
+} |
|
340 |
+ |
|
341 |
+static int call_resize_kernel(CUDAScaleContext *s, CUfunction func, CUtexref tex, int channels, |
|
342 |
+ uint8_t *src_dptr, int src_width, int src_height, int src_pitch, |
|
343 |
+ uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, |
|
344 |
+ int pixel_size) |
|
345 |
+{ |
|
346 |
+ CUdeviceptr src_devptr = (CUdeviceptr)src_dptr; |
|
347 |
+ CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; |
|
348 |
+ void *args_uchar[] = { &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; |
|
349 |
+ CUDA_ARRAY_DESCRIPTOR desc; |
|
350 |
+ |
|
351 |
+ desc.Width = src_width; |
|
352 |
+ desc.Height = src_height; |
|
353 |
+ desc.NumChannels = channels; |
|
354 |
+ if (pixel_size == 1) { |
|
355 |
+ desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; |
|
356 |
+ } else { |
|
357 |
+ desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; |
|
358 |
+ } |
|
359 |
+ |
|
360 |
+ cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch * pixel_size); |
|
361 |
+ cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args_uchar, NULL); |
|
362 |
+ |
|
363 |
+ return 0; |
|
364 |
+} |
|
365 |
+ |
|
366 |
+static int scalecuda_resize(AVFilterContext *ctx, |
|
367 |
+ AVFrame *out, AVFrame *in) |
|
368 |
+{ |
|
369 |
+ AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data; |
|
370 |
+ CUDAScaleContext *s = ctx->priv; |
|
371 |
+ |
|
372 |
+ switch (in_frames_ctx->sw_format) { |
|
373 |
+ case AV_PIX_FMT_YUV420P: |
|
374 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
375 |
+ in->data[0], in->width, in->height, in->linesize[0], |
|
376 |
+ out->data[0], out->width, out->height, out->linesize[0], |
|
377 |
+ 1); |
|
378 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
379 |
+ in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2, |
|
380 |
+ out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2, |
|
381 |
+ 1); |
|
382 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
383 |
+ in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2, |
|
384 |
+ out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2, |
|
385 |
+ 1); |
|
386 |
+ break; |
|
387 |
+ case AV_PIX_FMT_YUV444P: |
|
388 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
389 |
+ in->data[0], in->width, in->height, in->linesize[0], |
|
390 |
+ out->data[0], out->width, out->height, out->linesize[0], |
|
391 |
+ 1); |
|
392 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
393 |
+ in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0], |
|
394 |
+ out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0], |
|
395 |
+ 1); |
|
396 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
397 |
+ in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0], |
|
398 |
+ out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0], |
|
399 |
+ 1); |
|
400 |
+ break; |
|
401 |
+ case AV_PIX_FMT_NV12: |
|
402 |
+ call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, |
|
403 |
+ in->data[0], in->width, in->height, in->linesize[0], |
|
404 |
+ out->data[0], out->width, out->height, out->linesize[0], |
|
405 |
+ 1); |
|
406 |
+ call_resize_kernel(s, s->cu_func_uchar2, s->cu_tex_uchar2, 2, |
|
407 |
+ in->data[1], in->width/2, in->height/2, in->linesize[1], |
|
408 |
+ out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2, |
|
409 |
+ 1); |
|
410 |
+ break; |
|
411 |
+ case AV_PIX_FMT_P010LE: |
|
412 |
+ call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, |
|
413 |
+ in->data[0], in->width, in->height, in->linesize[0]/2, |
|
414 |
+ out->data[0], out->width, out->height, out->linesize[0]/2, |
|
415 |
+ 2); |
|
416 |
+ call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, |
|
417 |
+ in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2, |
|
418 |
+ out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, |
|
419 |
+ 2); |
|
420 |
+ break; |
|
421 |
+ case AV_PIX_FMT_P016LE: |
|
422 |
+ call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, |
|
423 |
+ in->data[0], in->width, in->height, in->linesize[0] / 2, |
|
424 |
+ out->data[0], out->width, out->height, out->linesize[0] / 2, |
|
425 |
+ 2); |
|
426 |
+ call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, |
|
427 |
+ in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2, |
|
428 |
+ out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, |
|
429 |
+ 2); |
|
430 |
+ break; |
|
431 |
+ default: |
|
432 |
+ return AVERROR_BUG; |
|
433 |
+ } |
|
434 |
+ |
|
435 |
+ return 0; |
|
436 |
+} |
|
437 |
+ |
|
438 |
+static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) |
|
439 |
+{ |
|
440 |
+ CUDAScaleContext *s = ctx->priv; |
|
441 |
+ AVFrame *src = in; |
|
442 |
+ int ret; |
|
443 |
+ |
|
444 |
+ ret = scalecuda_resize(ctx, s->frame, src); |
|
445 |
+ if (ret < 0) |
|
446 |
+ return ret; |
|
447 |
+ |
|
448 |
+ src = s->frame; |
|
449 |
+ ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); |
|
450 |
+ if (ret < 0) |
|
451 |
+ return ret; |
|
452 |
+ |
|
453 |
+ av_frame_move_ref(out, s->frame); |
|
454 |
+ av_frame_move_ref(s->frame, s->tmp_frame); |
|
455 |
+ |
|
456 |
+ ret = av_frame_copy_props(out, in); |
|
457 |
+ if (ret < 0) |
|
458 |
+ return ret; |
|
459 |
+ |
|
460 |
+ return 0; |
|
461 |
+} |
|
462 |
+ |
|
463 |
+static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) |
|
464 |
+{ |
|
465 |
+ AVFilterContext *ctx = link->dst; |
|
466 |
+ CUDAScaleContext *s = ctx->priv; |
|
467 |
+ AVFilterLink *outlink = ctx->outputs[0]; |
|
468 |
+ AVHWFramesContext *frames_ctx = (AVHWFramesContext*)s->frames_ctx->data; |
|
469 |
+ AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; |
|
470 |
+ |
|
471 |
+ AVFrame *out = NULL; |
|
472 |
+ CUresult err; |
|
473 |
+ CUcontext dummy; |
|
474 |
+ int ret = 0; |
|
475 |
+ |
|
476 |
+ out = av_frame_alloc(); |
|
477 |
+ if (!out) { |
|
478 |
+ ret = AVERROR(ENOMEM); |
|
479 |
+ goto fail; |
|
480 |
+ } |
|
481 |
+ |
|
482 |
+ err = cuCtxPushCurrent(device_hwctx->cuda_ctx); |
|
483 |
+ if (err != CUDA_SUCCESS) { |
|
484 |
+ ret = AVERROR_UNKNOWN; |
|
485 |
+ goto fail; |
|
486 |
+ } |
|
487 |
+ |
|
488 |
+ ret = cudascale_scale(ctx, out, in); |
|
489 |
+ |
|
490 |
+ cuCtxPopCurrent(&dummy); |
|
491 |
+ if (ret < 0) |
|
492 |
+ goto fail; |
|
493 |
+ |
|
494 |
+ av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, |
|
495 |
+ (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, |
|
496 |
+ (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, |
|
497 |
+ INT_MAX); |
|
498 |
+ |
|
499 |
+ av_frame_free(&in); |
|
500 |
+ return ff_filter_frame(outlink, out); |
|
501 |
+fail: |
|
502 |
+ av_frame_free(&in); |
|
503 |
+ av_frame_free(&out); |
|
504 |
+ return ret; |
|
505 |
+} |
|
506 |
+ |
|
507 |
/* Offset of an option's backing field inside CUDAScaleContext. */
#define OFFSET(x) offsetof(CUDAScaleContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
/* User options: output width and height expressions, defaulting to the
 * input dimensions ("iw" / "ih"). */
static const AVOption options[] = {
    { "w", "Output video width", OFFSET(w_expr), AV_OPT_TYPE_STRING, { .str = "iw" }, .flags = FLAGS },
    { "h", "Output video height", OFFSET(h_expr), AV_OPT_TYPE_STRING, { .str = "ih" }, .flags = FLAGS },
    { NULL },
};
|
514 |
+ |
|
515 |
/* AVClass describing the filter's private context; ties in the option table. */
static const AVClass cudascale_class = {
    .class_name = "cudascale",
    .item_name = av_default_item_name,
    .option = options,
    .version = LIBAVUTIL_VERSION_INT,
};
|
521 |
+ |
|
522 |
/* Single video input pad; frames arriving on it go to cudascale_filter_frame(). */
static const AVFilterPad cudascale_inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
        .filter_frame = cudascale_filter_frame,
    },
    { NULL }
};
|
530 |
+ |
|
531 |
/* Single video output pad; its link is configured by cudascale_config_props(). */
static const AVFilterPad cudascale_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
        .config_props = cudascale_config_props,
    },
    { NULL }
};
|
539 |
+ |
|
540 |
/* Filter definition for "scale_cuda": wires the callbacks, private context,
 * option class and pads defined above. */
AVFilter ff_vf_scale_cuda = {
    .name = "scale_cuda",
    .description = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer"),

    .init = cudascale_init,
    .uninit = cudascale_uninit,
    .query_formats = cudascale_query_formats,

    .priv_size = sizeof(CUDAScaleContext),
    .priv_class = &cudascale_class,

    .inputs = cudascale_inputs,
    .outputs = cudascale_outputs,

    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};
0 | 556 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,212 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. |
|
2 |
+ * |
|
3 |
+ * Permission is hereby granted, free of charge, to any person obtaining a |
|
4 |
+ * copy of this software and associated documentation files (the "Software"), |
|
5 |
+ * to deal in the Software without restriction, including without limitation |
|
6 |
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
7 |
+ * and/or sell copies of the Software, and to permit persons to whom the |
|
8 |
+ * Software is furnished to do so, subject to the following conditions: |
|
9 |
+ * |
|
10 |
+ * The above copyright notice and this permission notice shall be included in |
|
11 |
+ * all copies or substantial portions of the Software. |
|
12 |
+ * |
|
13 |
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
14 |
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
15 |
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
16 |
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
17 |
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
18 |
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
19 |
+ * DEALINGS IN THE SOFTWARE. |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+extern "C" { |
|
23 |
+ |
|
24 |
// 2D texture references, one per element type. Each Subsample_Bilinear_<type>
// kernel below samples its source plane through the matching <type>_tex.
texture<unsigned char, 2> uchar_tex;
texture<uchar2, 2> uchar2_tex;
texture<uchar4, 2> uchar4_tex;
texture<unsigned short, 2> ushort_tex;
texture<ushort2, 2> ushort2_tex;
texture<ushort4, 2> ushort4_tex;
|
30 |
+ |
|
31 |
// Resize a single-channel 8-bit plane sampled through uchar_tex.
// One thread produces one destination element: it averages, with rounding,
// four texture fetches placed symmetrically around the source position that
// corresponds to the element centre. dst_pitch is the destination row stride
// in elements.
__global__ void Subsample_Bilinear_uchar(unsigned char *dst,
                                         int dst_width, int dst_height, int dst_pitch,
                                         int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    int sum = 2; // rounding term for the division by four
    sum += (int)tex2D(uchar_tex, src_x - off_x, src_y - off_y);
    sum += (int)tex2D(uchar_tex, src_x + off_x, src_y - off_y);
    sum += (int)tex2D(uchar_tex, src_x - off_x, src_y + off_y);
    sum += (int)tex2D(uchar_tex, src_x + off_x, src_y + off_y);

    dst[row * dst_pitch + col] = (unsigned char)(sum >> 2);
}
|
57 |
+ |
|
58 |
// Resize a two-channel 8-bit plane sampled through uchar2_tex.
// One thread produces one destination element: per channel it averages, with
// rounding, four texture fetches placed symmetrically around the source
// position of the element centre. dst_pitch2 is the destination row stride in
// uchar2 elements.
__global__ void Subsample_Bilinear_uchar2(uchar2 *dst,
                                          int dst_width, int dst_height, int dst_pitch2,
                                          int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    const uchar2 s00 = tex2D(uchar2_tex, src_x - off_x, src_y - off_y);
    const uchar2 s10 = tex2D(uchar2_tex, src_x + off_x, src_y - off_y);
    const uchar2 s01 = tex2D(uchar2_tex, src_x - off_x, src_y + off_y);
    const uchar2 s11 = tex2D(uchar2_tex, src_x + off_x, src_y + off_y);

    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;

    dst[row * dst_pitch2 + col] = make_uchar2((unsigned char)avg_x, (unsigned char)avg_y);
}
|
87 |
+ |
|
88 |
// Resize a four-channel 8-bit plane sampled through uchar4_tex.
// One thread produces one destination element: per channel it averages, with
// rounding, four texture fetches placed symmetrically around the source
// position of the element centre. dst_pitch is the destination row stride in
// uchar4 elements.
__global__ void Subsample_Bilinear_uchar4(uchar4 *dst,
                                          int dst_width, int dst_height, int dst_pitch,
                                          int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    const uchar4 s00 = tex2D(uchar4_tex, src_x - off_x, src_y - off_y);
    const uchar4 s10 = tex2D(uchar4_tex, src_x + off_x, src_y - off_y);
    const uchar4 s01 = tex2D(uchar4_tex, src_x - off_x, src_y + off_y);
    const uchar4 s11 = tex2D(uchar4_tex, src_x + off_x, src_y + off_y);

    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;
    const int avg_z = ((int)s00.z + (int)s10.z + (int)s01.z + (int)s11.z + 2) >> 2;
    const int avg_w = ((int)s00.w + (int)s10.w + (int)s01.w + (int)s11.w + 2) >> 2;

    dst[row * dst_pitch + col] = make_uchar4(
        (unsigned char)avg_x, (unsigned char)avg_y, (unsigned char)avg_z, (unsigned char)avg_w);
}
|
120 |
+ |
|
121 |
// Resize a single-channel 16-bit plane sampled through ushort_tex.
// One thread produces one destination element: it averages, with rounding,
// four texture fetches placed symmetrically around the source position that
// corresponds to the element centre. dst_pitch is the destination row stride
// in unsigned short elements.
__global__ void Subsample_Bilinear_ushort(unsigned short *dst,
                                          int dst_width, int dst_height, int dst_pitch,
                                          int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    int sum = 2; // rounding term for the division by four
    sum += (int)tex2D(ushort_tex, src_x - off_x, src_y - off_y);
    sum += (int)tex2D(ushort_tex, src_x + off_x, src_y - off_y);
    sum += (int)tex2D(ushort_tex, src_x - off_x, src_y + off_y);
    sum += (int)tex2D(ushort_tex, src_x + off_x, src_y + off_y);

    dst[row * dst_pitch + col] = (unsigned short)(sum >> 2);
}
|
147 |
+ |
|
148 |
// Resize a two-channel 16-bit plane sampled through ushort2_tex.
// One thread produces one destination element: per channel it averages, with
// rounding, four texture fetches placed symmetrically around the source
// position of the element centre. dst_pitch2 is the destination row stride in
// ushort2 elements.
__global__ void Subsample_Bilinear_ushort2(ushort2 *dst,
                                           int dst_width, int dst_height, int dst_pitch2,
                                           int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    const ushort2 s00 = tex2D(ushort2_tex, src_x - off_x, src_y - off_y);
    const ushort2 s10 = tex2D(ushort2_tex, src_x + off_x, src_y - off_y);
    const ushort2 s01 = tex2D(ushort2_tex, src_x - off_x, src_y + off_y);
    const ushort2 s11 = tex2D(ushort2_tex, src_x + off_x, src_y + off_y);

    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;

    dst[row * dst_pitch2 + col] = make_ushort2((unsigned short)avg_x, (unsigned short)avg_y);
}
|
177 |
+ |
|
178 |
// Resize a four-channel 16-bit plane sampled through ushort4_tex.
// One thread produces one destination element: per channel it averages, with
// rounding, four texture fetches placed symmetrically around the source
// position of the element centre. dst_pitch is the destination row stride in
// ushort4 elements.
__global__ void Subsample_Bilinear_ushort4(ushort4 *dst,
                                           int dst_width, int dst_height, int dst_pitch,
                                           int src_width, int src_height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the destination plane have nothing to do.
    if (row >= dst_height || col >= dst_width)
        return;

    const float ratio_x = (float)src_width / (float)dst_width;
    const float ratio_y = (float)src_height / (float)dst_height;
    const float src_x = (col + 0.5f) * ratio_x;
    const float src_y = (row + 0.5f) * ratio_y;
    // Outer weights of the 3-tap filters {w,1.0,w}, clamped to [0,1].
    const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
    const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);
    // Each 3-tap filter is split into two bilinear fetches:
    // {w,1.0,w} -> {w,0.5,0} + {0,0.5,w}.
    const float off_x = tap_x / (0.5f + tap_x);
    const float off_y = tap_y / (0.5f + tap_y);

    const ushort4 s00 = tex2D(ushort4_tex, src_x - off_x, src_y - off_y);
    const ushort4 s10 = tex2D(ushort4_tex, src_x + off_x, src_y - off_y);
    const ushort4 s01 = tex2D(ushort4_tex, src_x - off_x, src_y + off_y);
    const ushort4 s11 = tex2D(ushort4_tex, src_x + off_x, src_y + off_y);

    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;
    const int avg_z = ((int)s00.z + (int)s10.z + (int)s01.z + (int)s11.z + 2) >> 2;
    const int avg_w = ((int)s00.w + (int)s10.w + (int)s01.w + (int)s11.w + 2) >> 2;

    dst[row * dst_pitch + col] = make_ushort4(
        (unsigned short)avg_x, (unsigned short)avg_y, (unsigned short)avg_z, (unsigned short)avg_w);
}
|
210 |
+ |
|
211 |
+} |