i7-4770K luma 21% faster, chroma 18% faster A10-7850K luma 42% faster, chroma 37% faster on 1920x1080 res
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -41,6 +41,10 @@ typedef struct { |
41 | 41 |
cl_kernel kernel_chroma; |
42 | 42 |
cl_mem cl_luma_mask; |
43 | 43 |
cl_mem cl_chroma_mask; |
44 |
+ cl_mem cl_luma_mask_x; |
|
45 |
+ cl_mem cl_chroma_mask_x; |
|
46 |
+ cl_mem cl_luma_mask_y; |
|
47 |
+ cl_mem cl_chroma_mask_y; |
|
44 | 48 |
int in_plane_size[8]; |
45 | 49 |
int out_plane_size[8]; |
46 | 50 |
int plane_num; |
... | ... |
@@ -87,11 +87,12 @@ end: |
87 | 87 |
return ret; |
88 | 88 |
} |
89 | 89 |
|
90 |
-static int compute_mask_matrix(cl_mem cl_mask_matrix, int step_x, int step_y) |
|
90 |
+static int copy_separable_masks(cl_mem cl_mask_x, cl_mem cl_mask_y, int step_x, int step_y) |
|
91 | 91 |
{ |
92 |
- int i, j, ret = 0; |
|
93 |
- uint32_t *mask_matrix, *mask_x, *mask_y; |
|
94 |
- size_t size_matrix = sizeof(uint32_t) * (2 * step_x + 1) * (2 * step_y + 1); |
|
92 |
+ int ret = 0; |
|
93 |
+ uint32_t *mask_x, *mask_y; |
|
94 |
+ size_t size_mask_x = sizeof(uint32_t) * (2 * step_x + 1); |
|
95 |
+ size_t size_mask_y = sizeof(uint32_t) * (2 * step_y + 1); |
|
95 | 96 |
mask_x = av_mallocz_array(2 * step_x + 1, sizeof(uint32_t)); |
96 | 97 |
if (!mask_x) { |
97 | 98 |
ret = AVERROR(ENOMEM); |
... | ... |
@@ -102,37 +103,36 @@ static int compute_mask_matrix(cl_mem cl_mask_matrix, int step_x, int step_y) |
102 | 102 |
ret = AVERROR(ENOMEM); |
103 | 103 |
goto end; |
104 | 104 |
} |
105 |
- mask_matrix = av_mallocz(size_matrix); |
|
106 |
- if (!mask_matrix) { |
|
107 |
- ret = AVERROR(ENOMEM); |
|
108 |
- goto end; |
|
109 |
- } |
|
105 |
+ |
|
110 | 106 |
ret = compute_mask(step_x, mask_x); |
111 | 107 |
if (ret < 0) |
112 | 108 |
goto end; |
113 | 109 |
ret = compute_mask(step_y, mask_y); |
114 | 110 |
if (ret < 0) |
115 | 111 |
goto end; |
116 |
- for (j = 0; j < 2 * step_y + 1; j++) { |
|
117 |
- for (i = 0; i < 2 * step_x + 1; i++) { |
|
118 |
- mask_matrix[i + j * (2 * step_x + 1)] = mask_y[j] * mask_x[i]; |
|
119 |
- } |
|
120 |
- } |
|
121 |
- ret = av_opencl_buffer_write(cl_mask_matrix, (uint8_t *)mask_matrix, size_matrix); |
|
112 |
+ |
|
113 |
+ ret = av_opencl_buffer_write(cl_mask_x, (uint8_t *)mask_x, size_mask_x); |
|
114 |
+ ret = av_opencl_buffer_write(cl_mask_y, (uint8_t *)mask_y, size_mask_y); |
|
122 | 115 |
end: |
123 | 116 |
av_freep(&mask_x); |
124 | 117 |
av_freep(&mask_y); |
125 |
- av_freep(&mask_matrix); |
|
118 |
+ |
|
126 | 119 |
return ret; |
127 | 120 |
} |
128 | 121 |
|
129 | 122 |
static int generate_mask(AVFilterContext *ctx) |
130 | 123 |
{ |
131 |
- UnsharpContext *unsharp = ctx->priv; |
|
132 |
- int i, ret = 0, step_x[2], step_y[2]; |
|
124 |
+ cl_mem masks[4]; |
|
133 | 125 |
cl_mem mask_matrix[2]; |
126 |
+ int i, ret = 0, step_x[2], step_y[2]; |
|
127 |
+ |
|
128 |
+ UnsharpContext *unsharp = ctx->priv; |
|
134 | 129 |
mask_matrix[0] = unsharp->opencl_ctx.cl_luma_mask; |
135 | 130 |
mask_matrix[1] = unsharp->opencl_ctx.cl_chroma_mask; |
131 |
+ masks[0] = unsharp->opencl_ctx.cl_luma_mask_x; |
|
132 |
+ masks[1] = unsharp->opencl_ctx.cl_luma_mask_y; |
|
133 |
+ masks[2] = unsharp->opencl_ctx.cl_chroma_mask_x; |
|
134 |
+ masks[3] = unsharp->opencl_ctx.cl_chroma_mask_y; |
|
136 | 135 |
step_x[0] = unsharp->luma.steps_x; |
137 | 136 |
step_x[1] = unsharp->chroma.steps_x; |
138 | 137 |
step_y[0] = unsharp->luma.steps_y; |
... | ... |
@@ -144,12 +144,16 @@ static int generate_mask(AVFilterContext *ctx) |
144 | 144 |
else |
145 | 145 |
unsharp->opencl_ctx.use_fast_kernels = 1; |
146 | 146 |
|
147 |
+ if (!masks[0] || !masks[1] || !masks[2] || !masks[3]) { |
|
148 |
+ av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not be NULL\n"); |
|
149 |
+ return AVERROR(EINVAL); |
|
150 |
+ } |
|
147 | 151 |
if (!mask_matrix[0] || !mask_matrix[1]) { |
148 | 152 |
av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not be NULL\n"); |
149 | 153 |
return AVERROR(EINVAL); |
150 | 154 |
} |
151 | 155 |
for (i = 0; i < 2; i++) { |
152 |
- ret = compute_mask_matrix(mask_matrix[i], step_x[i], step_y[i]); |
|
156 |
+ ret = copy_separable_masks(masks[2*i], masks[2*i+1], step_x[i], step_y[i]); |
|
153 | 157 |
if (ret < 0) |
154 | 158 |
return ret; |
155 | 159 |
} |
... | ... |
@@ -184,7 +188,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, AVFrame *in, AVFrame *out) |
184 | 184 |
ret = avpriv_opencl_set_parameter(&kernel1, |
185 | 185 |
FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), |
186 | 186 |
FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), |
187 |
- FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask), |
|
187 |
+ FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_x), |
|
188 |
+ FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_y), |
|
188 | 189 |
FF_OPENCL_PARAM_INFO(unsharp->luma.amount), |
189 | 190 |
FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits), |
190 | 191 |
FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale), |
... | ... |
@@ -201,7 +206,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, AVFrame *in, AVFrame *out) |
201 | 201 |
ret = avpriv_opencl_set_parameter(&kernel2, |
202 | 202 |
FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), |
203 | 203 |
FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), |
204 |
- FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask), |
|
204 |
+ FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_x), |
|
205 |
+ FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_y), |
|
205 | 206 |
FF_OPENCL_PARAM_INFO(unsharp->chroma.amount), |
206 | 207 |
FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits), |
207 | 208 |
FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale), |
... | ... |
@@ -264,7 +270,9 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, AVFrame *in, AVFrame *out) |
264 | 264 |
return AVERROR_EXTERNAL; |
265 | 265 |
} |
266 | 266 |
} |
267 |
- clFinish(unsharp->opencl_ctx.command_queue); |
|
267 |
+ //blocking map is suffficient, no need for clFinish |
|
268 |
+ //clFinish(unsharp->opencl_ctx.command_queue); |
|
269 |
+ |
|
268 | 270 |
return av_opencl_buffer_read_image(out->data, unsharp->opencl_ctx.out_plane_size, |
269 | 271 |
unsharp->opencl_ctx.plane_num, unsharp->opencl_ctx.cl_outbuf, |
270 | 272 |
unsharp->opencl_ctx.cl_outbuf_size); |
... | ... |
@@ -286,6 +294,27 @@ int ff_opencl_unsharp_init(AVFilterContext *ctx) |
286 | 286 |
ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask, |
287 | 287 |
sizeof(uint32_t) * (2 * unsharp->chroma.steps_x + 1) * (2 * unsharp->chroma.steps_y + 1), |
288 | 288 |
CL_MEM_READ_ONLY, NULL); |
289 |
+ // separable filters |
|
290 |
+ if (ret < 0) |
|
291 |
+ return ret; |
|
292 |
+ ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_x, |
|
293 |
+ sizeof(uint32_t) * (2 * unsharp->luma.steps_x + 1), |
|
294 |
+ CL_MEM_READ_ONLY, NULL); |
|
295 |
+ if (ret < 0) |
|
296 |
+ return ret; |
|
297 |
+ ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_y, |
|
298 |
+ sizeof(uint32_t) * (2 * unsharp->luma.steps_y + 1), |
|
299 |
+ CL_MEM_READ_ONLY, NULL); |
|
300 |
+ if (ret < 0) |
|
301 |
+ return ret; |
|
302 |
+ ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_x, |
|
303 |
+ sizeof(uint32_t) * (2 * unsharp->chroma.steps_x + 1), |
|
304 |
+ CL_MEM_READ_ONLY, NULL); |
|
305 |
+ if (ret < 0) |
|
306 |
+ return ret; |
|
307 |
+ ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_y, |
|
308 |
+ sizeof(uint32_t) * (2 * unsharp->chroma.steps_y + 1), |
|
309 |
+ CL_MEM_READ_ONLY, NULL); |
|
289 | 310 |
if (ret < 0) |
290 | 311 |
return ret; |
291 | 312 |
ret = generate_mask(ctx); |
... | ... |
@@ -339,6 +368,10 @@ void ff_opencl_unsharp_uninit(AVFilterContext *ctx) |
339 | 339 |
av_opencl_buffer_release(&unsharp->opencl_ctx.cl_outbuf); |
340 | 340 |
av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask); |
341 | 341 |
av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask); |
342 |
+ av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_x); |
|
343 |
+ av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_x); |
|
344 |
+ av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_y); |
|
345 |
+ av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_y); |
|
342 | 346 |
clReleaseKernel(unsharp->opencl_ctx.kernel_default); |
343 | 347 |
clReleaseKernel(unsharp->opencl_ctx.kernel_luma); |
344 | 348 |
clReleaseKernel(unsharp->opencl_ctx.kernel_chroma); |
... | ... |
@@ -36,7 +36,8 @@ inline unsigned char clip_uint8(int a) |
36 | 36 |
kernel void unsharp_luma( |
37 | 37 |
global unsigned char *src, |
38 | 38 |
global unsigned char *dst, |
39 |
- global int *mask, |
|
39 |
+ global int *mask_x, |
|
40 |
+ global int *mask_y, |
|
40 | 41 |
int amount, |
41 | 42 |
int scalebits, |
42 | 43 |
int halfscale, |
... | ... |
@@ -59,10 +60,12 @@ kernel void unsharp_luma( |
59 | 59 |
return; |
60 | 60 |
} |
61 | 61 |
|
62 |
- local uchar l[32][32]; |
|
63 |
- local int lc[LU_RADIUS_X*LU_RADIUS_Y]; |
|
62 |
+ local unsigned int l[32][32]; |
|
63 |
+ local unsigned int lcx[LU_RADIUS_X]; |
|
64 |
+ local unsigned int lcy[LU_RADIUS_Y]; |
|
64 | 65 |
int indexIx, indexIy, i, j; |
65 | 66 |
|
67 |
+ //load up tile: actual workspace + halo of 8 points in x and y \n |
|
66 | 68 |
for(i = 0; i <= 1; i++) { |
67 | 69 |
indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; |
68 | 70 |
indexIy = indexIy < 0 ? 0 : indexIy; |
... | ... |
@@ -76,27 +79,54 @@ kernel void unsharp_luma( |
76 | 76 |
} |
77 | 77 |
|
78 | 78 |
int indexL = threadIdx.y*16 + threadIdx.x; |
79 |
- if (indexL < LU_RADIUS_X*LU_RADIUS_Y) |
|
80 |
- lc[indexL] = mask[indexL]; |
|
79 |
+ if (indexL < LU_RADIUS_X) |
|
80 |
+ lcx[indexL] = mask_x[indexL]; |
|
81 |
+ if (indexL < LU_RADIUS_Y) |
|
82 |
+ lcy[indexL] = mask_y[indexL]; |
|
81 | 83 |
barrier(CLK_LOCAL_MEM_FENCE); |
82 | 84 |
|
85 |
+ //needed for unsharp mask application in the end \n |
|
86 |
+ int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; |
|
87 |
+ |
|
83 | 88 |
int idx, idy, maskIndex; |
84 |
- int sum = 0; |
|
85 |
- int steps_x = LU_RADIUS_X/2; |
|
86 |
- int steps_y = LU_RADIUS_Y/2; |
|
89 |
+ int temp[2] = {0}; |
|
90 |
+ int steps_x = (LU_RADIUS_X-1)/2; |
|
91 |
+ int steps_y = (LU_RADIUS_Y-1)/2; |
|
87 | 92 |
|
88 |
- \n#pragma unroll\n |
|
89 |
- for (i = -steps_y; i <= steps_y; i++) { |
|
90 |
- idy = 8 + i + threadIdx.y; |
|
91 |
- \n#pragma unroll\n |
|
92 |
- for (j = -steps_x; j <= steps_x; j++) { |
|
93 |
- idx = 8 + j + threadIdx.x; |
|
94 |
- maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x; |
|
95 |
- sum += (int)l[idy][idx] * lc[maskIndex]; |
|
93 |
+ // compute the actual workspace + left&right halos \n |
|
94 |
+ \n#pragma unroll\n |
|
95 |
+ for (j = 0; j <=1; j++) { |
|
96 |
+ //extra work to cover left and right halos \n |
|
97 |
+ idx = 16*j + threadIdx.x; |
|
98 |
+ \n#pragma unroll\n |
|
99 |
+ for (i = -steps_y; i <= steps_y; i++) { |
|
100 |
+ idy = 8 + i + threadIdx.y; |
|
101 |
+ maskIndex = (i + steps_y); |
|
102 |
+ temp[j] += (int)l[idy][idx] * lcy[maskIndex]; |
|
96 | 103 |
} |
97 | 104 |
} |
98 |
- int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; |
|
99 |
- int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); |
|
105 |
+ barrier(CLK_LOCAL_MEM_FENCE); |
|
106 |
+ //save results from the vertical filter in local memory \n |
|
107 |
+ idy = 8 + threadIdx.y; |
|
108 |
+ \n#pragma unroll\n |
|
109 |
+ for (j = 0; j <=1; j++) { |
|
110 |
+ idx = 16*j + threadIdx.x; |
|
111 |
+ l[idy][idx] = temp[j]; |
|
112 |
+ } |
|
113 |
+ barrier(CLK_LOCAL_MEM_FENCE); |
|
114 |
+ |
|
115 |
+ //compute results with the horizontal filter \n |
|
116 |
+ int sum = 0; |
|
117 |
+ idy = 8 + threadIdx.y; |
|
118 |
+ \n#pragma unroll\n |
|
119 |
+ for (j = -steps_x; j <= steps_x; j++) { |
|
120 |
+ idx = 8 + j + threadIdx.x; |
|
121 |
+ maskIndex = j + steps_x; |
|
122 |
+ sum += (int)l[idy][idx] * lcx[maskIndex]; |
|
123 |
+ } |
|
124 |
+ |
|
125 |
+ int res = orig_value + (((orig_value - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); |
|
126 |
+ |
|
100 | 127 |
if (globalIdx.x < width && globalIdx.y < height) |
101 | 128 |
dst[globalIdx.x + globalIdx.y*dst_stride] = clip_uint8(res); |
102 | 129 |
} |
... | ... |
@@ -104,7 +134,8 @@ kernel void unsharp_luma( |
104 | 104 |
kernel void unsharp_chroma( |
105 | 105 |
global unsigned char *src_y, |
106 | 106 |
global unsigned char *dst_y, |
107 |
- global int *mask, |
|
107 |
+ global int *mask_x, |
|
108 |
+ global int *mask_y, |
|
108 | 109 |
int amount, |
109 | 110 |
int scalebits, |
110 | 111 |
int halfscale, |
... | ... |
@@ -141,8 +172,9 @@ kernel void unsharp_chroma( |
141 | 141 |
return; |
142 | 142 |
} |
143 | 143 |
|
144 |
- local uchar l[32][32]; |
|
145 |
- local int lc[CH_RADIUS_X*CH_RADIUS_Y]; |
|
144 |
+ local unsigned int l[32][32]; |
|
145 |
+ local unsigned int lcx[CH_RADIUS_X]; |
|
146 |
+ local unsigned int lcy[CH_RADIUS_Y]; |
|
146 | 147 |
int indexIx, indexIy, i, j; |
147 | 148 |
for(i = 0; i <= 1; i++) { |
148 | 149 |
indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; |
... | ... |
@@ -157,27 +189,51 @@ kernel void unsharp_chroma( |
157 | 157 |
} |
158 | 158 |
|
159 | 159 |
int indexL = threadIdx.y*16 + threadIdx.x; |
160 |
- if (indexL < CH_RADIUS_X*CH_RADIUS_Y) |
|
161 |
- lc[indexL] = mask[indexL]; |
|
160 |
+ if (indexL < CH_RADIUS_X) |
|
161 |
+ lcx[indexL] = mask_x[indexL]; |
|
162 |
+ if (indexL < CH_RADIUS_Y) |
|
163 |
+ lcy[indexL] = mask_y[indexL]; |
|
162 | 164 |
barrier(CLK_LOCAL_MEM_FENCE); |
163 | 165 |
|
166 |
+ int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; |
|
167 |
+ |
|
164 | 168 |
int idx, idy, maskIndex; |
165 |
- int sum = 0; |
|
166 | 169 |
int steps_x = CH_RADIUS_X/2; |
167 | 170 |
int steps_y = CH_RADIUS_Y/2; |
171 |
+ int temp[2] = {0,0}; |
|
168 | 172 |
|
169 | 173 |
\n#pragma unroll\n |
170 |
- for (i = -steps_y; i <= steps_y; i++) { |
|
171 |
- idy = 8 + i + threadIdx.y; |
|
174 |
+ for (j = 0; j <= 1; j++) { |
|
175 |
+ idx = 16*j + threadIdx.x; |
|
172 | 176 |
\n#pragma unroll\n |
173 |
- for (j = -steps_x; j <= steps_x; j++) { |
|
174 |
- idx = 8 + j + threadIdx.x; |
|
175 |
- maskIndex = (i + steps_y)*CH_RADIUS_X + j + steps_x; |
|
176 |
- sum += (int)l[idy][idx] * lc[maskIndex]; |
|
177 |
- } |
|
177 |
+ for (i = -steps_y; i <= steps_y; i++) { |
|
178 |
+ idy = 8 + i + threadIdx.y; |
|
179 |
+ maskIndex = i + steps_y; |
|
180 |
+ temp[j] += (int)l[idy][idx] * lcy[maskIndex]; |
|
181 |
+ } |
|
182 |
+ } |
|
183 |
+ |
|
184 |
+ barrier(CLK_LOCAL_MEM_FENCE); |
|
185 |
+ idy = 8 + threadIdx.y; |
|
186 |
+ \n#pragma unroll\n |
|
187 |
+ for (j = 0; j <= 1; j++) { |
|
188 |
+ idx = 16*j + threadIdx.x; |
|
189 |
+ l[idy][idx] = temp[j]; |
|
178 | 190 |
} |
179 |
- int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; |
|
180 |
- int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); |
|
191 |
+ barrier(CLK_LOCAL_MEM_FENCE); |
|
192 |
+ |
|
193 |
+ //compute results with the horizontal filter \n |
|
194 |
+ int sum = 0; |
|
195 |
+ idy = 8 + threadIdx.y; |
|
196 |
+ \n#pragma unroll\n |
|
197 |
+ for (j = -steps_x; j <= steps_x; j++) { |
|
198 |
+ idx = 8 + j + threadIdx.x; |
|
199 |
+ maskIndex = j + steps_x; |
|
200 |
+ sum += (int)l[idy][idx] * lcx[maskIndex]; |
|
201 |
+ } |
|
202 |
+ |
|
203 |
+ int res = orig_value + (((orig_value - (int)((sum + halfscale) >> scalebits)) * amount) >> 16); |
|
204 |
+ |
|
181 | 205 |
if (globalIdx.x < cw && globalIdx.y < ch) |
182 | 206 |
dst[globalIdx.x + globalIdx.y*dst_stride_ch] = clip_uint8(res); |
183 | 207 |
} |