FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
unsharp_opencl_kernel.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2013 Wei Gao <weigao@multicorewareinc.com>
3  * Copyright (C) 2013 Lenny Wang
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #ifndef AVFILTER_UNSHARP_OPENCL_KERNEL_H
23 #define AVFILTER_UNSHARP_OPENCL_KERNEL_H
24 
25 #include "libavutil/opencl.h"
26 
28 inline unsigned char clip_uint8(int a)
29 {
30  if (a & (~0xFF))
31  return (-a)>>31;
32  else
33  return a;
34 }
35 
36 kernel void unsharp_luma(
37  global unsigned char *src,
38  global unsigned char *dst,
39  global int *mask_x,
40  global int *mask_y,
41  int amount,
42  int scalebits,
43  int halfscale,
44  int src_stride,
45  int dst_stride,
46  int width,
47  int height)
48 {
49  int2 threadIdx, blockIdx, globalIdx;
50  threadIdx.x = get_local_id(0);
51  threadIdx.y = get_local_id(1);
52  blockIdx.x = get_group_id(0);
53  blockIdx.y = get_group_id(1);
54  globalIdx.x = get_global_id(0);
55  globalIdx.y = get_global_id(1);
56 
57  if (!amount) {
58  if (globalIdx.x < width && globalIdx.y < height)
59  dst[globalIdx.x + globalIdx.y*dst_stride] = src[globalIdx.x + globalIdx.y*src_stride];
60  return;
61  }
62 
63  local unsigned int l[32][32];
64  local unsigned int lcx[LU_RADIUS_X];
65  local unsigned int lcy[LU_RADIUS_Y];
66  int indexIx, indexIy, i, j;
67 
68  //load up tile: actual workspace + halo of 8 points in x and y \n
69  for(i = 0; i <= 1; i++) {
70  indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y;
71  indexIy = indexIy < 0 ? 0 : indexIy;
72  indexIy = indexIy >= height ? height - 1: indexIy;
73  for(j = 0; j <= 1; j++) {
74  indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x;
75  indexIx = indexIx < 0 ? 0 : indexIx;
76  indexIx = indexIx >= width ? width - 1: indexIx;
77  l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy*src_stride + indexIx];
78  }
79  }
80 
81  int indexL = threadIdx.y*16 + threadIdx.x;
82  if (indexL < LU_RADIUS_X)
83  lcx[indexL] = mask_x[indexL];
84  if (indexL < LU_RADIUS_Y)
85  lcy[indexL] = mask_y[indexL];
86  barrier(CLK_LOCAL_MEM_FENCE);
87 
88  //needed for unsharp mask application in the end \n
89  int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8];
90 
91  int idx, idy, maskIndex;
92  int temp[2] = {0};
93  int steps_x = (LU_RADIUS_X-1)/2;
94  int steps_y = (LU_RADIUS_Y-1)/2;
95 
96  // compute the actual workspace + left&right halos \n
97  \n#pragma unroll\n
98  for (j = 0; j <=1; j++) {
99  //extra work to cover left and right halos \n
100  idx = 16*j + threadIdx.x;
101  \n#pragma unroll\n
102  for (i = -steps_y; i <= steps_y; i++) {
103  idy = 8 + i + threadIdx.y;
104  maskIndex = (i + steps_y);
105  temp[j] += (int)l[idy][idx] * lcy[maskIndex];
106  }
107  }
108  barrier(CLK_LOCAL_MEM_FENCE);
109  //save results from the vertical filter in local memory \n
110  idy = 8 + threadIdx.y;
111  \n#pragma unroll\n
112  for (j = 0; j <=1; j++) {
113  idx = 16*j + threadIdx.x;
114  l[idy][idx] = temp[j];
115  }
116  barrier(CLK_LOCAL_MEM_FENCE);
117 
118  //compute results with the horizontal filter \n
119  int sum = 0;
120  idy = 8 + threadIdx.y;
121  \n#pragma unroll\n
122  for (j = -steps_x; j <= steps_x; j++) {
123  idx = 8 + j + threadIdx.x;
124  maskIndex = j + steps_x;
125  sum += (int)l[idy][idx] * lcx[maskIndex];
126  }
127 
128  int res = orig_value + (((orig_value - (int)((sum + halfscale) >> scalebits)) * amount) >> 16);
129 
130  if (globalIdx.x < width && globalIdx.y < height)
131  dst[globalIdx.x + globalIdx.y*dst_stride] = clip_uint8(res);
132 }
133 
134 kernel void unsharp_chroma(
135  global unsigned char *src_y,
136  global unsigned char *dst_y,
137  global int *mask_x,
138  global int *mask_y,
139  int amount,
140  int scalebits,
141  int halfscale,
142  int src_stride_lu,
143  int src_stride_ch,
144  int dst_stride_lu,
145  int dst_stride_ch,
146  int width,
147  int height,
148  int cw,
149  int ch)
150 {
151  global unsigned char *dst_u = dst_y + height * dst_stride_lu;
152  global unsigned char *dst_v = dst_u + ch * dst_stride_ch;
153  global unsigned char *src_u = src_y + height * src_stride_lu;
154  global unsigned char *src_v = src_u + ch * src_stride_ch;
155  int2 threadIdx, blockIdx, globalIdx;
156  threadIdx.x = get_local_id(0);
157  threadIdx.y = get_local_id(1);
158  blockIdx.x = get_group_id(0);
159  blockIdx.y = get_group_id(1);
160  globalIdx.x = get_global_id(0);
161  globalIdx.y = get_global_id(1);
162  int padch = get_global_size(1)/2;
163  global unsigned char *src = globalIdx.y>=padch ? src_v : src_u;
164  global unsigned char *dst = globalIdx.y>=padch ? dst_v : dst_u;
165 
166  blockIdx.y = globalIdx.y>=padch ? blockIdx.y - get_num_groups(1)/2 : blockIdx.y;
167  globalIdx.y = globalIdx.y>=padch ? globalIdx.y - padch : globalIdx.y;
168 
169  if (!amount) {
170  if (globalIdx.x < cw && globalIdx.y < ch)
171  dst[globalIdx.x + globalIdx.y*dst_stride_ch] = src[globalIdx.x + globalIdx.y*src_stride_ch];
172  return;
173  }
174 
175  local unsigned int l[32][32];
176  local unsigned int lcx[CH_RADIUS_X];
177  local unsigned int lcy[CH_RADIUS_Y];
178  int indexIx, indexIy, i, j;
179  for(i = 0; i <= 1; i++) {
180  indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y;
181  indexIy = indexIy < 0 ? 0 : indexIy;
182  indexIy = indexIy >= ch ? ch - 1: indexIy;
183  for(j = 0; j <= 1; j++) {
184  indexIx = -8 + (blockIdx.x + j) * 16 + threadIdx.x;
185  indexIx = indexIx < 0 ? 0 : indexIx;
186  indexIx = indexIx >= cw ? cw - 1: indexIx;
187  l[i*16 + threadIdx.y][j*16 + threadIdx.x] = src[indexIy * src_stride_ch + indexIx];
188  }
189  }
190 
191  int indexL = threadIdx.y*16 + threadIdx.x;
192  if (indexL < CH_RADIUS_X)
193  lcx[indexL] = mask_x[indexL];
194  if (indexL < CH_RADIUS_Y)
195  lcy[indexL] = mask_y[indexL];
196  barrier(CLK_LOCAL_MEM_FENCE);
197 
198  int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8];
199 
200  int idx, idy, maskIndex;
201  int steps_x = CH_RADIUS_X/2;
202  int steps_y = CH_RADIUS_Y/2;
203  int temp[2] = {0,0};
204 
205  \n#pragma unroll\n
206  for (j = 0; j <= 1; j++) {
207  idx = 16*j + threadIdx.x;
208  \n#pragma unroll\n
209  for (i = -steps_y; i <= steps_y; i++) {
210  idy = 8 + i + threadIdx.y;
211  maskIndex = i + steps_y;
212  temp[j] += (int)l[idy][idx] * lcy[maskIndex];
213  }
214  }
215 
216  barrier(CLK_LOCAL_MEM_FENCE);
217  idy = 8 + threadIdx.y;
218  \n#pragma unroll\n
219  for (j = 0; j <= 1; j++) {
220  idx = 16*j + threadIdx.x;
221  l[idy][idx] = temp[j];
222  }
223  barrier(CLK_LOCAL_MEM_FENCE);
224 
225  //compute results with the horizontal filter \n
226  int sum = 0;
227  idy = 8 + threadIdx.y;
228  \n#pragma unroll\n
229  for (j = -steps_x; j <= steps_x; j++) {
230  idx = 8 + j + threadIdx.x;
231  maskIndex = j + steps_x;
232  sum += (int)l[idy][idx] * lcx[maskIndex];
233  }
234 
235  int res = orig_value + (((orig_value - (int)((sum + halfscale) >> scalebits)) * amount) >> 16);
236 
237  if (globalIdx.x < cw && globalIdx.y < ch)
238  dst[globalIdx.x + globalIdx.y*dst_stride_ch] = clip_uint8(res);
239 }
240 
241 kernel void unsharp_default(global unsigned char *src,
242  global unsigned char *dst,
243  const global unsigned int *mask_lu,
244  const global unsigned int *mask_ch,
245  int amount_lu,
246  int amount_ch,
247  int step_x_lu,
248  int step_y_lu,
249  int step_x_ch,
250  int step_y_ch,
251  int scalebits_lu,
252  int scalebits_ch,
253  int halfscale_lu,
254  int halfscale_ch,
255  int src_stride_lu,
256  int src_stride_ch,
257  int dst_stride_lu,
258  int dst_stride_ch,
259  int height,
260  int width,
261  int ch,
262  int cw)
263 {
264  global unsigned char *dst_y = dst;
265  global unsigned char *dst_u = dst_y + height * dst_stride_lu;
266  global unsigned char *dst_v = dst_u + ch * dst_stride_ch;
267 
268  global unsigned char *src_y = src;
269  global unsigned char *src_u = src_y + height * src_stride_lu;
270  global unsigned char *src_v = src_u + ch * src_stride_ch;
271 
272  global unsigned char *temp_dst;
273  global unsigned char *temp_src;
274  const global unsigned int *temp_mask;
275  int global_id = get_global_id(0);
276  int i, j, x, y, temp_src_stride, temp_dst_stride, temp_height, temp_width, temp_steps_x, temp_steps_y,
277  temp_amount, temp_scalebits, temp_halfscale, sum, idx_x, idx_y, temp, res;
278  if (global_id < width * height) {
279  y = global_id / width;
280  x = global_id % width;
281  temp_dst = dst_y;
282  temp_src = src_y;
283  temp_src_stride = src_stride_lu;
284  temp_dst_stride = dst_stride_lu;
285  temp_height = height;
286  temp_width = width;
287  temp_steps_x = step_x_lu;
288  temp_steps_y = step_y_lu;
289  temp_mask = mask_lu;
290  temp_amount = amount_lu;
291  temp_scalebits = scalebits_lu;
292  temp_halfscale = halfscale_lu;
293  } else if ((global_id >= width * height) && (global_id < width * height + ch * cw)) {
294  y = (global_id - width * height) / cw;
295  x = (global_id - width * height) % cw;
296  temp_dst = dst_u;
297  temp_src = src_u;
298  temp_src_stride = src_stride_ch;
299  temp_dst_stride = dst_stride_ch;
300  temp_height = ch;
301  temp_width = cw;
302  temp_steps_x = step_x_ch;
303  temp_steps_y = step_y_ch;
304  temp_mask = mask_ch;
305  temp_amount = amount_ch;
306  temp_scalebits = scalebits_ch;
307  temp_halfscale = halfscale_ch;
308  } else {
309  y = (global_id - width * height - ch * cw) / cw;
310  x = (global_id - width * height - ch * cw) % cw;
311  temp_dst = dst_v;
312  temp_src = src_v;
313  temp_src_stride = src_stride_ch;
314  temp_dst_stride = dst_stride_ch;
315  temp_height = ch;
316  temp_width = cw;
317  temp_steps_x = step_x_ch;
318  temp_steps_y = step_y_ch;
319  temp_mask = mask_ch;
320  temp_amount = amount_ch;
321  temp_scalebits = scalebits_ch;
322  temp_halfscale = halfscale_ch;
323  }
324  if (temp_amount) {
325  sum = 0;
326  for (j = 0; j <= 2 * temp_steps_y; j++) {
327  idx_y = (y - temp_steps_y + j) <= 0 ? 0 : (y - temp_steps_y + j) >= temp_height ? temp_height-1 : y - temp_steps_y + j;
328  for (i = 0; i <= 2 * temp_steps_x; i++) {
329  idx_x = (x - temp_steps_x + i) <= 0 ? 0 : (x - temp_steps_x + i) >= temp_width ? temp_width-1 : x - temp_steps_x + i;
330  sum += temp_mask[i + j * (2 * temp_steps_x + 1)] * temp_src[idx_x + idx_y * temp_src_stride];
331  }
332  }
333  temp = (int)temp_src[x + y * temp_src_stride];
334  res = temp + (((temp - (int)((sum + temp_halfscale) >> temp_scalebits)) * temp_amount) >> 16);
335  temp_dst[x + y * temp_dst_stride] = clip_uint8(res);
336  } else {
337  temp_dst[x + y * temp_dst_stride] = temp_src[x + y * temp_src_stride];
338  }
339 }
340 );
341 
342 #endif /* AVFILTER_UNSHARP_OPENCL_KERNEL_H */