FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
vf_fspp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  */
22 
23 /**
24  * @file
25  * Fast Simple Post-processing filter
26  * This implementation is based on an algorithm described in
27  * "Aria Nosratinia Embedded Post-Processing for
28  * Enhancement of Compressed Images (1999)"
29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
31  * them can be performed once per block, not per pixel. This allows for much
32  * higher speed.
33  *
34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
35  * project, and ported by Arwa Arif for FFmpeg.
36  */
37 
38 #include "libavutil/avassert.h"
39 #include "libavutil/imgutils.h"
40 #include "libavutil/opt.h"
41 #include "libavutil/pixdesc.h"
42 #include "internal.h"
43 #include "vf_fspp.h"
44 
45 #define OFFSET(x) offsetof(FSPPContext, x)
46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
47 static const AVOption fspp_options[] = {
48  { "quality", "set quality", OFFSET(log2_count), AV_OPT_TYPE_INT, {.i64 = 4}, 4, MAX_LEVEL, FLAGS },
49  { "qp", "force a constant quantizer parameter", OFFSET(qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 64, FLAGS },
50  { "strength", "set filter strength", OFFSET(strength), AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32, FLAGS },
51  { "use_bframe_qp", "use B-frames' QP", OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, FLAGS },
52  { NULL }
53 };
54 
56 
57 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
58  { 0, 48, 12, 60, 3, 51, 15, 63, },
59  { 32, 16, 44, 28, 35, 19, 47, 31, },
60  { 8, 56, 4, 52, 11, 59, 7, 55, },
61  { 40, 24, 36, 20, 43, 27, 39, 23, },
62  { 2, 50, 14, 62, 1, 49, 13, 61, },
63  { 34, 18, 46, 30, 33, 17, 45, 29, },
64  { 10, 58, 6, 54, 9, 57, 5, 53, },
65  { 42, 26, 38, 22, 41, 25, 37, 21, },
66 };
67 
/*
 * Base threshold matrix (8 rows of 8 entries).
 * The entries must not grow too large (296 is the current maximum): larger
 * values make the result depend too strongly on the quantizer, or may
 * overflow (to be checked), which shows up as flashing.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
81 
82 //This func reads from 1 slice, 1 and clears 0 & 1
83 static void store_slice_c(uint8_t *dst, int16_t *src,
84  ptrdiff_t dst_stride, ptrdiff_t src_stride,
85  ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
86 {
87  int y, x;
88 #define STORE(pos) \
89  temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
90  src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
91  if (temp & 0x100) temp = ~(temp >> 31); \
92  dst[x + pos] = temp;
93 
94  for (y = 0; y < height; y++) {
95  const uint8_t *d = dither[y];
96  for (x = 0; x < width; x += 8) {
97  int temp;
98  STORE(0);
99  STORE(1);
100  STORE(2);
101  STORE(3);
102  STORE(4);
103  STORE(5);
104  STORE(6);
105  STORE(7);
106  }
107  src += src_stride;
108  dst += dst_stride;
109  }
110 }
111 
112 //This func reads from 2 slices, 0 & 2 and clears 2-nd
113 static void store_slice2_c(uint8_t *dst, int16_t *src,
114  ptrdiff_t dst_stride, ptrdiff_t src_stride,
115  ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
116 {
117  int y, x;
118 #define STORE2(pos) \
119  temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
120  src[x + pos + 16 * src_stride] = 0; \
121  if (temp & 0x100) temp = ~(temp >> 31); \
122  dst[x + pos] = temp;
123 
124  for (y = 0; y < height; y++) {
125  const uint8_t *d = dither[y];
126  for (x = 0; x < width; x += 8) {
127  int temp;
128  STORE2(0);
129  STORE2(1);
130  STORE2(2);
131  STORE2(3);
132  STORE2(4);
133  STORE2(5);
134  STORE2(6);
135  STORE2(7);
136  }
137  src += src_stride;
138  dst += dst_stride;
139  }
140 }
141 
/*
 * Scale the 64-entry base threshold matrix by a quantizer:
 * thr_adr[i] = q * thr_adr_noq[i] for i = 0..63, stored as int16_t.
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    const int16_t *in = thr_adr_noq;
    int16_t *out = thr_adr;
    int16_t *const out_end = thr_adr + 64;

    while (out < out_end)
        *out++ = q * *in++;
}
148 
/**
 * Filter one plane.
 *
 * The plane is copied into the padded work buffer p->src with mirrored
 * borders. For each processed line the row FDCT, the thresholding column
 * pass and the row IDCT are run through the callbacks in p, accumulating
 * into p->temp; completed bands are written to dst through
 * p->store_slice / p->store_slice2.
 *
 * @param p          filter context (work buffers, callbacks, thresholds)
 * @param dst        destination plane; the function returns early if NULL
 * @param src        source plane; the function returns early if NULL
 * @param dst_stride distance between destination lines
 * @param src_stride distance between source lines
 * @param width      plane width
 * @param height     plane height
 * @param qp_store   quantizer table, only read when p->qp is 0
 * @param qp_stride  distance between rows of qp_store
 * @param is_luma    non-zero for the luma plane; selects the work-buffer
 *                   stride and whether the chroma shifts are applied to the
 *                   quantizer-table coordinates
 */
149 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
150  int dst_stride, int src_stride,
151  int width, int height,
152  uint8_t *qp_store, int qp_stride, int is_luma)
153 {
154  int x, x0, y, es, qy, t;
155 
156  const int stride = is_luma ? p->temp_stride : (width + 16);
 // number of lines advanced per iteration of the main loop
157  const int step = 6 - p->log2_count;
 // shifts converting pixel coordinates to quantizer-table coordinates;
 // the chroma subsampling is subtracted for non-luma planes
158  const int qpsh = 4 - p->hsub * !is_luma;
159  const int qpsv = 4 - p->vsub * !is_luma;
160 
 // one aligned allocation split in two halves: block holds the forward-DCT
 // rows, block3 the output of the column pass
161  DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
162  int16_t *block = (int16_t *)block_align;
163  int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
164 
 // NOTE(review): the size is in bytes, so only the first 4 * 8 * BLOCKSZ
 // bytes of block3 are cleared, not the whole half
165  memset(block3, 0, 4 * 8 * BLOCKSZ);
166 
167  if (!src || !dst) return;
168 
 // copy the plane into p->src at an offset of 8 lines and 8 columns and
 // mirror 8 pixels on the left and right of every line
169  for (y = 0; y < height; y++) {
170  int index = 8 + 8 * stride + y * stride;
171  memcpy(p->src + index, src + y * src_stride, width);
172  for (x = 0; x < 8; x++) {
173  p->src[index - x - 1] = p->src[index + x ];
174  p->src[index + width + x ] = p->src[index + width - x - 1];
175  }
176  }
177 
 // mirror 8 lines above and below the copied plane
178  for (y = 0; y < 8; y++) {
179  memcpy(p->src + ( 7 - y ) * stride, p->src + ( y + 8 ) * stride, stride);
180  memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
181  }
182  //FIXME (try edge emu)
183 
 // clear lines 8..23 of the accumulation buffer
184  for (y = 8; y < 24; y++)
185  memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));
186 
187  for (y = step; y < height + 8; y += step) { //step= 1,2
188  const int y1 = y - 8 + step; //l5-7 l4-6;
 // line used to look up the quantizer, clamped to the plane
189  qy = y - 4;
190 
191  if (qy > height - 1) qy = height - 1;
192  if (qy < 0) qy = 0;
193 
194  qy = (qy >> qpsv) * qp_stride;
 // forward-transform the leading columns of this line into block
195  p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
196 
 // walk the line in chunks of 8 * (BLOCKSZ - 1) columns
197  for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
198  p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
199 
 // constant quantizer: the threshold matrix is already set up, so the
 // whole chunk is processed in a single call
200  if (p->qp)
201  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
202  else
 // per-block quantizer: look it up for every 8 columns and rescale the
 // threshold matrix only when the value changes
203  for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
204  t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
205 
206  if (t < 0) t = 0; //t always < width-2
207 
208  t = qp_store[qy + (t >> qpsh)];
209  t = ff_norm_qscale(t, p->qscale_type);
210 
211  if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
212  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
213  }
214  p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
 // carry the tail of both buffers over to the start for the next chunk;
 // note that the two moves copy different amounts (8 rows vs. 6 rows of 8)
215  memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
216  memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
217  }
218 
 // remaining columns of the line that did not fill a whole chunk
219  es = width + 8 - x0; // 8, ...
220  if (es > 8)
221  p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
222 
223  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
224  p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
225 
 // every time y1 reaches a non-zero multiple of 8, write 8 finished lines;
 // bit 3 of y1 selects which store routine / half of p->temp is used
226  if (!(y1 & 7) && y1) {
227  if (y1 & 8)
228  p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
229  dst_stride, stride, width, 8, 5 - p->log2_count);
230  else
231  p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
232  dst_stride, stride, width, 8, 5 - p->log2_count);
233  }
234  }
235 
 // write the last, partial band (y & 7 lines)
236  if (y & 7) { // height % 8 != 0
237  if (y & 8)
238  p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
239  dst_stride, stride, width, y&7, 5 - p->log2_count);
240  else
241  p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
242  dst_stride, stride, width, y&7, 5 - p->log2_count);
243  }
244 }
245 
/**
 * Column pass: forward DCT on columns of data, thresholding of the
 * coefficients, inverse DCT, and merge of the result into output.
 *
 * The outer loop consumes cnt in steps of 2. Each step processes 8
 * consecutive columns (elements DCTSIZE apart within a column) and then
 * skips 8 more elements, so data and output advance by 16 elements per step.
 * For every column, rows 0-5 of output are accumulated (+=) while rows 6-7
 * are overwritten (=).
 *
 * @param thr_adr threshold matrix; entry [k * 8 + c] is applied to
 *                coefficient k of the c-th column of a step
 * @param data    input from the row FDCT
 * @param output  workspace later consumed by the row IDCT
 * @param cnt     amount of work, decremented by 2 per step
 */
246 static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
247 {
248  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
249  int_simd16_t tmp10, tmp11, tmp12, tmp13;
250  int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
251  int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
252 
253  int16_t *dataptr;
254  int16_t *wsptr;
255  int16_t *threshold;
256  int ctr;
257 
258  dataptr = data;
259  wsptr = output;
260 
261  for (; cnt > 0; cnt -= 2) { //start positions
 // the threshold pointer restarts at the beginning of the matrix for
 // every group of 8 columns
262  threshold = (int16_t *)thr_adr;//threshold_mtx
263  for (ctr = DCTSIZE; ctr > 0; ctr--) {
264  // Process columns from input, add to output.
265  tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
266  tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
267 
268  tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
269  tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
270 
271  tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
272  tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
273 
274  tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
275  tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
276 
277  // Even part of FDCT
278 
279  tmp10 = tmp0 + tmp3;
280  tmp13 = tmp0 - tmp3;
281  tmp11 = tmp1 + tmp2;
282  tmp12 = tmp1 - tmp2;
283 
284  d0 = tmp10 + tmp11;
285  d4 = tmp10 - tmp11;
286 
287  z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
288  d2 = tmp13 + z1;
289  d6 = tmp13 - z1;
290 
291  // Even part of IDCT
292 
 // THRESHOLD is defined outside this file; presumably it copies the
 // coefficient when it exceeds the threshold and yields 0 otherwise
 // (TODO confirm against the header)
293  THRESHOLD(tmp0, d0, threshold[0 * 8]);
294  THRESHOLD(tmp1, d2, threshold[2 * 8]);
295  THRESHOLD(tmp2, d4, threshold[4 * 8]);
296  THRESHOLD(tmp3, d6, threshold[6 * 8]);
 // presumably a rounding bias for the >> 2 applied just below
297  tmp0 += 2;
298  tmp10 = (tmp0 + tmp2) >> 2;
299  tmp11 = (tmp0 - tmp2) >> 2;
300 
301  tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
302  tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
303 
304  tmp0 = tmp10 + tmp13; //->temps
305  tmp3 = tmp10 - tmp13; //->temps
306  tmp1 = tmp11 + tmp12; //->temps
307  tmp2 = tmp11 - tmp12; //->temps
308 
309  // Odd part of FDCT
310 
311  tmp10 = tmp4 + tmp5;
312  tmp11 = tmp5 + tmp6;
313  tmp12 = tmp6 + tmp7;
314 
315  z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
316  z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
317  z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
318  z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
319 
320  z11 = tmp7 + z3;
321  z13 = tmp7 - z3;
322 
323  d5 = z13 + z2;
324  d3 = z13 - z2;
325  d1 = z11 + z4;
326  d7 = z11 - z4;
327 
328  // Odd part of IDCT
329 
330  THRESHOLD(tmp4, d1, threshold[1 * 8]);
331  THRESHOLD(tmp5, d3, threshold[3 * 8]);
332  THRESHOLD(tmp6, d5, threshold[5 * 8]);
333  THRESHOLD(tmp7, d7, threshold[7 * 8]);
334 
335  //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
336  z13 = tmp6 + tmp5;
337  z10 = (tmp6 - tmp5) << 1;
338  z11 = tmp4 + tmp7;
339  z12 = (tmp4 - tmp7) << 1;
340 
341  tmp7 = (z11 + z13) >> 2; //+2 !
342  tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
343  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
344  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
345  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
346 
347  tmp6 = tmp12 - tmp7;
348  tmp5 = tmp11 - tmp6;
349  tmp4 = tmp10 + tmp5;
350 
 // rows 0-5 accumulate onto what earlier steps left in the workspace;
 // rows 6-7 are plain stores. Since the workspace pointer advances by
 // only 16 elements per step while a step touches 64, rows 6-7 of a step
 // are the part not reached by the previous step.
351  wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
352  wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
353  wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
354  wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
355  wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
356  wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
357  wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
358  wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
359  //
360  dataptr++; //next column
361  wsptr++;
362  threshold++;
363  }
364  dataptr += 8; //skip each second start pos
365  wsptr += 8;
366  }
367 }
368 
/**
 * Row pass of the inverse DCT.
 *
 * Reads 4 * cnt groups of DCTSIZE coefficients from workspace and, for each
 * group, adds 8 descaled results to a column of the output: element k goes
 * to output_adr[k * output_stride]. The output pointer moves one element to
 * the right per group. The coefficients are expected in the order written
 * by row_fdct_c() (pairs 2/3 and 0/1 for the even part, 4..7 for the odd
 * part).
 *
 * @param workspace     coefficients produced by the column pass
 * @param output_adr    accumulation buffer; results are added, not stored
 * @param output_stride distance, in elements, between output lines
 * @param cnt           a quarter of the number of groups to process
 */
369 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
370 {
371  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
372  int_simd16_t tmp10, tmp11, tmp12, tmp13;
373  int_simd16_t z5, z10, z11, z12, z13;
374  int16_t *outptr;
375  int16_t *wsptr;
376 
377  cnt *= 4;
378  wsptr = workspace;
379  outptr = output_adr;
380  for (; cnt > 0; cnt--) {
381  // Even part
382  //Simd version reads 4x4 block and transposes it
383  tmp10 = wsptr[2] + wsptr[3];
384  tmp11 = wsptr[2] - wsptr[3];
385 
386  tmp13 = wsptr[0] + wsptr[1];
387  tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
388 
389  tmp0 = tmp10 + tmp13; //->temps
390  tmp3 = tmp10 - tmp13; //->temps
391  tmp1 = tmp11 + tmp12;
392  tmp2 = tmp11 - tmp12;
393 
394  // Odd part
395  //Also transpose, with previous:
396  // ---- ---- ||||
397  // ---- ---- idct ||||
398  // ---- ---- ---> ||||
399  // ---- ---- ||||
400  z13 = wsptr[4] + wsptr[5];
401  z10 = wsptr[4] - wsptr[5];
402  z11 = wsptr[6] + wsptr[7];
403  z12 = wsptr[6] - wsptr[7];
404 
405  tmp7 = z11 + z13;
406  tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
407 
408  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
409  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
410  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
411 
412  tmp6 = (tmp12 << 3) - tmp7;
413  tmp5 = (tmp11 << 3) - tmp6;
414  tmp4 = (tmp10 << 3) + tmp5;
415 
416  // Final output stage: descale and write column
 // all eight results are added to the existing buffer contents
417  outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
418  outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
419  outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
420  outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
421  outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
422  outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
423  outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
424  outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
425  outptr++;
426 
427  wsptr += DCTSIZE; // advance pointer to next row
428  }
429 }
430 
/**
 * Row pass of the forward DCT.
 *
 * For each of 4 * cnt pixel columns, reads 8 pixels spaced line_size apart
 * and writes 8 coefficients to data. The input pointer then moves one pixel
 * to the right and the output pointer advances by DCTSIZE elements. The
 * coefficients are stored in a permuted order (see the comment in the even
 * part).
 *
 * @param data      output coefficients, DCTSIZE per processed column
 * @param pixels    top pixel of the first column to transform
 * @param line_size distance between consecutive pixel lines
 * @param cnt       a quarter of the number of columns to process
 */
431 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
432 {
433  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
434  int_simd16_t tmp10, tmp11, tmp12, tmp13;
435  int_simd16_t z1, z2, z3, z4, z5, z11, z13;
436  int16_t *dataptr;
437 
438  cnt *= 4;
439  // Pass 1: process rows.
440 
441  dataptr = data;
442  for (; cnt > 0; cnt--) {
 // butterfly on the 8 vertically adjacent pixels of this column
443  tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
444  tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
445  tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
446  tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
447  tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
448  tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
449  tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
450  tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
451 
452  // Even part
453 
454  tmp10 = tmp0 + tmp3;
455  tmp13 = tmp0 - tmp3;
456  tmp11 = tmp1 + tmp2;
457  tmp12 = tmp1 - tmp2;
458  //Even columns are written first, this leads to a different order of columns
459  //in column_fidct(), but they are processed independently, so all is ok.
460  //Later, row_idct() reads the columns in the same order.
461  dataptr[2] = tmp10 + tmp11;
462  dataptr[3] = tmp10 - tmp11;
463 
464  z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
465  dataptr[0] = tmp13 + z1;
466  dataptr[1] = tmp13 - z1;
467 
468  // Odd part
469 
470  tmp10 = (tmp4 + tmp5) << 2;
471  tmp11 = (tmp5 + tmp6) << 2;
472  tmp12 = (tmp6 + tmp7) << 2;
473 
474  z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
475  z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
476  z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
477  z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
478 
479  z11 = tmp7 + z3;
480  z13 = tmp7 - z3;
481 
482  dataptr[4] = z13 + z2;
483  dataptr[5] = z13 - z2;
484  dataptr[6] = z11 + z4;
485  dataptr[7] = z11 - z4;
486 
487  pixels++; // advance pointer to next column
488  dataptr += DCTSIZE;
489  }
490 }
491 
493 {
494  static const enum PixelFormat pix_fmts[] = {
502  };
504  return 0;
505 }
506 
507 static int config_input(AVFilterLink *inlink)
508 {
509  AVFilterContext *ctx = inlink->dst;
510  FSPPContext *fspp = ctx->priv;
511  const int h = FFALIGN(inlink->h + 16, 16);
512  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
513 
514  fspp->hsub = desc->log2_chroma_w;
515  fspp->vsub = desc->log2_chroma_h;
516 
517  fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
518  fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
519  fspp->src = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
520 
521  if (!fspp->temp || !fspp->src)
522  return AVERROR(ENOMEM);
523 
524  if (!fspp->use_bframe_qp && !fspp->qp) {
525  fspp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
526  fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
527  if (!fspp->non_b_qp_table)
528  return AVERROR(ENOMEM);
529  }
530 
531  fspp->store_slice = store_slice_c;
533  fspp->mul_thrmat = mul_thrmat_c;
535  fspp->row_idct = row_idct_c;
536  fspp->row_fdct = row_fdct_c;
537 
538  if (ARCH_X86)
539  ff_fspp_init_x86(fspp);
540 
541  return 0;
542 }
543 
544 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
545 {
546  AVFilterContext *ctx = inlink->dst;
547  FSPPContext *fspp = ctx->priv;
548  AVFilterLink *outlink = ctx->outputs[0];
549  AVFrame *out = in;
550 
551  int qp_stride = 0;
552  uint8_t *qp_table = NULL;
553  int i, bias;
554  int custom_threshold_m[64];
555 
556  bias = (1 << 4) + fspp->strength;
557 
558  for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
559  custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
560 
561  for (i = 0; i < 8; i++) {
562  fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
563  |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
564  |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
565  |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
566 
567  fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
568  |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
569  |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
570  |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
571  }
572 
573  if (fspp->qp)
574  fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
575 
576  /* if we are not in a constant user quantizer mode and we don't want to use
577  * the quantizers from the B-frames (B-frames often have a higher QP), we
578  * need to save the qp table from the last non B-frame; this is what the
579  * following code block does */
580  if (!fspp->qp) {
581  qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
582 
583  if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
584  int w, h;
585 
586  /* if the qp stride is not set, it means the QP are only defined on
587  * a line basis */
588  if (!qp_stride) {
589  w = FF_CEIL_RSHIFT(inlink->w, 4);
590  h = 1;
591  } else {
592  w = qp_stride;
593  h = FF_CEIL_RSHIFT(inlink->h, 4);
594  }
595  if (w * h > fspp->non_b_qp_alloc_size) {
596  int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
597  if (ret < 0) {
598  fspp->non_b_qp_alloc_size = 0;
599  return ret;
600  }
601  fspp->non_b_qp_alloc_size = w * h;
602  }
603 
604  av_assert0(w * h <= fspp->non_b_qp_alloc_size);
605  memcpy(fspp->non_b_qp_table, qp_table, w * h);
606  }
607  }
608 
609  if (fspp->log2_count && !ctx->is_disabled) {
610  if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
611  qp_table = fspp->non_b_qp_table;
612 
613  if (qp_table || fspp->qp) {
614  const int cw = FF_CEIL_RSHIFT(inlink->w, fspp->hsub);
615  const int ch = FF_CEIL_RSHIFT(inlink->h, fspp->vsub);
616 
617  /* get a new frame if in-place is not possible or if the dimensions
618  * are not multiple of 8 */
619  if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
620  const int aligned_w = FFALIGN(inlink->w, 8);
621  const int aligned_h = FFALIGN(inlink->h, 8);
622 
623  out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
624  if (!out) {
625  av_frame_free(&in);
626  return AVERROR(ENOMEM);
627  }
628  av_frame_copy_props(out, in);
629  }
630 
631  filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
632  inlink->w, inlink->h, qp_table, qp_stride, 1);
633  filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
634  cw, ch, qp_table, qp_stride, 0);
635  filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
636  cw, ch, qp_table, qp_stride, 0);
637  emms_c();
638  }
639  }
640 
641  if (in != out) {
642  if (in->data[3])
643  av_image_copy_plane(out->data[3], out->linesize[3],
644  in ->data[3], in ->linesize[3],
645  inlink->w, inlink->h);
646  av_frame_free(&in);
647  }
648  return ff_filter_frame(outlink, out);
649 }
650 
651 static av_cold void uninit(AVFilterContext *ctx)
652 {
653  FSPPContext *fspp = ctx->priv;
654  av_freep(&fspp->temp);
655  av_freep(&fspp->src);
656  av_freep(&fspp->non_b_qp_table);
657 }
658 
659 static const AVFilterPad fspp_inputs[] = {
660  {
661  .name = "default",
662  .type = AVMEDIA_TYPE_VIDEO,
663  .config_props = config_input,
664  .filter_frame = filter_frame,
665  },
666  { NULL }
667 };
668 
669 static const AVFilterPad fspp_outputs[] = {
670  {
671  .name = "default",
672  .type = AVMEDIA_TYPE_VIDEO,
673  },
674  { NULL }
675 };
676 
678  .name = "fspp",
679  .description = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
680  .priv_size = sizeof(FSPPContext),
681  .uninit = uninit,
683  .inputs = fspp_inputs,
684  .outputs = fspp_outputs,
685  .priv_class = &fspp_class,
687 };