FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
vf_fspp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  */
22 
23 /**
24  * @file
25  * Fast Simple Post-processing filter
26  * This implementation is based on an algorithm described in
27  * "Aria Nosratinia Embedded Post-Processing for
28  * Enhancement of Compressed Images (1999)"
29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
31  * them can be performed once per block, not per pixel. This allows for much
32  * higher speed.
33  *
34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
35  * project, and ported by Arwa Arif for FFmpeg.
36  */
37 
38 #include "libavutil/avassert.h"
39 #include "libavutil/imgutils.h"
40 #include "libavutil/opt.h"
41 #include "libavutil/pixdesc.h"
42 #include "internal.h"
43 #include "vf_fspp.h"
44 
45 #define OFFSET(x) offsetof(FSPPContext, x)
46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
47 static const AVOption fspp_options[] = {
48  { "quality", "set quality", OFFSET(log2_count), AV_OPT_TYPE_INT, {.i64 = 4}, 4, MAX_LEVEL, FLAGS },
49  { "qp", "force a constant quantizer parameter", OFFSET(qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 64, FLAGS },
50  { "strength", "set filter strength", OFFSET(strength), AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32, FLAGS },
51  { "use_bframe_qp", "use B-frames' QP", OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, FLAGS },
52  { NULL }
53 };
54 
56 
57 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
58  { 0, 48, 12, 60, 3, 51, 15, 63, },
59  { 32, 16, 44, 28, 35, 19, 47, 31, },
60  { 8, 56, 4, 52, 11, 59, 7, 55, },
61  { 40, 24, 36, 20, 43, 27, 39, 23, },
62  { 2, 50, 14, 62, 1, 49, 13, 61, },
63  { 34, 18, 46, 30, 33, 17, 45, 29, },
64  { 10, 58, 6, 54, 9, 57, 5, 53, },
65  { 42, 26, 38, 22, 41, 25, 37, 21, },
66 };
67 
/*
 * Base threshold table, 64 entries laid out as 8 rows of 8.
 *
 * The entries must not be too large (the maximum here is 296): large values
 * make the result depend too strongly on the quantizer, or may overflow
 * (to be checked), which shows up as flashing in the output.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
81 
82 //This func reads from 1 slice, 1 and clears 0 & 1
83 static void store_slice_c(uint8_t *dst, int16_t *src,
84  ptrdiff_t dst_stride, ptrdiff_t src_stride,
85  ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
86 {
87  int y, x;
88 #define STORE(pos) \
89  temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
90  src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
91  if (temp & 0x100) temp = ~(temp >> 31); \
92  dst[x + pos] = temp;
93 
94  for (y = 0; y < height; y++) {
95  const uint8_t *d = dither[y];
96  for (x = 0; x < width; x += 8) {
97  int temp;
98  STORE(0);
99  STORE(1);
100  STORE(2);
101  STORE(3);
102  STORE(4);
103  STORE(5);
104  STORE(6);
105  STORE(7);
106  }
107  src += src_stride;
108  dst += dst_stride;
109  }
110 }
111 
112 //This func reads from 2 slices, 0 & 2 and clears 2-nd
113 static void store_slice2_c(uint8_t *dst, int16_t *src,
114  ptrdiff_t dst_stride, ptrdiff_t src_stride,
115  ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
116 {
117  int y, x;
118 #define STORE2(pos) \
119  temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
120  src[x + pos + 16 * src_stride] = 0; \
121  if (temp & 0x100) temp = ~(temp >> 31); \
122  dst[x + pos] = temp;
123 
124  for (y = 0; y < height; y++) {
125  const uint8_t *d = dither[y];
126  for (x = 0; x < width; x += 8) {
127  int temp;
128  STORE2(0);
129  STORE2(1);
130  STORE2(2);
131  STORE2(3);
132  STORE2(4);
133  STORE2(5);
134  STORE2(6);
135  STORE2(7);
136  }
137  src += src_stride;
138  dst += dst_stride;
139  }
140 }
141 
/*
 * Scale the 64-entry threshold matrix by a quantizer value.
 *
 * thr_adr_noq  input: 64 unscaled thresholds
 * thr_adr      output: 64 thresholds, each q * the matching input entry
 *              (the product is stored back as int16_t)
 * q            multiplier
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    const int16_t *in  = thr_adr_noq;
    int16_t       *out = thr_adr;
    int remaining      = 64;

    while (remaining-- > 0)
        *out++ = q * *in++;
}
148 
/*
 * Post-process one plane.
 *
 * The input plane is copied into p->src with an 8-sample mirrored border on
 * all four sides. The plane is then walked in steps of (6 - p->log2_count)
 * rows: row_fdct transforms a strip of pixels, column_fidct thresholds it and
 * accumulates into block3, and row_idct adds the result into p->temp.
 * Every time 8 output rows are complete, store_slice / store_slice2 convert
 * them from p->temp to 8-bit samples in dst.
 *
 * p          filter context (work buffers, DSP function pointers, options)
 * dst, src   output / input plane; nothing is done if either is NULL
 * dst_stride, src_stride  line sizes of dst / src
 * width, height           plane dimensions in samples
 * qp_store, qp_stride     quantizer table and its stride; only read when
 *                         p->qp is 0
 * is_luma    nonzero: work stride is p->temp_stride and QP coordinates are
 *            shifted by 4; zero: work stride is width + 16 and the QP shifts
 *            are reduced by p->hsub / p->vsub
 */
149 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
150  int dst_stride, int src_stride,
151  int width, int height,
152  uint8_t *qp_store, int qp_stride, int is_luma)
153 {
154  int x, x0, y, es, qy, t;
155 
156  const int stride = is_luma ? p->temp_stride : (width + 16);
157  const int step = 6 - p->log2_count;
158  const int qpsh = 4 - p->hsub * !is_luma;
159  const int qpsv = 4 - p->vsub * !is_luma;
160 
 // block and block3 are the two halves of one aligned scratch array, viewed as int16_t.
161  DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
162  int16_t *block = (int16_t *)block_align;
163  int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
164 
 // NOTE(review): this clears 4 * 8 * BLOCKSZ bytes, while block3 spans
 // 4 * 8 * BLOCKSZ int32_t elements - confirm the partial clear is intended.
165  memset(block3, 0, 4 * 8 * BLOCKSZ);
166 
167  if (!src || !dst) return;
168 
 // Copy every input row to offset (8, 8) of p->src and mirror 8 samples
 // outwards past its left and right ends.
169  for (y = 0; y < height; y++) {
170  int index = 8 + 8 * stride + y * stride;
171  memcpy(p->src + index, src + y * src_stride, width);
172  for (x = 0; x < 8; x++) {
173  p->src[index - x - 1] = p->src[index + x ];
174  p->src[index + width + x ] = p->src[index + width - x - 1];
175  }
176  }
177 
 // Mirror 8 full rows above the first and below the last copied row.
178  for (y = 0; y < 8; y++) {
179  memcpy(p->src + ( 7 - y ) * stride, p->src + ( y + 8 ) * stride, stride);
180  memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
181  }
182  //FIXME (try edge emu)
183 
 // Clear `width` accumulator samples in rows 8..23 of p->temp, starting 8 samples in.
184  for (y = 8; y < 24; y++)
185  memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));
186 
187  for (y = step; y < height + 8; y += step) { //step= 1,2
188  const int y1 = y - 8 + step; //l5-7 l4-6;
189  qy = y - 4;
190 
 // Clamp the QP row to the plane, then turn it into a row offset into qp_store.
191  if (qy > height - 1) qy = height - 1;
192  if (qy < 0) qy = 0;
193 
194  qy = (qy >> qpsv) * qp_stride;
195  p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
196 
197  for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
198  p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
199 
 // With a forced QP the threshold matrix is used as is; otherwise it is
 // rescaled through mul_thrmat whenever the looked-up QP changes.
200  if (p->qp)
201  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
202  else
203  for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
204  t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
205 
206  if (t < 0) t = 0; //t always < width-2
207 
208  t = qp_store[qy + (t >> qpsh)];
209  t = ff_norm_qscale(t, p->qscale_type);
210 
211  if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
212  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
213  }
214  p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
 // Move the tail of both scratch buffers to the front for the next strip.
215  memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
216  memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
217  }
218 
 // Handle what is left of the row after the last full strip.
219  es = width + 8 - x0; // 8, ...
220  if (es > 8)
221  p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
222 
223  p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
224  if (es > 3)
225  p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
226 
 // When y1 is a nonzero multiple of 8, write out 8 rows; bit 3 of y1
 // selects between store_slice and store_slice2.
227  if (!(y1 & 7) && y1) {
228  if (y1 & 8)
229  p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
230  dst_stride, stride, width, 8, 5 - p->log2_count);
231  else
232  p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
233  dst_stride, stride, width, 8, 5 - p->log2_count);
234  }
235  }
236 
 // Write out the remaining (y & 7) rows of a final, partial slice.
237  if (y & 7) { // height % 8 != 0
238  if (y & 8)
239  p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
240  dst_stride, stride, width, y&7, 5 - p->log2_count);
241  else
242  p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
243  dst_stride, stride, width, y&7, 5 - p->log2_count);
244  }
245 }
246 
/*
 * Combined column pass: forward DCT, thresholding and inverse DCT.
 *
 * For every column, 8 values spaced DCTSIZE apart are read from `data`,
 * transformed (FDCT), passed through THRESHOLD() against the matching
 * entries of the threshold matrix, transformed back (IDCT) and written to
 * `output` at the same layout.
 *
 * thr_adr  threshold matrix; restarted for every group of columns
 * data     input coefficients
 * output   result buffer; rows 0..5 are accumulated (+=), rows 6 and 7 are
 *          overwritten (=)
 * cnt      consumed in steps of 2; each step handles DCTSIZE columns and then
 *          skips 8 further entries in both data and output
 *
 * THRESHOLD(), MULTIPLY16H() and the FIX_* constants are defined outside
 * this file view.
 */
247 static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
248 {
249  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
250  int_simd16_t tmp10, tmp11, tmp12, tmp13;
251  int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
252  int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
253 
254  int16_t *dataptr;
255  int16_t *wsptr;
256  int16_t *threshold;
257  int ctr;
258 
259  dataptr = data;
260  wsptr = output;
261 
262  for (; cnt > 0; cnt -= 2) { //start positions
263  threshold = (int16_t *)thr_adr;//threshold_mtx
264  for (ctr = DCTSIZE; ctr > 0; ctr--) {
265  // Process columns from input, add to output.
266  tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
267  tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
268 
269  tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
270  tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
271 
272  tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
273  tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
274 
275  tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
276  tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
277 
278  // Even part of FDCT
279 
280  tmp10 = tmp0 + tmp3;
281  tmp13 = tmp0 - tmp3;
282  tmp11 = tmp1 + tmp2;
283  tmp12 = tmp1 - tmp2;
284 
285  d0 = tmp10 + tmp11;
286  d4 = tmp10 - tmp11;
287 
288  z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
289  d2 = tmp13 + z1;
290  d6 = tmp13 - z1;
291 
292  // Even part of IDCT
293 
 // The even coefficients d0, d2, d4, d6 are thresholded into tmp0..tmp3,
 // which are reused from here on as IDCT inputs.
294  THRESHOLD(tmp0, d0, threshold[0 * 8]);
295  THRESHOLD(tmp1, d2, threshold[2 * 8]);
296  THRESHOLD(tmp2, d4, threshold[4 * 8]);
297  THRESHOLD(tmp3, d6, threshold[6 * 8]);
298  tmp0 += 2;
299  tmp10 = (tmp0 + tmp2) >> 2;
300  tmp11 = (tmp0 - tmp2) >> 2;
301 
302  tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
303  tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
304 
305  tmp0 = tmp10 + tmp13; //->temps
306  tmp3 = tmp10 - tmp13; //->temps
307  tmp1 = tmp11 + tmp12; //->temps
308  tmp2 = tmp11 - tmp12; //->temps
309 
310  // Odd part of FDCT
311 
312  tmp10 = tmp4 + tmp5;
313  tmp11 = tmp5 + tmp6;
314  tmp12 = tmp6 + tmp7;
315 
316  z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
317  z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
318  z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
319  z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
320 
321  z11 = tmp7 + z3;
322  z13 = tmp7 - z3;
323 
324  d5 = z13 + z2;
325  d3 = z13 - z2;
326  d1 = z11 + z4;
327  d7 = z11 - z4;
328 
329  // Odd part of IDCT
330 
 // The odd coefficients d1, d3, d5, d7 are thresholded into tmp4..tmp7.
331  THRESHOLD(tmp4, d1, threshold[1 * 8]);
332  THRESHOLD(tmp5, d3, threshold[3 * 8]);
333  THRESHOLD(tmp6, d5, threshold[5 * 8]);
334  THRESHOLD(tmp7, d7, threshold[7 * 8]);
335 
336  //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
337  z13 = tmp6 + tmp5;
338  z10 = (tmp6 - tmp5) << 1;
339  z11 = tmp4 + tmp7;
340  z12 = (tmp4 - tmp7) << 1;
341 
342  tmp7 = (z11 + z13) >> 2; //+2 !
343  tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
344  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
345  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
346  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
347 
348  tmp6 = tmp12 - tmp7;
349  tmp5 = tmp11 - tmp6;
350  tmp4 = tmp10 + tmp5;
351 
 // Rows 0..5 accumulate into the output; rows 6 and 7 are overwritten.
 // NOTE(review): presumably rows 6 and 7 hold no earlier contribution
 // at this point - confirm against the caller's buffer cycling.
352  wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
353  wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
354  wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
355  wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
356  wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
357  wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
358  wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
359  wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
360  //
361  dataptr++; //next column
362  wsptr++;
363  threshold++;
364  }
365  dataptr += 8; //skip each second start pos
366  wsptr += 8;
367  }
368 }
369 
/*
 * Inverse DCT pass that adds its result into the accumulator.
 *
 * Processes cnt * 4 groups of DCTSIZE coefficients from `workspace`. Each
 * group yields 8 values that are descaled by 3 bits and added to one column
 * of `output_adr`, one value per row (rows are output_stride apart); the
 * output column advances by one per group.
 *
 * Within a group the coefficients are read as [2],[3],[0],[1] for the even
 * part and [4]..[7] for the odd part, i.e. the order in which row_fdct_c
 * stores them.
 *
 * MULTIPLY16H(), DESCALE() and the FIX_* constants are defined outside this
 * file view.
 */
370 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
371 {
372  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
373  int_simd16_t tmp10, tmp11, tmp12, tmp13;
374  int_simd16_t z5, z10, z11, z12, z13;
375  int16_t *outptr;
376  int16_t *wsptr;
377 
378  cnt *= 4;
379  wsptr = workspace;
380  outptr = output_adr;
381  for (; cnt > 0; cnt--) {
382  // Even part
383  //Simd version reads 4x4 block and transposes it
384  tmp10 = wsptr[2] + wsptr[3];
385  tmp11 = wsptr[2] - wsptr[3];
386 
387  tmp13 = wsptr[0] + wsptr[1];
388  tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
389 
390  tmp0 = tmp10 + tmp13; //->temps
391  tmp3 = tmp10 - tmp13; //->temps
392  tmp1 = tmp11 + tmp12;
393  tmp2 = tmp11 - tmp12;
394 
395  // Odd part
396  //Also transpose, with previous:
397  // ---- ---- ||||
398  // ---- ---- idct ||||
399  // ---- ---- ---> ||||
400  // ---- ---- ||||
401  z13 = wsptr[4] + wsptr[5];
402  z10 = wsptr[4] - wsptr[5];
403  z11 = wsptr[6] + wsptr[7];
404  z12 = wsptr[6] - wsptr[7];
405 
406  tmp7 = z11 + z13;
407  tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
408 
409  z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
410  tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
411  tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
412 
413  tmp6 = (tmp12 << 3) - tmp7;
414  tmp5 = (tmp11 << 3) - tmp6;
415  tmp4 = (tmp10 << 3) + tmp5;
416 
417  // Final output stage: descale and write column
 // All 8 rows are accumulated into the output here.
418  outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
419  outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
420  outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
421  outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
422  outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
423  outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
424  outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
425  outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
426  outptr++;
427 
428  wsptr += DCTSIZE; // advance pointer to next row
429  }
430 }
431 
/*
 * Forward DCT over vertical runs of 8 pixels.
 *
 * For each of cnt * 4 consecutive pixel columns, 8 samples spaced line_size
 * apart are read from `pixels` and DCTSIZE coefficients are written to
 * `data`. The coefficients are stored in the order [2],[3],[0],[1] (even
 * part) followed by [4]..[7] (odd part), not in natural order.
 *
 * MULTIPLY16H() and the FIX_* constants are defined outside this file view.
 */
432 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
433 {
434  int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
435  int_simd16_t tmp10, tmp11, tmp12, tmp13;
436  int_simd16_t z1, z2, z3, z4, z5, z11, z13;
437  int16_t *dataptr;
438 
439  cnt *= 4;
440  // Pass 1: process rows.
441 
442  dataptr = data;
443  for (; cnt > 0; cnt--) {
444  tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
445  tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
446  tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
447  tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
448  tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
449  tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
450  tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
451  tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
452 
453  // Even part
454 
455  tmp10 = tmp0 + tmp3;
456  tmp13 = tmp0 - tmp3;
457  tmp11 = tmp1 + tmp2;
458  tmp12 = tmp1 - tmp2;
459  //Even columns are written first; this leads to a different order of columns
460  //in column_fidct(), but they are processed independently, so all is OK.
461  //Later, row_idct() reads the columns in the same order.
462  dataptr[2] = tmp10 + tmp11;
463  dataptr[3] = tmp10 - tmp11;
464 
465  z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
466  dataptr[0] = tmp13 + z1;
467  dataptr[1] = tmp13 - z1;
468 
469  // Odd part
470 
471  tmp10 = (tmp4 + tmp5) << 2;
472  tmp11 = (tmp5 + tmp6) << 2;
473  tmp12 = (tmp6 + tmp7) << 2;
474 
475  z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
476  z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
477  z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
478  z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
479 
480  z11 = tmp7 + z3;
481  z13 = tmp7 - z3;
482 
483  dataptr[4] = z13 + z2;
484  dataptr[5] = z13 - z2;
485  dataptr[6] = z11 + z4;
486  dataptr[7] = z11 - z4;
487 
488  pixels++; // advance pointer to next column
489  dataptr += DCTSIZE;
490  }
491 }
492 
494 {
495  static const enum PixelFormat pix_fmts[] = {
503  };
505  return 0;
506 }
507 
508 static int config_input(AVFilterLink *inlink)
509 {
510  AVFilterContext *ctx = inlink->dst;
511  FSPPContext *fspp = ctx->priv;
512  const int h = FFALIGN(inlink->h + 16, 16);
513  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
514 
515  fspp->hsub = desc->log2_chroma_w;
516  fspp->vsub = desc->log2_chroma_h;
517 
518  fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
519  fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
520  fspp->src = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
521 
522  if (!fspp->temp || !fspp->src)
523  return AVERROR(ENOMEM);
524 
525  if (!fspp->use_bframe_qp && !fspp->qp) {
526  fspp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
527  fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
528  if (!fspp->non_b_qp_table)
529  return AVERROR(ENOMEM);
530  }
531 
532  fspp->store_slice = store_slice_c;
534  fspp->mul_thrmat = mul_thrmat_c;
536  fspp->row_idct = row_idct_c;
537  fspp->row_fdct = row_fdct_c;
538 
539  if (ARCH_X86)
540  ff_fspp_init_x86(fspp);
541 
542  return 0;
543 }
544 
545 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
546 {
547  AVFilterContext *ctx = inlink->dst;
548  FSPPContext *fspp = ctx->priv;
549  AVFilterLink *outlink = ctx->outputs[0];
550  AVFrame *out = in;
551 
552  int qp_stride = 0;
553  uint8_t *qp_table = NULL;
554  int i, bias;
555  int custom_threshold_m[64];
556 
557  bias = (1 << 4) + fspp->strength;
558 
559  for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
560  custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
561 
562  for (i = 0; i < 8; i++) {
563  fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
564  |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
565  |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
566  |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
567 
568  fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
569  |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
570  |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
571  |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
572  }
573 
574  if (fspp->qp)
575  fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
576 
577  /* if we are not in a constant user quantizer mode and we don't want to use
578  * the quantizers from the B-frames (B-frames often have a higher QP), we
579  * need to save the qp table from the last non B-frame; this is what the
580  * following code block does */
581  if (!fspp->qp) {
582  qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
583 
584  if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
585  int w, h;
586 
587  /* if the qp stride is not set, it means the QP are only defined on
588  * a line basis */
589  if (!qp_stride) {
590  w = FF_CEIL_RSHIFT(inlink->w, 4);
591  h = 1;
592  } else {
593  w = qp_stride;
594  h = FF_CEIL_RSHIFT(inlink->h, 4);
595  }
596  if (w * h > fspp->non_b_qp_alloc_size) {
597  int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
598  if (ret < 0) {
599  fspp->non_b_qp_alloc_size = 0;
600  return ret;
601  }
602  fspp->non_b_qp_alloc_size = w * h;
603  }
604 
605  av_assert0(w * h <= fspp->non_b_qp_alloc_size);
606  memcpy(fspp->non_b_qp_table, qp_table, w * h);
607  }
608  }
609 
610  if (fspp->log2_count && !ctx->is_disabled) {
611  if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
612  qp_table = fspp->non_b_qp_table;
613 
614  if (qp_table || fspp->qp) {
615  const int cw = FF_CEIL_RSHIFT(inlink->w, fspp->hsub);
616  const int ch = FF_CEIL_RSHIFT(inlink->h, fspp->vsub);
617 
618  /* get a new frame if in-place is not possible or if the dimensions
619  * are not multiple of 8 */
620  if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
621  const int aligned_w = FFALIGN(inlink->w, 8);
622  const int aligned_h = FFALIGN(inlink->h, 8);
623 
624  out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
625  if (!out) {
626  av_frame_free(&in);
627  return AVERROR(ENOMEM);
628  }
629  av_frame_copy_props(out, in);
630  out->width = in->width;
631  out->height = in->height;
632  }
633 
634  filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
635  inlink->w, inlink->h, qp_table, qp_stride, 1);
636  filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
637  cw, ch, qp_table, qp_stride, 0);
638  filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
639  cw, ch, qp_table, qp_stride, 0);
640  emms_c();
641  }
642  }
643 
644  if (in != out) {
645  if (in->data[3])
646  av_image_copy_plane(out->data[3], out->linesize[3],
647  in ->data[3], in ->linesize[3],
648  inlink->w, inlink->h);
649  av_frame_free(&in);
650  }
651  return ff_filter_frame(outlink, out);
652 }
653 
654 static av_cold void uninit(AVFilterContext *ctx)
655 {
656  FSPPContext *fspp = ctx->priv;
657  av_freep(&fspp->temp);
658  av_freep(&fspp->src);
659  av_freep(&fspp->non_b_qp_table);
660 }
661 
662 static const AVFilterPad fspp_inputs[] = {
663  {
664  .name = "default",
665  .type = AVMEDIA_TYPE_VIDEO,
666  .config_props = config_input,
667  .filter_frame = filter_frame,
668  },
669  { NULL }
670 };
671 
672 static const AVFilterPad fspp_outputs[] = {
673  {
674  .name = "default",
675  .type = AVMEDIA_TYPE_VIDEO,
676  },
677  { NULL }
678 };
679 
681  .name = "fspp",
682  .description = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
683  .priv_size = sizeof(FSPPContext),
684  .uninit = uninit,
686  .inputs = fspp_inputs,
687  .outputs = fspp_outputs,
688  .priv_class = &fspp_class,
690 };