FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025-2026 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 #include "libavutil/x86/cpu.h"
26 
27 #include "../ops_chain.h"
28 #include "../uops.h"
29 #include "../uops_macros.h"
30 
31 static int setup_rw_packed(const SwsImplParams *params, SwsImplResult *out)
32 {
33  const SwsUOp *uop = params->uop;
34 
35  /* 3-component packed reads/writes process one extra garbage word */
36  if (uop->mask == SWS_COMP_ELEMS(3)) {
37  switch (uop->uop) {
38  case SWS_UOP_READ_PACKED: out->over_read[0] = sizeof(uint32_t); break;
39  case SWS_UOP_WRITE_PACKED: out->over_write[0] = sizeof(uint32_t); break;
40  }
41  }
42 
43  return 0;
44 }
45 
46 static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
47 {
48  const SwsFilterWeights *filter = params->uop->data.kernel;
49  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
50  ">8 byte pointers not supported");
51 
52  /* Pre-convert weights to float */
53  float *weights = av_calloc(filter->num_weights, sizeof(float));
54  if (!weights)
55  return AVERROR(ENOMEM);
56 
57  for (int i = 0; i < filter->num_weights; i++)
58  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
59 
60  out->priv.ptr = weights;
61  out->priv.uptr[1] = filter->filter_size;
62  out->free = ff_op_priv_free;
63  return 0;
64 }
65 
66 static int hscale_sizeof_weight(const SwsUOp *uop)
67 {
68  switch (uop->type) {
69  case SWS_PIXEL_U8: return sizeof(int16_t);
70  case SWS_PIXEL_U16: return sizeof(int16_t);
71  case SWS_PIXEL_F32: return sizeof(float);
72  default: return 0;
73  }
74 }
75 
76 static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
77 {
78  const SwsUOp *uop = params->uop;
79  const SwsFilterWeights *filter = uop->data.kernel;
80 
81  /**
82  * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller
83  * size, we need to gather 2/4 taps simultaneously and unroll the inner
84  * loop over several packed samples.
85  */
86  const int pixel_size = ff_sws_pixel_type_size(uop->type);
87  const int taps_align = sizeof(int32_t) / pixel_size;
88  const int filter_size = filter->filter_size;
89  const int block_size = params->table->block_size;
90  const size_t aligned_size = FFALIGN(filter_size, taps_align);
91  const size_t line_size = FFALIGN(filter->dst_size, block_size);
92  av_assert1(FFALIGN(line_size, taps_align) == line_size);
93  if (aligned_size > INT_MAX)
94  return AVERROR(EINVAL);
95 
96  union {
97  void *ptr;
98  int16_t *i16;
99  float *f32;
100  } weights;
101 
102  const int sizeof_weight = hscale_sizeof_weight(uop);
103  weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size);
104  if (!weights.ptr)
105  return AVERROR(ENOMEM);
106 
107  /**
108  * Transpose filter weights to group (aligned) taps by block
109  */
110  const int mmsize = block_size * 2;
111  const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */
112  for (size_t x = 0; x < line_size; x += block_size) {
113  const int elems = FFMIN(block_size, filter->dst_size - x);
114  for (int j = 0; j < filter_size; j++) {
115  const int jb = j & ~(taps_align - 1);
116  const int ji = j - jb;
117  const size_t idx_base = x * aligned_size + jb * block_size + ji;
118  for (int i = 0; i < elems; i++) {
119  const int w = filter->weights[(x + i) * filter_size + j];
120  size_t idx = idx_base;
121  if (uop->type == SWS_PIXEL_U8) {
122  /* Interleave the pixels within each lane, i.e.:
123  * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0)
124  * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1)
125  * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0)
126  * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1)
127  * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0)
128  * ...
129  * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1)
130  * (repeat for taps 4-7, etc.)
131  */
132  const int gather_base = i & ~(gather_size - 1);
133  const int gather_pos = i - gather_base;
134  const int lane_idx = gather_pos >> 2;
135  const int pos_in_lane = gather_pos & 3;
136  idx += gather_base * 4 /* which gather (m0 or m1) */
137  + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */
138  + lane_idx * 8 /* 8 ints per lane */
139  + (pos_in_lane & 1) * 4; /* 4 taps per pair */
140  } else {
141  idx += i * taps_align;
142  }
143 
144  switch (uop->type) {
145  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
146  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
147  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
148  }
149  }
150  }
151  }
152 
153  out->priv.ptr = weights.ptr;
154  out->priv.uptr[1] = aligned_size;
155  out->free = ff_op_priv_free;
156 
157  for (int i = 0; i < 4; i++) {
158  if (uop->mask & SWS_COMP(i))
159  out->over_read[i] = (aligned_size - filter_size) * pixel_size;
160  }
161  return 0;
162 }
163 
164 static bool check_filter_h_4x4(const SwsImplParams *params)
165 {
166  SwsContext *ctx = params->ctx;
167  const SwsUOp *uop = params->uop;
168  if ((ctx->flags & SWS_BITEXACT) && uop->type == SWS_PIXEL_F32)
169  return false; /* different accumulation order due to 4x4 transpose */
170 
171  const int cpu_flags = av_get_cpu_flags();
173  return true; /* always prefer over gathers if gathers are slow */
174 
175  /**
176  * Otherwise, prefer it above a certain filter size. Empirically, this
177  * kernel seems to be faster whenever the reference/gather kernel crosses
178  * a breakpoint for the number of gathers needed, but this filter doesn't.
179  *
180  * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
181  */
182  const SwsFilterWeights *filter = uop->data.kernel;
183  return uop->type == SWS_PIXEL_U8 && filter->filter_size > 12 ||
184  uop->type == SWS_PIXEL_U16 && filter->filter_size > 4 ||
185  uop->type == SWS_PIXEL_F32 && filter->filter_size > 1;
186 }
187 
189 {
190  const SwsUOp *uop = params->uop;
191  const SwsFilterWeights *filter = uop->data.kernel;
192  const int pixel_size = ff_sws_pixel_type_size(uop->type);
193  const int sizeof_weights = hscale_sizeof_weight(uop);
194  const int block_size = params->table->block_size;
195  const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
196  const int pixels_align = 4; /* pixels per iteration */
197  const int filter_size = filter->filter_size;
198  const size_t aligned_size = FFALIGN(filter_size, taps_align);
199  const int line_size = FFALIGN(filter->dst_size, block_size);
200  av_assert1(FFALIGN(line_size, pixels_align) == line_size);
201 
202  union {
203  void *ptr;
204  int16_t *i16;
205  float *f32;
206  } weights;
207 
208  weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
209  if (!weights.ptr)
210  return AVERROR(ENOMEM);
211 
212  /**
213  * Desired memory layout: [w][taps][pixels_align][taps_align]
214  *
215  * Example with taps_align=8, pixels_align=4:
216  * [a0, a1, ... a7] weights for pixel 0, taps 0..7
217  * [b0, b1, ... b7] weights for pixel 1, taps 0..7
218  * [c0, c1, ... c7] weights for pixel 2, taps 0..7
219  * [d0, d1, ... d7] weights for pixel 3, taps 0..7
220  * [a8, a9, ... a15] weights for pixel 0, taps 8..15
221  * ...
222  * repeat for all taps, then move on to pixels 4..7, etc.
223  */
224  for (int x = 0; x < filter->dst_size; x++) {
225  for (int j = 0; j < filter_size; j++) {
226  const int xb = x & ~(pixels_align - 1);
227  const int jb = j & ~(taps_align - 1);
228  const int xi = x - xb, ji = j - jb;
229  const int w = filter->weights[x * filter_size + j];
230  const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
231 
232  switch (uop->type) {
233  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
234  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
235  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
236  }
237  }
238  }
239 
240  out->priv.ptr = weights.ptr;
241  out->priv.uptr[1] = aligned_size * sizeof_weights;
242  out->free = ff_op_priv_free;
243 
244  for (int i = 0; i < 4; i++) {
245  if (uop->mask & SWS_COMP(i))
246  out->over_read[i] = (aligned_size - filter_size) * pixel_size;
247  }
248  return 0;
249 }
250 
251 static int setup_scale(const SwsImplParams *params, SwsImplResult *out)
252 {
253  const SwsUOp *uop = params->uop;
254  switch (uop->type) {
255  case SWS_PIXEL_U8: out->priv.u16[0] = uop->data.scalar.u8; break; /* for pmullw */
256  case SWS_PIXEL_U16: out->priv.u16[0] = uop->data.scalar.u16; break;
257  case SWS_PIXEL_U32: out->priv.u32[0] = uop->data.scalar.u32; break;
258  case SWS_PIXEL_F32: out->priv.f32[0] = uop->data.scalar.f32; break;
259  default: return AVERROR(EINVAL);
260  }
261 
262  return 0;
263 }
264 
265 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
266 {
267  const SwsUOp *uop = params->uop;
268  for (int i = 0; i < 4; i++)
269  out->priv.u32[i] = uop->data.vec4[i].u32;
270  return 0;
271 }
272 
273 static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
274 {
275  out->priv.ptr = av_refstruct_ref(params->uop->data.ptr);
276  out->free = ff_op_priv_unref;
277  return 0;
278 }
279 
280 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
281 {
282  const SwsUOp *uop = params->uop;
283  out->priv.ptr = av_memdup(uop->data.mat4, sizeof(uop->data.mat4));
284  out->free = ff_op_priv_free;
285  return out->priv.ptr ? 0 : AVERROR(ENOMEM);
286 }
287 
288 static bool uop_is_type_invariant(const SwsUOpType uop)
289 {
290  switch (uop) {
291  case SWS_UOP_READ_PLANAR:
293  case SWS_UOP_CLEAR:
294  return true;
295  default:
296  return false;
297  }
298 }
299 
/* Expands to the address of the entry `op_<NAME><EXT>` followed by a comma,
 * i.e. one element of a table's `.entries` initializer list. */
300 #define REF_ENTRY(EXT, NAME, ...) &op_##NAME##EXT,
/* Declares the external kernel symbol `ff_<NAME><EXT>` and defines the
 * matching static `SwsOpEntry op_<NAME><EXT>`, wiring in the CHECK and SETUP
 * callbacks. Any further arguments are pasted in as additional designated
 * initializers of the entry. */
301 #define DECL_ENTRY(EXT, CHECK, SETUP, NAME, ...) \
302  void ff_##NAME##EXT(void); \
303  static const SwsOpEntry op_##NAME##EXT = { \
304  .func = (SwsFuncPtr) ff_##NAME##EXT, \
305  .check = CHECK, \
306  .setup = SETUP, \
307  __VA_ARGS__, \
308  };
309 
/* Declares one entry per variant of each listed uop for pixel type TYPE,
 * with suffix EXT. The two trailing arguments of every line are the CHECK
 * and SETUP callbacks forwarded to DECL_ENTRY (NULL when not needed).
 * NOTE(review): SWS_FOR_STRUCT is not defined in this file; presumably it
 * comes from uops_macros.h and enumerates the variants — confirm there.
 * Must be kept in sync with REF_OPS_COMMON. */
310 /* Define all UOPs except conversion ops and type-invariant ops */
311 #define DECL_OPS_COMMON(EXT, TYPE) \
312 SWS_FOR_STRUCT(TYPE, READ_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
313 SWS_FOR_STRUCT(TYPE, READ_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
314 SWS_FOR_STRUCT(TYPE, READ_BIT, DECL_ENTRY, EXT, NULL, NULL) \
315 SWS_FOR_STRUCT(TYPE, WRITE_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
316 SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
317 SWS_FOR_STRUCT(TYPE, WRITE_BIT, DECL_ENTRY, EXT, NULL, NULL) \
318 SWS_FOR_STRUCT(TYPE, SWAP_BYTES, DECL_ENTRY, EXT, NULL, NULL) \
319 SWS_FOR_STRUCT(TYPE, EXPAND_BIT, DECL_ENTRY, EXT, NULL, NULL) \
320 SWS_FOR_STRUCT(TYPE, MOVE, DECL_ENTRY, EXT, NULL, NULL) \
321 SWS_FOR_STRUCT(TYPE, SCALE, DECL_ENTRY, EXT, NULL, setup_scale) \
322 SWS_FOR_STRUCT(TYPE, ADD, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
323 SWS_FOR_STRUCT(TYPE, MIN, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
324 SWS_FOR_STRUCT(TYPE, MAX, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
325 SWS_FOR_STRUCT(TYPE, UNPACK, DECL_ENTRY, EXT, NULL, NULL) \
326 SWS_FOR_STRUCT(TYPE, PACK, DECL_ENTRY, EXT, NULL, NULL) \
327 SWS_FOR_STRUCT(TYPE, LSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
328 SWS_FOR_STRUCT(TYPE, RSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
329 SWS_FOR_STRUCT(TYPE, LINEAR_FMA, DECL_ENTRY, EXT, NULL, setup_linear) \
330 SWS_FOR_STRUCT(TYPE, DITHER, DECL_ENTRY, EXT, NULL, setup_dither) \
331 /* end of macro */
332 
/* Counterpart of DECL_OPS_COMMON: expands to the list of entry addresses
 * (via REF_ENTRY) for the same uops, for use inside a table's `.entries`
 * initializer. The uop list here mirrors the one in DECL_OPS_COMMON.
 * NOTE(review): SWS_FOR is not defined in this file; presumably provided
 * by uops_macros.h — confirm there. */
333 #define REF_OPS_COMMON(EXT, TYPE) \
334  SWS_FOR(TYPE, READ_PACKED, REF_ENTRY, EXT) \
335  SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY, EXT) \
336  SWS_FOR(TYPE, READ_BIT, REF_ENTRY, EXT) \
337  SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY, EXT) \
338  SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY, EXT) \
339  SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY, EXT) \
340  SWS_FOR(TYPE, SWAP_BYTES, REF_ENTRY, EXT) \
341  SWS_FOR(TYPE, EXPAND_BIT, REF_ENTRY, EXT) \
342  SWS_FOR(TYPE, MOVE, REF_ENTRY, EXT) \
343  SWS_FOR(TYPE, SCALE, REF_ENTRY, EXT) \
344  SWS_FOR(TYPE, ADD, REF_ENTRY, EXT) \
345  SWS_FOR(TYPE, MIN, REF_ENTRY, EXT) \
346  SWS_FOR(TYPE, MAX, REF_ENTRY, EXT) \
347  SWS_FOR(TYPE, UNPACK, REF_ENTRY, EXT) \
348  SWS_FOR(TYPE, PACK, REF_ENTRY, EXT) \
349  SWS_FOR(TYPE, LSHIFT, REF_ENTRY, EXT) \
350  SWS_FOR(TYPE, RSHIFT, REF_ENTRY, EXT) \
351  SWS_FOR(TYPE, LINEAR_FMA, REF_ENTRY, EXT) \
352  SWS_FOR(TYPE, DITHER, REF_ENTRY, EXT) \
353  /* end of macro */
354 
/* Declares all U8 entries for suffix EXT and defines the table
 * `ops_u8<EXT>` with block size SIZE, requiring AV_CPU_FLAG_<FLAG>.
 * Besides the common ops, only the U8 table carries the planar read,
 * planar write and clear entries. The entry list is NULL-terminated. */
355 #define DECL_TABLE_U8(EXT, SIZE, FLAG) \
356 DECL_OPS_COMMON(EXT, U8) \
357 SWS_FOR_STRUCT(U8, READ_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
358 SWS_FOR_STRUCT(U8, WRITE_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
359 SWS_FOR_STRUCT(U8, CLEAR, DECL_ENTRY, EXT, NULL, setup_clear) \
360  \
361 static const SwsOpTable ops_u8##EXT = { \
362  .cpu_flags = AV_CPU_FLAG_##FLAG, \
363  .block_size = SIZE, \
364  .entries = { \
365  REF_OPS_COMMON(EXT, U8) \
366  SWS_FOR(U8, READ_PLANAR, REF_ENTRY, EXT) \
367  SWS_FOR(U8, WRITE_PLANAR, REF_ENTRY, EXT) \
368  SWS_FOR(U8, CLEAR, REF_ENTRY, EXT) \
369  NULL \
370  }, \
371 };
372 
/* Declares all U16 entries for suffix EXT and defines the table
 * `ops_u16<EXT>` with block size SIZE, requiring AV_CPU_FLAG_<FLAG>.
 * In addition to the common ops it carries the U8<->U16 conversions and
 * the U8 EXPAND_PAIR entry. The entry list is NULL-terminated. */
373 #define DECL_TABLE_U16(EXT, SIZE, FLAG) \
374 DECL_OPS_COMMON(EXT, U16) \
375 SWS_FOR_STRUCT(U8, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
376 SWS_FOR_STRUCT(U16, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
377 SWS_FOR_STRUCT(U8, EXPAND_PAIR, DECL_ENTRY, EXT, NULL, NULL) \
378  \
379 static const SwsOpTable ops_u16##EXT = { \
380  .cpu_flags = AV_CPU_FLAG_##FLAG, \
381  .block_size = SIZE, \
382  .entries = { \
383  REF_OPS_COMMON(EXT, U16) \
384  SWS_FOR(U8, TO_U16, REF_ENTRY, EXT) \
385  SWS_FOR(U16, TO_U8, REF_ENTRY, EXT) \
386  SWS_FOR(U8, EXPAND_PAIR, REF_ENTRY, EXT) \
387  NULL \
388  }, \
389 };
390 
/* Declares all U32 entries for suffix EXT and defines the table
 * `ops_u32<EXT>` with block size SIZE, requiring AV_CPU_FLAG_<FLAG>.
 * In addition to the common ops it carries the U8<->U32 and U16<->U32
 * conversions and the U8 EXPAND_QUAD entry. The entry list is
 * NULL-terminated. */
391 #define DECL_TABLE_U32(EXT, SIZE, FLAG) \
392 DECL_OPS_COMMON(EXT, U32) \
393 SWS_FOR_STRUCT(U8, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
394 SWS_FOR_STRUCT(U32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
395 SWS_FOR_STRUCT(U16, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
396 SWS_FOR_STRUCT(U32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
397 SWS_FOR_STRUCT(U8, EXPAND_QUAD, DECL_ENTRY, EXT, NULL, NULL) \
398  \
399 static const SwsOpTable ops_u32##EXT = { \
400  .cpu_flags = AV_CPU_FLAG_##FLAG, \
401  .block_size = SIZE, \
402  .entries = { \
403  REF_OPS_COMMON(EXT, U32) \
404  SWS_FOR(U8, TO_U32, REF_ENTRY, EXT) \
405  SWS_FOR(U32, TO_U8, REF_ENTRY, EXT) \
406  SWS_FOR(U16, TO_U32, REF_ENTRY, EXT) \
407  SWS_FOR(U32, TO_U16, REF_ENTRY, EXT) \
408  SWS_FOR(U8, EXPAND_QUAD, REF_ENTRY, EXT) \
409  NULL \
410  }, \
411 };
412 
/* Declares all F32 entries for suffix EXT and defines the table
 * `ops_f32<EXT>` with block size SIZE, requiring AV_CPU_FLAG_<FLAG>.
 * In addition to the common ops it carries the U8/U16<->F32 conversions
 * and the filtered planar reads for U8, U16 and F32 input:
 *   - READ_PLANAR_FH, generic variant (setup_filter_h)
 *   - READ_PLANAR_FH, `_4x4` variant (check_filter_h_4x4 +
 *     setup_filter_h_4x4)
 *   - READ_PLANAR_FV and READ_PLANAR_FV_FMA (setup_filter_v)
 * In the entry list the `_4x4` FH entries are placed ahead of the generic
 * FH entries. NOTE(review): presumably so that lookup picks the 4x4 kernel
 * whenever its check passes — confirm against the lookup order. */
413 #define DECL_TABLE_F32(EXT, SIZE, FLAG) \
414 DECL_OPS_COMMON(EXT, F32) \
415 SWS_FOR_STRUCT(U8, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
416 SWS_FOR_STRUCT(F32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
417 SWS_FOR_STRUCT(U16, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
418 SWS_FOR_STRUCT(F32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
419 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
420 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
421 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
422 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
423  check_filter_h_4x4, setup_filter_h_4x4) \
424 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
425  check_filter_h_4x4, setup_filter_h_4x4) \
426 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
427  check_filter_h_4x4, setup_filter_h_4x4) \
428 SWS_FOR_STRUCT(U8, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
429 SWS_FOR_STRUCT(U16, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
430 SWS_FOR_STRUCT(F32, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
431 SWS_FOR_STRUCT(U8, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
432 SWS_FOR_STRUCT(U16, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
433 SWS_FOR_STRUCT(F32, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
434  \
435 static const SwsOpTable ops_f32##EXT = { \
436  .cpu_flags = AV_CPU_FLAG_##FLAG, \
437  .block_size = SIZE, \
438  .entries = { \
439  REF_OPS_COMMON(EXT, F32) \
440  SWS_FOR(U8, TO_F32, REF_ENTRY, EXT) \
441  SWS_FOR(F32, TO_U8, REF_ENTRY, EXT) \
442  SWS_FOR(U16, TO_F32, REF_ENTRY, EXT) \
443  SWS_FOR(F32, TO_U16, REF_ENTRY, EXT) \
444  SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
445  SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
446  SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
447  SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, EXT) \
448  SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, EXT) \
449  SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, EXT) \
450  SWS_FOR(U8, READ_PLANAR_FV, REF_ENTRY, EXT) \
451  SWS_FOR(U16, READ_PLANAR_FV, REF_ENTRY, EXT) \
452  SWS_FOR(F32, READ_PLANAR_FV, REF_ENTRY, EXT) \
453  SWS_FOR(U8, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
454  SWS_FOR(U16, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
455  SWS_FOR(F32, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
456  NULL \
457  }, \
458 };
459 
/* Instantiate the kernel tables. Arguments are (symbol suffix, block size
 * in pixels, required CPU flag). U8 exists for SSE4 and AVX2; U16, U32 and
 * F32 exist for AVX2 only.
 * NOTE(review): the `_m1`/`_m2` suffix presumably denotes how many vector
 * registers a block spans (block size doubles from m1 to m2 for the same
 * ISA) — confirm against the assembly sources. */
460 DECL_TABLE_U8( _m1_sse4, 16, SSE4)
461 DECL_TABLE_U8( _m1_avx2, 32, AVX2)
462 DECL_TABLE_U8( _m2_sse4, 32, SSE4)
463 DECL_TABLE_U8( _m2_avx2, 64, AVX2)
464 DECL_TABLE_U16(_m1_avx2, 16, AVX2)
465 DECL_TABLE_U16(_m2_avx2, 32, AVX2)
466 DECL_TABLE_U32(_m2_avx2, 16, AVX2)
467 DECL_TABLE_F32(_m2_avx2, 16, AVX2)
468 
/* All kernel tables of this backend, in the order they are offered to the
 * uop lookup. The position of `ops_u8_m1_avx2` ahead of `ops_u8_m2_sse4`
 * is deliberate (both have a block size of 32). */
469 static const SwsOpTable *const tables[] = {
470  &ops_u8_m1_sse4,
471  &ops_u8_m1_avx2, /* order before _m2_sse4 */
472  &ops_u8_m2_sse4,
473  &ops_u8_m2_avx2,
474  &ops_u16_m1_avx2,
475  &ops_u16_m2_avx2,
476  &ops_u32_m2_avx2,
477  &ops_f32_m2_avx2,
478 };
479 
/* Declarations of the externally defined processing entry points; compile()
 * picks one according to the larger of the read and write plane counts
 * (1 to 4). */
480 SWS_DECL_FUNC(ff_sws_process1_x86);
481 SWS_DECL_FUNC(ff_sws_process2_x86);
482 SWS_DECL_FUNC(ff_sws_process3_x86);
483 SWS_DECL_FUNC(ff_sws_process4_x86);
484 
/**
 * Number of bytes actually transferred by the move instruction used for a
 * payload of `bytes` bytes: 4 (movd), 8 (movq), or a full vector of
 * `mmsize` bytes (movu).
 */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes <= 4)
        return 4;      /* movd */
    if (bytes <= 8)
        return 8;      /* movq */
    return mmsize;     /* movu */
}
491 
492 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
493 {
494  uint8_t shuffle[16];
495  int read_bytes, write_bytes;
496  int pixels;
497 
498  /* Solve the shuffle mask for one 128-bit lane only */
499  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
500  if (pixels < 0)
501  return pixels;
502 
503  /* We can't shuffle across lanes, so restrict the vector size to XMM
504  * whenever the read/write size would be a subset of the full vector */
505  if (read_bytes < 16 || write_bytes < 16)
506  mmsize = 16;
507 
508  const int num_lanes = mmsize / 16;
509  const int in_total = num_lanes * read_bytes;
510  const int out_total = num_lanes * write_bytes;
511 
512  *out = (SwsCompiledOp) {
513  .priv = av_memdup(shuffle, sizeof(shuffle)),
514  .free = av_free,
515  .slice_align = 1,
516  .block_size = pixels * num_lanes,
517  .over_read = { movsize(in_total, mmsize) - in_total },
518  .over_write = { movsize(out_total, mmsize) - out_total },
519  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
520  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
522  };
523 
524  if (!out->priv)
525  return AVERROR(ENOMEM);
526 
527 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
528 do { \
529  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
530  if (in_total == IN && out_total == OUT) \
531  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
532 } while (0)
533 
534  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
535  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
536  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
537  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
538  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
539  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
540  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
541  ASSIGN_SHUFFLE_FUNC(15, 5, sse4);
542  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
543  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
544  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
545  ASSIGN_SHUFFLE_FUNC(16, 4, sse4);
546  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
547  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
548  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
549  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
550  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
551  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
552  av_assert1(out->func);
553  return 0;
554 }
555 
556 /* Expand pixel value to 32-bits by repeating as necessary */
557 static uint32_t expand32(const SwsPixelType type, const SwsPixel value)
558 {
559  switch (type) {
560  case SWS_PIXEL_U8: return value.u8 * 0x01010101u;
561  case SWS_PIXEL_U16: return value.u16 * 0x00010001u;
562  case SWS_PIXEL_U32: return value.u32;
563  case SWS_PIXEL_F32: return value.u32; /* reinterpret */
564  default: return 0;
565  }
566 }
567 
568 static void normalize_clear(SwsUOp *uop)
569 {
570  for (int i = 0; i < 4; i++)
571  uop->data.vec4[i].u32 = expand32(uop->type, uop->data.vec4[i]);
572 }
573 
/**
 * Backend compile callback: turn an operation list into an x86 SIMD
 * implementation described by `*out`.
 *
 * First tries the packed-shuffle fast path; if that reports ENOTSUP, the
 * list is translated to micro-ops and each micro-op is resolved to a kernel
 * that is appended to an op chain.
 *
 * Returns 0 on success or a negative error code (ENOTSUP without at least
 * SSE4, ENOMEM on allocation failure, or whatever the helpers return).
 *
 * NOTE(review): this listing has gaps in its embedded line numbering
 * (578, 592, 596, 602-604, 629), so a few statements are not visible here;
 * the comments below mark each gap.
 */
574 static int compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
575 {
576  const int cpu_flags = av_get_cpu_flags();
577  int ret, mmsize;
 /* Pick the widest usable vector size in bytes.
  * NOTE(review): the condition for the 64-byte case (line 578) is missing
  * from this listing — presumably an AVX-512 check; confirm. */
579  mmsize = 64;
580  else if (EXTERNAL_AVX2(cpu_flags))
581  mmsize = 32;
582  else if (EXTERNAL_SSE4(cpu_flags))
583  mmsize = 16;
584  else
585  return AVERROR(ENOTSUP);
586 
587  /* Special fast path for in-place packed shuffle */
588  ret = solve_shuffle(ops, mmsize, out);
 /* Success and hard errors are both final; only ENOTSUP falls through */
589  if (ret != AVERROR(ENOTSUP))
590  return ret;
591 
 /* NOTE(review): the declaration/allocation of `chain` (line 592) is
  * missing from this listing. */
593  if (!chain)
594  return AVERROR(ENOMEM);
595 
 /* NOTE(review): the declaration/allocation of `uops` (line 596) is
  * missing from this listing. */
597  if (!uops) {
598  ret = AVERROR(ENOMEM);
599  goto fail;
600  }
601 
 /* NOTE(review): lines 602-604, which must define `flags`, are missing
  * from this listing. */
605 
606  ret = ff_sws_ops_translate(ctx, ops, flags, uops);
607  if (ret < 0)
608  goto fail;
609 
610  *out = (SwsCompiledOp) {
611  /* Use at most two full YMM regs during the widest precision section */
612  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
613  .slice_align = 1,
614  .free = ff_sws_op_chain_free_cb,
615  .priv = chain,
616  };
617 
 /* Resolve every micro-op to a kernel */
618  for (int i = 0; i < uops->num_ops; i++) {
619  SwsUOp *uop = &uops->ops[i];
620  int op_block_size = out->block_size;
621 
 /* Type-invariant uops are re-expressed as U8: the block size is
  * scaled by the pixel size, and clear values are first replicated to
  * a full 32-bit pattern. */
622  if (uop_is_type_invariant(uop->uop)) {
623  if (uop->uop == SWS_UOP_CLEAR)
624  normalize_clear(uop);
625  op_block_size *= ff_sws_pixel_type_size(uop->type);
626  uop->type = SWS_PIXEL_U8;
627  }
628 
 /* NOTE(review): the start of this call (line 629) is missing from the
  * listing; it assigns `ret`, presumably from the uop table lookup. */
630  op_block_size, chain);
631  if (ret < 0)
632  goto fail;
633  }
634 
 /* Choose the processing entry point by the larger plane count.
  * `read` may be NULL (treated as zero planes); `write` is used unchecked. */
635  const SwsOp *read = ff_sws_op_list_input(ops);
636  const SwsOp *write = ff_sws_op_list_output(ops);
637  const int read_planes = read ? ff_sws_rw_op_planes(read) : 0;
638  const int write_planes = ff_sws_rw_op_planes(write);
639  switch (FFMAX(read_planes, write_planes)) {
640  case 1: out->func = ff_sws_process1_x86; break;
641  case 2: out->func = ff_sws_process2_x86; break;
642  case 3: out->func = ff_sws_process3_x86; break;
643  case 4: out->func = ff_sws_process4_x86; break;
644  }
645 
 /* NOTE(review): every earlier `ret < 0` already jumps to `fail`, so this
  * branch looks unreachable; it would also skip freeing `uops`. Confirm
  * and consider removing. */
646  if (ret < 0) {
647  ff_sws_op_chain_free(chain);
648  return ret;
649  }
650 
 /* Publish the requirements accumulated in the chain */
651  out->cpu_flags = chain->cpu_flags;
652  memcpy(out->over_read, chain->over_read, sizeof(out->over_read));
653  memcpy(out->over_write, chain->over_write, sizeof(out->over_write));
654  ff_sws_uop_list_free(&uops);
655  return 0;
656 
 /* NOTE(review): when reached after `*out` was filled in, `out->priv` still
  * points at the chain freed here — confirm callers do not invoke
  * `out->free` after a failed compile. */
657 fail:
658  ff_sws_uop_list_free(&uops);
659  ff_sws_op_chain_free(chain);
660  return ret;
661 }
662 
664  .name = "x86",
665  .flags = SWS_BACKEND_X86,
666  .compile = compile,
667  .hw_format = AV_PIX_FMT_NONE,
668 };
hscale_sizeof_weight
static int hscale_sizeof_weight(const SwsUOp *uop)
Definition: ops.c:66
flags
const SwsFlags flags[]
Definition: swscale.c:85
SwsOpTable
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:154
ff_sws_rw_op_planes
int ff_sws_rw_op_planes(const SwsOp *op)
Return the number of planes involved in a read/write operation.
Definition: ops.c:169
expand32
static uint32_t expand32(const SwsPixelType type, const SwsPixel value)
Definition: ops.c:557
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
SwsOpChain::over_read
int over_read[4]
Definition: ops_chain.h:90
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:688
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:765
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:663
normalize_clear
static void normalize_clear(SwsUOp *uop)
Definition: ops.c:568
EXTERNAL_FMA3
#define EXTERNAL_FMA3(flags)
Definition: cpu.h:68
u
#define u(width, name, range_min, range_max)
Definition: cbs_apv.c:68
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:64
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:180
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
ff_sws_uop_lookup
int ff_sws_uop_lookup(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, const SwsUOp *uop, const int block_size, SwsOpChain *chain)
"Compile" a single uop by looking it up in a list of fixed size uop tables, in decreasing order of pr...
Definition: ops_chain.c:116
float.h
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
ff_op_priv_unref
static void ff_op_priv_unref(SwsOpPriv *priv)
Definition: ops_chain.h:149
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:280
SwsOpBackend::name
const char * name
Definition: ops_dispatch.h:134
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:77
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
DECL_TABLE_F32
#define DECL_TABLE_F32(EXT, SIZE, FLAG)
Definition: ops.c:413
SWS_BACKEND_X86
@ SWS_BACKEND_X86
Chained x86 SIMD kernels.
Definition: swscale.h:118
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
SwsOpTable::block_size
int block_size
Definition: ops_chain.h:156
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:273
type
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf type
Definition: writing_filters.txt:86
SWS_COMP_ELEMS
#define SWS_COMP_ELEMS(N)
Definition: uops.h:73
DECL_TABLE_U8
#define DECL_TABLE_U8(EXT, SIZE, FLAG)
Definition: ops.c:355
avassert.h
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
SwsUOp::kernel
SwsFilterWeights * kernel
Definition: uops.h:210
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
SwsUOp::uop
SwsUOpType uop
Definition: uops.h:204
AVFormatContext::flags
int flags
Flags modifying the (de)muxer behaviour.
Definition: avformat.h:1465
SWS_UOP_WRITE_PLANAR
@ SWS_UOP_WRITE_PLANAR
Definition: uops.h:100
setup_clear
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:265
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
SWS_UOP_READ_PACKED
@ SWS_UOP_READ_PACKED
Definition: uops.h:96
EXTERNAL_AVX2
#define EXTERNAL_AVX2(flags)
Definition: cpu.h:72
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:697
SwsPixel::f32
float f32
Definition: uops.h:57
SwsOpBackend
Definition: ops_dispatch.h:133
DECL_TABLE_U16
#define DECL_TABLE_U16(EXT, SIZE, FLAG)
Definition: ops.c:373
fail
#define fail
Definition: test.h:478
SwsPixel::u8
uint8_t u8
Definition: uops.h:54
compile
static int compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:574
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
SwsUOp::mat4
SwsPixel mat4[4][5]
Definition: uops.h:214
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
tables
static const SwsOpTable *const tables[]
Definition: ops.c:469
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:492
SWS_UOP_FLAG_MOVE
@ SWS_UOP_FLAG_MOVE
Definition: uops.h:85
uop_is_type_invariant
static bool uop_is_type_invariant(const SwsUOpType uop)
Definition: ops.c:288
check_filter_h_4x4
static bool check_filter_h_4x4(const SwsImplParams *params)
Definition: ops.c:164
DECL_TABLE_U32
#define DECL_TABLE_U32(EXT, SIZE, FLAG)
Definition: ops.c:391
SwsPixelType
SwsPixelType
Definition: uops.h:38
SwsImplParams
Definition: ops_chain.h:105
SwsUOp::data
union SwsUOp::@586 data
EXTERNAL_AVX512
#define EXTERNAL_AVX512(flags)
Definition: cpu.h:77
SwsPixel::u16
uint16_t u16
Definition: uops.h:55
SwsUOp
Definition: uops.h:201
SWS_DECL_FUNC
SWS_DECL_FUNC(ff_sws_process1_x86)
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
movsize
static int movsize(const int bytes, const int mmsize)
Definition: ops.c:485
SwsUOp::mask
SwsCompMask mask
Definition: uops.h:205
SWS_COMP
#define SWS_COMP(X)
Definition: uops.h:70
SWS_PIXEL_U32
@ SWS_PIXEL_U32
Definition: uops.h:42
av_refstruct_ref
void * av_refstruct_ref(void *obj)
Create a new reference to an object managed via this API, i.e.
Definition: refstruct.c:140
cpu_flags
CheckasmCpu cpu_flags
Definition: checkasm.c:84
SwsPixel
Definition: uops.h:51
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:62
ff_sws_uop_list_alloc
SwsUOpList * ff_sws_uop_list_alloc(void)
Definition: uops.c:382
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
setup_filter_h_4x4
static int setup_filter_h_4x4(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:188
SwsImplParams::ctx
SwsContext * ctx
Definition: ops_chain.h:111
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SWS_UOP_READ_PLANAR
@ SWS_UOP_READ_PLANAR
Definition: uops.h:92
weights
static const int weights[]
Definition: hevc_pel.c:32
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: uops.h:40
value
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default value
Definition: writing_filters.txt:86
SwsUOpType
SwsUOpType
Definition: uops.h:88
SwsOpChain::over_write
int over_write[4]
Definition: ops_chain.h:91
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsUOp::scalar
SwsPixel scalar
Definition: uops.h:212
SwsOp
Definition: ops.h:226
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:144
SwsUOp::type
SwsPixelType type
Definition: uops.h:203
ff_sws_ops_translate
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops, SwsUOpFlags flags, SwsUOpList *uops)
Translate a list of operations down to micro-ops, which can be further optimized and then directly ex...
Definition: uops.c:852
ret
ret
Definition: filter_design.txt:187
SwsUOpList::num_ops
int num_ops
Definition: uops.h:237
SwsCompiledOp
Definition: ops_dispatch.h:100
ff_sws_uop_list_free
void ff_sws_uop_list_free(SwsUOpList **p_ops)
Definition: uops.c:368
SwsUOp::ptr
SwsPixel * ptr
Definition: uops.h:211
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
setup_rw_packed
static int setup_rw_packed(const SwsImplParams *params, SwsImplResult *out)
Copyright (C) 2025-2026 Niklas Haas.
Definition: ops.c:31
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:811
SwsUOpList
Definition: uops.h:235
SwsUOp::vec4
SwsPixel vec4[4]
Definition: uops.h:213
setup_scale
static int setup_scale(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:251
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
SWS_UOP_WRITE_PACKED
@ SWS_UOP_WRITE_PACKED
Definition: uops.h:101
mem.h
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:62
cpu.h
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: uops.h:43
w
uint8_t w
Definition: llvidencdsp.c:39
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
SwsImplParams::uop
const SwsUOp * uop
Definition: ops_chain.h:108
SwsUOpFlags
uint32_t SwsUOpFlags
Definition: uops.h:81
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:46
int32_t
int32_t
Definition: audioconvert.c:56
SWS_UOP_FLAG_FMA
@ SWS_UOP_FLAG_FMA
Definition: uops.h:84
SWS_UOP_CLEAR
@ SWS_UOP_CLEAR
Definition: uops.h:131
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h264.c:190
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:281
SwsContext
Main external API structure.
Definition: swscale.h:229
SWS_PIXEL_U16
@ SWS_PIXEL_U16
Definition: uops.h:41
SwsPixel::u32
uint32_t u32
Definition: uops.h:56
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
SwsImplResult
Definition: ops_chain.h:114
SwsImplParams::table
const SwsOpTable * table
Definition: ops_chain.h:106
SwsUOpList::ops
SwsUOp * ops
Definition: uops.h:236
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:76