Go to the documentation of this file.
27 #include "../ops_chain.h"
29 #include "../uops_macros.h"
49 static_assert(
sizeof(
out->priv.ptr) <=
sizeof(
int32_t[2]),
50 ">8 byte pointers not supported");
57 for (
int i = 0;
i <
filter->num_weights;
i++)
87 const int taps_align =
sizeof(
int32_t) / pixel_size;
88 const int filter_size =
filter->filter_size;
90 const size_t aligned_size =
FFALIGN(filter_size, taps_align);
91 const size_t line_size =
FFALIGN(
filter->dst_size, block_size);
93 if (aligned_size > INT_MAX)
110 const int mmsize = block_size * 2;
111 const int gather_size = mmsize /
sizeof(
int32_t);
112 for (
size_t x = 0; x < line_size; x += block_size) {
113 const int elems =
FFMIN(block_size,
filter->dst_size - x);
114 for (
int j = 0; j < filter_size; j++) {
115 const int jb = j & ~(taps_align - 1);
116 const int ji = j - jb;
117 const size_t idx_base = x * aligned_size + jb * block_size + ji;
118 for (
int i = 0;
i < elems;
i++) {
119 const int w =
filter->weights[(x +
i) * filter_size + j];
120 size_t idx = idx_base;
132 const int gather_base =
i & ~(gather_size - 1);
133 const int gather_pos =
i - gather_base;
134 const int lane_idx = gather_pos >> 2;
135 const int pos_in_lane = gather_pos & 3;
136 idx += gather_base * 4
137 + (pos_in_lane >> 1) * (mmsize / 2)
139 + (pos_in_lane & 1) * 4;
141 idx +=
i * taps_align;
154 out->priv.uptr[1] = aligned_size;
157 for (
int i = 0;
i < 4;
i++) {
159 out->over_read[
i] = (aligned_size - filter_size) * pixel_size;
195 const int taps_align = 16 / sizeof_weights;
196 const int pixels_align = 4;
197 const int filter_size =
filter->filter_size;
198 const size_t aligned_size =
FFALIGN(filter_size, taps_align);
224 for (
int x = 0; x <
filter->dst_size; x++) {
225 for (
int j = 0; j < filter_size; j++) {
226 const int xb = x & ~(pixels_align - 1);
227 const int jb = j & ~(taps_align - 1);
228 const int xi = x - xb, ji = j - jb;
229 const int w =
filter->weights[x * filter_size + j];
230 const int idx = xb * aligned_size + jb * pixels_align +
xi * taps_align + ji;
241 out->priv.uptr[1] = aligned_size * sizeof_weights;
244 for (
int i = 0;
i < 4;
i++) {
246 out->over_read[
i] = (aligned_size - filter_size) * pixel_size;
259 default:
return AVERROR(EINVAL);
268 for (
int i = 0;
i < 4;
i++)
/*
 * Initializer-list element: expands to the address of the entry object named
 * op_<NAME><EXT>, followed by a trailing comma. Any arguments after NAME are
 * accepted but do not appear in the expansion.
 */
300 #define REF_ENTRY(EXT, NAME, ...) &op_##NAME##EXT,
301 #define DECL_ENTRY(EXT, CHECK, SETUP, NAME, ...) \
302 void ff_##NAME##EXT(void); \
303 static const SwsOpEntry op_##NAME##EXT = { \
304 .func = (SwsFuncPtr) ff_##NAME##EXT, \
/*
 * Invokes SWS_FOR_STRUCT with DECL_ENTRY once per op kind listed below, for
 * pixel type TYPE and symbol suffix EXT.
 *
 * The last two arguments of every row line up with DECL_ENTRY's CHECK and
 * SETUP parameters (NOTE(review): the forwarding itself happens inside
 * SWS_FOR_STRUCT, which is defined elsewhere - confirm there). In this list
 * CHECK is always NULL, and a setup callback is given only for:
 *   READ_PACKED / WRITE_PACKED -> setup_rw_packed
 *   SCALE                      -> setup_scale
 *   ADD / MIN / MAX            -> ff_sws_setup_vec4
 *   LINEAR_FMA                 -> setup_linear
 *   DITHER                     -> setup_dither
 *
 * The op kinds and their order match REF_OPS_COMMON row for row; a row added
 * to one list needs a matching row in the other.
 */
311 #define DECL_OPS_COMMON(EXT, TYPE) \
312 SWS_FOR_STRUCT(TYPE, READ_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
313 SWS_FOR_STRUCT(TYPE, READ_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
314 SWS_FOR_STRUCT(TYPE, READ_BIT, DECL_ENTRY, EXT, NULL, NULL) \
315 SWS_FOR_STRUCT(TYPE, WRITE_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
316 SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
317 SWS_FOR_STRUCT(TYPE, WRITE_BIT, DECL_ENTRY, EXT, NULL, NULL) \
318 SWS_FOR_STRUCT(TYPE, SWAP_BYTES, DECL_ENTRY, EXT, NULL, NULL) \
319 SWS_FOR_STRUCT(TYPE, EXPAND_BIT, DECL_ENTRY, EXT, NULL, NULL) \
320 SWS_FOR_STRUCT(TYPE, MOVE, DECL_ENTRY, EXT, NULL, NULL) \
321 SWS_FOR_STRUCT(TYPE, SCALE, DECL_ENTRY, EXT, NULL, setup_scale) \
322 SWS_FOR_STRUCT(TYPE, ADD, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
323 SWS_FOR_STRUCT(TYPE, MIN, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
324 SWS_FOR_STRUCT(TYPE, MAX, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
325 SWS_FOR_STRUCT(TYPE, UNPACK, DECL_ENTRY, EXT, NULL, NULL) \
326 SWS_FOR_STRUCT(TYPE, PACK, DECL_ENTRY, EXT, NULL, NULL) \
327 SWS_FOR_STRUCT(TYPE, LSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
328 SWS_FOR_STRUCT(TYPE, RSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
329 SWS_FOR_STRUCT(TYPE, LINEAR_FMA, DECL_ENTRY, EXT, NULL, setup_linear) \
330 SWS_FOR_STRUCT(TYPE, DITHER, DECL_ENTRY, EXT, NULL, setup_dither) \
/*
 * Invokes SWS_FOR with REF_ENTRY once per op kind listed below, for pixel
 * type TYPE and symbol suffix EXT. REF_ENTRY expands to "&op_<NAME><EXT>,",
 * so this list is meant to be expanded inside an initializer list
 * (NOTE(review): how SWS_FOR derives NAME from TYPE and the op kind is
 * defined elsewhere - confirm there).
 *
 * The op kinds and their order match DECL_OPS_COMMON row for row; a row added
 * to one list needs a matching row in the other.
 */
333 #define REF_OPS_COMMON(EXT, TYPE) \
334 SWS_FOR(TYPE, READ_PACKED, REF_ENTRY, EXT) \
335 SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY, EXT) \
336 SWS_FOR(TYPE, READ_BIT, REF_ENTRY, EXT) \
337 SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY, EXT) \
338 SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY, EXT) \
339 SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY, EXT) \
340 SWS_FOR(TYPE, SWAP_BYTES, REF_ENTRY, EXT) \
341 SWS_FOR(TYPE, EXPAND_BIT, REF_ENTRY, EXT) \
342 SWS_FOR(TYPE, MOVE, REF_ENTRY, EXT) \
343 SWS_FOR(TYPE, SCALE, REF_ENTRY, EXT) \
344 SWS_FOR(TYPE, ADD, REF_ENTRY, EXT) \
345 SWS_FOR(TYPE, MIN, REF_ENTRY, EXT) \
346 SWS_FOR(TYPE, MAX, REF_ENTRY, EXT) \
347 SWS_FOR(TYPE, UNPACK, REF_ENTRY, EXT) \
348 SWS_FOR(TYPE, PACK, REF_ENTRY, EXT) \
349 SWS_FOR(TYPE, LSHIFT, REF_ENTRY, EXT) \
350 SWS_FOR(TYPE, RSHIFT, REF_ENTRY, EXT) \
351 SWS_FOR(TYPE, LINEAR_FMA, REF_ENTRY, EXT) \
352 SWS_FOR(TYPE, DITHER, REF_ENTRY, EXT) \
355 #define DECL_TABLE_U8(EXT, SIZE, FLAG) \
356 DECL_OPS_COMMON(EXT, U8) \
357 SWS_FOR_STRUCT(U8, READ_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
358 SWS_FOR_STRUCT(U8, WRITE_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
359 SWS_FOR_STRUCT(U8, CLEAR, DECL_ENTRY, EXT, NULL, setup_clear) \
361 static const SwsOpTable ops_u8##EXT = { \
362 .cpu_flags = AV_CPU_FLAG_##FLAG, \
363 .block_size = SIZE, \
365 REF_OPS_COMMON(EXT, U8) \
366 SWS_FOR(U8, READ_PLANAR, REF_ENTRY, EXT) \
367 SWS_FOR(U8, WRITE_PLANAR, REF_ENTRY, EXT) \
368 SWS_FOR(U8, CLEAR, REF_ENTRY, EXT) \
373 #define DECL_TABLE_U16(EXT, SIZE, FLAG) \
374 DECL_OPS_COMMON(EXT, U16) \
375 SWS_FOR_STRUCT(U8, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
376 SWS_FOR_STRUCT(U16, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
377 SWS_FOR_STRUCT(U8, EXPAND_PAIR, DECL_ENTRY, EXT, NULL, NULL) \
379 static const SwsOpTable ops_u16##EXT = { \
380 .cpu_flags = AV_CPU_FLAG_##FLAG, \
381 .block_size = SIZE, \
383 REF_OPS_COMMON(EXT, U16) \
384 SWS_FOR(U8, TO_U16, REF_ENTRY, EXT) \
385 SWS_FOR(U16, TO_U8, REF_ENTRY, EXT) \
386 SWS_FOR(U8, EXPAND_PAIR, REF_ENTRY, EXT) \
391 #define DECL_TABLE_U32(EXT, SIZE, FLAG) \
392 DECL_OPS_COMMON(EXT, U32) \
393 SWS_FOR_STRUCT(U8, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
394 SWS_FOR_STRUCT(U32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
395 SWS_FOR_STRUCT(U16, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
396 SWS_FOR_STRUCT(U32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
397 SWS_FOR_STRUCT(U8, EXPAND_QUAD, DECL_ENTRY, EXT, NULL, NULL) \
399 static const SwsOpTable ops_u32##EXT = { \
400 .cpu_flags = AV_CPU_FLAG_##FLAG, \
401 .block_size = SIZE, \
403 REF_OPS_COMMON(EXT, U32) \
404 SWS_FOR(U8, TO_U32, REF_ENTRY, EXT) \
405 SWS_FOR(U32, TO_U8, REF_ENTRY, EXT) \
406 SWS_FOR(U16, TO_U32, REF_ENTRY, EXT) \
407 SWS_FOR(U32, TO_U16, REF_ENTRY, EXT) \
408 SWS_FOR(U8, EXPAND_QUAD, REF_ENTRY, EXT) \
413 #define DECL_TABLE_F32(EXT, SIZE, FLAG) \
414 DECL_OPS_COMMON(EXT, F32) \
415 SWS_FOR_STRUCT(U8, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
416 SWS_FOR_STRUCT(F32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
417 SWS_FOR_STRUCT(U16, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
418 SWS_FOR_STRUCT(F32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
419 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
420 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
421 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
422 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
423 check_filter_h_4x4, setup_filter_h_4x4) \
424 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
425 check_filter_h_4x4, setup_filter_h_4x4) \
426 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
427 check_filter_h_4x4, setup_filter_h_4x4) \
428 SWS_FOR_STRUCT(U8, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
429 SWS_FOR_STRUCT(U16, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
430 SWS_FOR_STRUCT(F32, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
431 SWS_FOR_STRUCT(U8, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
432 SWS_FOR_STRUCT(U16, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
433 SWS_FOR_STRUCT(F32, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
435 static const SwsOpTable ops_f32##EXT = { \
436 .cpu_flags = AV_CPU_FLAG_##FLAG, \
437 .block_size = SIZE, \
439 REF_OPS_COMMON(EXT, F32) \
440 SWS_FOR(U8, TO_F32, REF_ENTRY, EXT) \
441 SWS_FOR(F32, TO_U8, REF_ENTRY, EXT) \
442 SWS_FOR(U16, TO_F32, REF_ENTRY, EXT) \
443 SWS_FOR(F32, TO_U16, REF_ENTRY, EXT) \
444 SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
445 SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
446 SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
447 SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, EXT) \
448 SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, EXT) \
449 SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, EXT) \
450 SWS_FOR(U8, READ_PLANAR_FV, REF_ENTRY, EXT) \
451 SWS_FOR(U16, READ_PLANAR_FV, REF_ENTRY, EXT) \
452 SWS_FOR(F32, READ_PLANAR_FV, REF_ENTRY, EXT) \
453 SWS_FOR(U8, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
454 SWS_FOR(U16, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
455 SWS_FOR(F32, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
485 static int movsize(
const int bytes,
const int mmsize)
487 return bytes <= 4 ? 4 :
508 const int num_lanes = mmsize / 16;
516 .block_size = pixels * num_lanes,
517 .over_read = {
movsize(in_total, mmsize) - in_total },
518 .over_write = {
movsize(out_total, mmsize) - out_total },
527 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
529 SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
530 if (in_total == IN && out_total == OUT) \
531 out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
570 for (
int i = 0;
i < 4;
i++)
620 int op_block_size =
out->block_size;
630 op_block_size, chain);
639 switch (
FFMAX(read_planes, write_planes)) {
640 case 1:
out->func = ff_sws_process1_x86;
break;
641 case 2:
out->func = ff_sws_process2_x86;
break;
642 case 3:
out->func = ff_sws_process3_x86;
break;
643 case 4:
out->func = ff_sws_process4_x86;
break;
static int hscale_sizeof_weight(const SwsUOp *uop)
Copyright (C) 2025 Niklas Haas.
int ff_sws_rw_op_planes(const SwsOp *op)
Return the number of planes involved in a read/write operation.
static uint32_t expand32(const SwsPixelType type, const SwsPixel value)
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
const SwsOpBackend backend_x86
static void normalize_clear(SwsUOp *uop)
#define EXTERNAL_FMA3(flags)
#define u(width, name, range_min, range_max)
Represents a computed filter kernel.
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
int ff_sws_uop_lookup(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, const SwsUOp *uop, const int block_size, SwsOpChain *chain)
"Compile" a single uop by looking it up in a list of fixed size uop tables, in decreasing order of pr...
void(* filter)(uint8_t *src, int stride, int qscale)
static void ff_op_priv_unref(SwsOpPriv *priv)
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
int ff_sws_pixel_type_size(SwsPixelType type)
#define DECL_TABLE_F32(EXT, SIZE, FLAG)
@ SWS_BACKEND_X86
Chained x86 SIMD kernels.
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
it's the only field you need to keep assuming you have a context There is some magic you don't need to care about around this just let it vf type
#define SWS_COMP_ELEMS(N)
#define DECL_TABLE_U8(EXT, SIZE, FLAG)
#define FF_ARRAY_ELEMS(a)
SwsFilterWeights * kernel
SwsOpChain * ff_sws_op_chain_alloc(void)
int flags
Flags modifying the (de)muxer behaviour.
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
static AVFormatContext * ctx
#define EXTERNAL_AVX2(flags)
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
#define DECL_TABLE_U16(EXT, SIZE, FLAG)
static int compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
Compiled "chain" of operations, which can be dispatched efficiently.
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
static const SwsOpTable *const tables[]
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
static bool uop_is_type_invariant(const SwsUOpType uop)
static bool check_filter_h_4x4(const SwsImplParams *params)
#define DECL_TABLE_U32(EXT, SIZE, FLAG)
#define EXTERNAL_AVX512(flags)
SWS_DECL_FUNC(ff_sws_process1_x86)
#define i(width, name, range_min, range_max)
static int movsize(const int bytes, const int mmsize)
void * av_refstruct_ref(void *obj)
Create a new reference to an object managed via this API, i.e.
#define EXTERNAL_SSE4(flags)
SwsUOpList * ff_sws_uop_list_alloc(void)
void ff_sws_op_chain_free_cb(void *ptr)
static int setup_filter_h_4x4(const SwsImplParams *params, SwsImplResult *out)
static void ff_sws_op_chain_free(SwsOpChain *chain)
static const int weights[]
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
it's the only field you need to keep assuming you have a context There is some magic you don't need to care about around this just let it vf default value
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
void * av_calloc(size_t nmemb, size_t size)
static void ff_op_priv_free(SwsOpPriv *priv)
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops, SwsUOpFlags flags, SwsUOpList *uops)
Translate a list of operations down to micro-ops, which can be further optimized and then directly ex...
void ff_sws_uop_list_free(SwsUOpList **p_ops)
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
static int setup_rw_packed(const SwsImplParams *params, SwsImplResult *out)
Copyright (C) 2025-2026 Niklas Haas.
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
static int setup_scale(const SwsImplParams *params, SwsImplResult *out)
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
#define xi(width, name, var, range_min, range_max, subs,...)
Helper struct for representing a list of operations.
Main external API structure.
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)