Go to the documentation of this file.
27 #include "../ops_chain.h"
29 #include "../uops_macros.h"
49 static_assert(
sizeof(
out->priv.ptr) <=
sizeof(
int32_t[2]),
50 ">8 byte pointers not supported");
57 for (
int i = 0;
i <
filter->num_weights;
i++)
87 const int taps_align =
sizeof(
int32_t) / pixel_size;
88 const int filter_size =
filter->filter_size;
90 const size_t aligned_size =
FFALIGN(filter_size, taps_align);
91 const size_t line_size =
FFALIGN(
filter->dst_size, block_size);
93 if (aligned_size > INT_MAX)
110 const int mmsize = block_size * 2;
111 const int gather_size = mmsize /
sizeof(
int32_t);
112 for (
size_t x = 0; x < line_size; x += block_size) {
113 const int elems =
FFMIN(block_size,
filter->dst_size - x);
114 for (
int j = 0; j < filter_size; j++) {
115 const int jb = j & ~(taps_align - 1);
116 const int ji = j - jb;
117 const size_t idx_base = x * aligned_size + jb * block_size + ji;
118 for (
int i = 0;
i < elems;
i++) {
119 const int w =
filter->weights[(x +
i) * filter_size + j];
120 size_t idx = idx_base;
132 const int gather_base =
i & ~(gather_size - 1);
133 const int gather_pos =
i - gather_base;
134 const int lane_idx = gather_pos >> 2;
135 const int pos_in_lane = gather_pos & 3;
136 idx += gather_base * 4
137 + (pos_in_lane >> 1) * (mmsize / 2)
139 + (pos_in_lane & 1) * 4;
141 idx +=
i * taps_align;
154 out->priv.uptr[1] = aligned_size;
157 for (
int i = 0;
i < 4;
i++) {
159 out->over_read[
i] = (aligned_size - filter_size) * pixel_size;
195 const int taps_align = 16 / sizeof_weights;
196 const int pixels_align = 4;
197 const int filter_size =
filter->filter_size;
198 const size_t aligned_size =
FFALIGN(filter_size, taps_align);
224 for (
int x = 0; x <
filter->dst_size; x++) {
225 for (
int j = 0; j < filter_size; j++) {
226 const int xb = x & ~(pixels_align - 1);
227 const int jb = j & ~(taps_align - 1);
228 const int xi = x - xb, ji = j - jb;
229 const int w =
filter->weights[x * filter_size + j];
230 const int idx = xb * aligned_size + jb * pixels_align +
xi * taps_align + ji;
241 out->priv.uptr[1] = aligned_size * sizeof_weights;
244 for (
int i = 0;
i < 4;
i++) {
246 out->over_read[
i] = (aligned_size - filter_size) * pixel_size;
259 default:
return AVERROR(EINVAL);
268 for (
int i = 0;
i < 4;
i++)
300 #define REF_ENTRY(EXT, NAME, ...) &uop_##NAME##EXT,
301 #define DECL_ENTRY(EXT, CHECK, SETUP, NAME, ...) \
302 void ff_##NAME##EXT(void); \
303 static const SwsUOpEntry uop_##NAME##EXT = { \
304 .func = (SwsFuncPtr) ff_##NAME##EXT, \
311 #define DECL_OPS_COMMON(EXT, TYPE) \
312 SWS_FOR_STRUCT(TYPE, READ_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
313 SWS_FOR_STRUCT(TYPE, READ_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
314 SWS_FOR_STRUCT(TYPE, READ_BIT, DECL_ENTRY, EXT, NULL, NULL) \
315 SWS_FOR_STRUCT(TYPE, READ_PALETTE, DECL_ENTRY, EXT, NULL, NULL) \
316 SWS_FOR_STRUCT(TYPE, WRITE_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \
317 SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \
318 SWS_FOR_STRUCT(TYPE, WRITE_BIT, DECL_ENTRY, EXT, NULL, NULL) \
319 SWS_FOR_STRUCT(TYPE, SWAP_BYTES, DECL_ENTRY, EXT, NULL, NULL) \
320 SWS_FOR_STRUCT(TYPE, EXPAND_BIT, DECL_ENTRY, EXT, NULL, NULL) \
321 SWS_FOR_STRUCT(TYPE, MOVE, DECL_ENTRY, EXT, NULL, NULL) \
322 SWS_FOR_STRUCT(TYPE, SCALE, DECL_ENTRY, EXT, NULL, setup_scale) \
323 SWS_FOR_STRUCT(TYPE, ADD, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
324 SWS_FOR_STRUCT(TYPE, MIN, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
325 SWS_FOR_STRUCT(TYPE, MAX, DECL_ENTRY, EXT, NULL, ff_sws_setup_vec4) \
326 SWS_FOR_STRUCT(TYPE, UNPACK, DECL_ENTRY, EXT, NULL, NULL) \
327 SWS_FOR_STRUCT(TYPE, PACK, DECL_ENTRY, EXT, NULL, NULL) \
328 SWS_FOR_STRUCT(TYPE, LSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
329 SWS_FOR_STRUCT(TYPE, RSHIFT, DECL_ENTRY, EXT, NULL, NULL) \
330 SWS_FOR_STRUCT(TYPE, LINEAR_FMA, DECL_ENTRY, EXT, NULL, setup_linear) \
331 SWS_FOR_STRUCT(TYPE, DITHER, DECL_ENTRY, EXT, NULL, setup_dither) \
334 #define REF_OPS_COMMON(EXT, TYPE) \
335 SWS_FOR(TYPE, READ_PACKED, REF_ENTRY, EXT) \
336 SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY, EXT) \
337 SWS_FOR(TYPE, READ_BIT, REF_ENTRY, EXT) \
338 SWS_FOR(TYPE, READ_PALETTE, REF_ENTRY, EXT) \
339 SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY, EXT) \
340 SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY, EXT) \
341 SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY, EXT) \
342 SWS_FOR(TYPE, SWAP_BYTES, REF_ENTRY, EXT) \
343 SWS_FOR(TYPE, EXPAND_BIT, REF_ENTRY, EXT) \
344 SWS_FOR(TYPE, MOVE, REF_ENTRY, EXT) \
345 SWS_FOR(TYPE, SCALE, REF_ENTRY, EXT) \
346 SWS_FOR(TYPE, ADD, REF_ENTRY, EXT) \
347 SWS_FOR(TYPE, MIN, REF_ENTRY, EXT) \
348 SWS_FOR(TYPE, MAX, REF_ENTRY, EXT) \
349 SWS_FOR(TYPE, UNPACK, REF_ENTRY, EXT) \
350 SWS_FOR(TYPE, PACK, REF_ENTRY, EXT) \
351 SWS_FOR(TYPE, LSHIFT, REF_ENTRY, EXT) \
352 SWS_FOR(TYPE, RSHIFT, REF_ENTRY, EXT) \
353 SWS_FOR(TYPE, LINEAR_FMA, REF_ENTRY, EXT) \
354 SWS_FOR(TYPE, DITHER, REF_ENTRY, EXT) \
357 #define DECL_TABLE_U8(EXT, SIZE, FLAG) \
358 DECL_OPS_COMMON(EXT, U8) \
359 SWS_FOR_STRUCT(U8, READ_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
360 SWS_FOR_STRUCT(U8, WRITE_PLANAR, DECL_ENTRY, EXT, NULL, NULL) \
361 SWS_FOR_STRUCT(U8, CLEAR, DECL_ENTRY, EXT, NULL, setup_clear) \
363 static const SwsUOpTable uops_u8##EXT = { \
364 .cpu_flags = AV_CPU_FLAG_##FLAG, \
365 .block_size = SIZE, \
367 REF_OPS_COMMON(EXT, U8) \
368 SWS_FOR(U8, READ_PLANAR, REF_ENTRY, EXT) \
369 SWS_FOR(U8, WRITE_PLANAR, REF_ENTRY, EXT) \
370 SWS_FOR(U8, CLEAR, REF_ENTRY, EXT) \
375 #define DECL_TABLE_U16(EXT, SIZE, FLAG) \
376 DECL_OPS_COMMON(EXT, U16) \
377 SWS_FOR_STRUCT(U8, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
378 SWS_FOR_STRUCT(U16, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
379 SWS_FOR_STRUCT(U8, EXPAND_PAIR, DECL_ENTRY, EXT, NULL, NULL) \
381 static const SwsUOpTable uops_u16##EXT = { \
382 .cpu_flags = AV_CPU_FLAG_##FLAG, \
383 .block_size = SIZE, \
385 REF_OPS_COMMON(EXT, U16) \
386 SWS_FOR(U8, TO_U16, REF_ENTRY, EXT) \
387 SWS_FOR(U16, TO_U8, REF_ENTRY, EXT) \
388 SWS_FOR(U8, EXPAND_PAIR, REF_ENTRY, EXT) \
393 #define DECL_TABLE_U32(EXT, SIZE, FLAG) \
394 DECL_OPS_COMMON(EXT, U32) \
395 SWS_FOR_STRUCT(U8, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
396 SWS_FOR_STRUCT(U32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
397 SWS_FOR_STRUCT(U16, TO_U32, DECL_ENTRY, EXT, NULL, NULL) \
398 SWS_FOR_STRUCT(U32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
399 SWS_FOR_STRUCT(U8, EXPAND_QUAD, DECL_ENTRY, EXT, NULL, NULL) \
401 static const SwsUOpTable uops_u32##EXT = { \
402 .cpu_flags = AV_CPU_FLAG_##FLAG, \
403 .block_size = SIZE, \
405 REF_OPS_COMMON(EXT, U32) \
406 SWS_FOR(U8, TO_U32, REF_ENTRY, EXT) \
407 SWS_FOR(U32, TO_U8, REF_ENTRY, EXT) \
408 SWS_FOR(U16, TO_U32, REF_ENTRY, EXT) \
409 SWS_FOR(U32, TO_U16, REF_ENTRY, EXT) \
410 SWS_FOR(U8, EXPAND_QUAD, REF_ENTRY, EXT) \
415 #define DECL_TABLE_F32(EXT, SIZE, FLAG) \
416 DECL_OPS_COMMON(EXT, F32) \
417 SWS_FOR_STRUCT(U8, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
418 SWS_FOR_STRUCT(F32, TO_U8, DECL_ENTRY, EXT, NULL, NULL) \
419 SWS_FOR_STRUCT(U16, TO_F32, DECL_ENTRY, EXT, NULL, NULL) \
420 SWS_FOR_STRUCT(F32, TO_U16, DECL_ENTRY, EXT, NULL, NULL) \
421 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
422 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
423 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, EXT, NULL, setup_filter_h) \
424 SWS_FOR_STRUCT(U8, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
425 check_filter_h_4x4, setup_filter_h_4x4) \
426 SWS_FOR_STRUCT(U16, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
427 check_filter_h_4x4, setup_filter_h_4x4) \
428 SWS_FOR_STRUCT(F32, READ_PLANAR_FH, DECL_ENTRY, _4x4##EXT, \
429 check_filter_h_4x4, setup_filter_h_4x4) \
430 SWS_FOR_STRUCT(U8, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
431 SWS_FOR_STRUCT(U16, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
432 SWS_FOR_STRUCT(F32, READ_PLANAR_FV, DECL_ENTRY, EXT, NULL, setup_filter_v) \
433 SWS_FOR_STRUCT(U8, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
434 SWS_FOR_STRUCT(U16, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
435 SWS_FOR_STRUCT(F32, READ_PLANAR_FV_FMA, DECL_ENTRY, EXT, NULL, setup_filter_v) \
437 static const SwsUOpTable uops_f32##EXT = { \
438 .cpu_flags = AV_CPU_FLAG_##FLAG, \
439 .block_size = SIZE, \
441 REF_OPS_COMMON(EXT, F32) \
442 SWS_FOR(U8, TO_F32, REF_ENTRY, EXT) \
443 SWS_FOR(F32, TO_U8, REF_ENTRY, EXT) \
444 SWS_FOR(U16, TO_F32, REF_ENTRY, EXT) \
445 SWS_FOR(F32, TO_U16, REF_ENTRY, EXT) \
446 SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
447 SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
448 SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, _4x4##EXT) \
449 SWS_FOR(U8, READ_PLANAR_FH, REF_ENTRY, EXT) \
450 SWS_FOR(U16, READ_PLANAR_FH, REF_ENTRY, EXT) \
451 SWS_FOR(F32, READ_PLANAR_FH, REF_ENTRY, EXT) \
452 SWS_FOR(U8, READ_PLANAR_FV, REF_ENTRY, EXT) \
453 SWS_FOR(U16, READ_PLANAR_FV, REF_ENTRY, EXT) \
454 SWS_FOR(F32, READ_PLANAR_FV, REF_ENTRY, EXT) \
455 SWS_FOR(U8, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
456 SWS_FOR(U16, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
457 SWS_FOR(F32, READ_PLANAR_FV_FMA, REF_ENTRY, EXT) \
487 static int movsize(
const int bytes,
const int mmsize)
489 return bytes <= 4 ? 4 :
510 const int num_lanes = mmsize / 16;
518 .block_size = pixels * num_lanes,
519 .over_read = {
movsize(in_total, mmsize) - in_total },
520 .over_write = {
movsize(out_total, mmsize) - out_total },
529 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
531 SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
532 if (in_total == IN && out_total == OUT) \
533 out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
572 for (
int i = 0;
i < 4;
i++)
622 int op_block_size =
out->block_size;
632 op_block_size, chain);
641 switch (
FFMAX(read_planes, write_planes)) {
642 case 1:
out->func = ff_sws_process1_x86;
break;
643 case 2:
out->func = ff_sws_process2_x86;
break;
644 case 3:
out->func = ff_sws_process3_x86;
break;
645 case 4:
out->func = ff_sws_process4_x86;
break;
static int hscale_sizeof_weight(const SwsUOp *uop)
int ff_sws_rw_op_planes(const SwsOp *op)
Return the number of planes involved in a read/write operation.
static uint32_t expand32(const SwsPixelType type, const SwsPixel value)
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
const SwsOpBackend backend_x86
static void normalize_clear(SwsUOp *uop)
#define EXTERNAL_FMA3(flags)
#define u(width, name, range_min, range_max)
Represents a computed filter kernel.
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
void(* filter)(uint8_t *src, int stride, int qscale)
Copyright (C) 2025 Niklas Haas.
static void ff_op_priv_unref(SwsOpPriv *priv)
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
int ff_sws_pixel_type_size(SwsPixelType type)
#define DECL_TABLE_F32(EXT, SIZE, FLAG)
@ SWS_BACKEND_X86
Chained x86 SIMD kernels.
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
const SwsUOpTable * table
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf type
#define SWS_COMP_ELEMS(N)
#define DECL_TABLE_U8(EXT, SIZE, FLAG)
#define FF_ARRAY_ELEMS(a)
SwsFilterWeights * kernel
SwsOpChain * ff_sws_op_chain_alloc(void)
int flags
Flags modifying the (de)muxer behaviour.
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
static AVFormatContext * ctx
#define EXTERNAL_AVX2(flags)
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
#define DECL_TABLE_U16(EXT, SIZE, FLAG)
static int compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
Compiled "chain" of operations, which can be dispatched efficiently.
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
static bool uop_is_type_invariant(const SwsUOpType uop)
static bool check_filter_h_4x4(const SwsImplParams *params)
#define DECL_TABLE_U32(EXT, SIZE, FLAG)
#define EXTERNAL_AVX512(flags)
SWS_DECL_FUNC(ff_sws_process1_x86)
static const SwsUOpTable *const tables[]
#define i(width, name, range_min, range_max)
static int movsize(const int bytes, const int mmsize)
void * av_refstruct_ref(void *obj)
Create a new reference to an object managed via this API, i.e.
#define EXTERNAL_SSE4(flags)
SwsUOpList * ff_sws_uop_list_alloc(void)
void ff_sws_op_chain_free_cb(void *ptr)
static int setup_filter_h_4x4(const SwsImplParams *params, SwsImplResult *out)
static void ff_sws_op_chain_free(SwsOpChain *chain)
static const int weights[]
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default value
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
void * av_calloc(size_t nmemb, size_t size)
static void ff_op_priv_free(SwsOpPriv *priv)
int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList *ops, SwsUOpFlags flags, SwsUOpList *uops)
Translate a list of operations down to micro-ops, which can be further optimized and then directly ex...
void ff_sws_uop_list_free(SwsUOpList **p_ops)
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
static int setup_rw_packed(const SwsImplParams *params, SwsImplResult *out)
Copyright (C) 2025-2026 Niklas Haas.
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
static int setup_scale(const SwsImplParams *params, SwsImplResult *out)
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
int ff_sws_uop_lookup(SwsContext *ctx, const SwsUOpTable *const tables[], int num_tables, const SwsUOp *uop, const int block_size, SwsOpChain *chain)
"Compile" a single uop by looking it up in a list of fixed size uop tables, in decreasing order of pr...
#define xi(width, name, var, range_min, range_max, subs,...)
Helper struct for representing a list of operations.
Main external API structure.
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)