FFmpeg
ops_asmgen.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2026 Ramiro Polla
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <assert.h>
22 #include <limits.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 
28 #ifdef _WIN32
29 #include <io.h>
30 #include <fcntl.h>
31 #endif
32 
33 /**
34  * This file is compiled as a standalone build-time tool and must not depend
35  * on internal FFmpeg libraries. The necessary utils are redefined below using
36  * standard C equivalents.
37  */
38 
39 #define AVUTIL_AVASSERT_H
40 #define AVUTIL_LOG_H
41 #define AVUTIL_MACROS_H
42 #define AVUTIL_MEM_H
43 #define av_assert0(cond) assert(cond)
44 #define av_malloc(s) malloc(s)
45 #define av_mallocz(s) calloc(1, s)
46 #define av_realloc(p, s) realloc(p, s)
47 #define av_strdup(s) strdup(s)
48 #define av_free(p) free(p)
49 #define FFMAX(a,b) ((a) > (b) ? (a) : (b))
50 #define FFMIN(a,b) ((a) > (b) ? (b) : (a))
51 
/**
 * Free the allocation referenced by a pointer variable and reset that
 * variable to NULL.
 *
 * Stand-in for libavutil's av_freep(), implemented with free() so that this
 * build-time tool does not depend on libavutil.
 *
 * @param ptr address of a pointer variable (declared void * so that the
 *            address of any object-pointer type is accepted). May be NULL,
 *            in which case nothing happens.
 */
static void av_freep(void *ptr)
{
    void *val;

    if (!ptr)
        return;

    /* The caller's variable can be of any object-pointer type. Reading and
     * clearing it through a void ** lvalue would violate the aliasing rules,
     * so the pointer value is moved with memcpy() instead. */
    memcpy(&val, ptr, sizeof(val));
    memcpy(ptr, &(void *){ NULL }, sizeof(val));

    /* free(NULL) is a no-op, so no extra check is needed. */
    free(val);
}
62 
63 #include "libavutil/dynarray.h"
64 
65 static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
66  const uint8_t *elem_data)
67 {
68  uint8_t *tab_elem_data = NULL;
69 
70  FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, {
71  tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size;
72  if (elem_data)
73  memcpy(tab_elem_data, elem_data, elem_size);
74  }, {
75  av_freep(tab_ptr);
76  *nb_ptr = 0;
77  });
78  return tab_elem_data;
79 }
80 
81 /*********************************************************************/
82 #include "rasm.c"
83 #include "rasm_print.c"
84 #include "ops_impl.c"
85 
86 /**
87  * Implementation parameters for all exported functions. This list is
88  * compiled by performing a dummy run of all conversions in sws_ops and
89  * collecting all functions that need to be generated. This is achieved
90  * by running:
91  * make sws_ops_entries_aarch64
92  */
94 #include "ops_entries.c"
95  { .op = AARCH64_SWS_OP_NONE }
96 };
97 
98 /*********************************************************************/
100 {
101  switch (fmt) {
102  case AARCH64_PIXEL_U8: return 1;
103  case AARCH64_PIXEL_U16: return 2;
104  case AARCH64_PIXEL_U32: return 4;
105  case AARCH64_PIXEL_F32: return 4;
106  default:
107  av_assert0(!"Invalid pixel type!");
108  break;
109  }
110  return 0;
111 }
112 
113 static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
114 {
115  buf_appendf(buf, size, "ff_sws");
116  const ParamField **fields = op_fields[params->op];
117  for (int i = 0; fields[i]; i++) {
118  const ParamField *field = fields[i];
119  void *p = (void *) (((uintptr_t) params) + field->offset);
120  field->print_str(buf, size, p);
121  }
122  buf_appendf(buf, size, "_neon");
123 }
124 
125 void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
126 {
127  impl_func_name(&buf, &size, params);
128  av_assert0(size && "string buffer exhausted");
129 }
130 
131 /*********************************************************************/
132 typedef struct SwsAArch64Context {
134 
135  /* SwsOpFunc arguments. */
142 
143  /* Loop iterator variables. */
146 
147  /* Scratch registers. */
150 
151  /* CPS-related variables. */
156 
157  /* Vector registers. Two banks (low and high) are used. */
158  RasmOp vl[ 4];
159  RasmOp vh[ 4];
160  RasmOp vt[12];
161 
162  /* Read/Write data pointers and padding. */
163  RasmOp in[4];
167 
168  /* Vector register dimensions. */
169  size_t el_size;
170  size_t el_count;
171  size_t vec_size;
172  bool use_vh;
174 
175 /*********************************************************************/
/* Helper functions. */
177 
/*
 * Looping when s->use_vh is set.
 *
 * Each macro expands to an unbraced `if` in front of the matching LOOP*
 * macro, so a use must not be followed directly by an `else`.
 * The context argument is parenthesized so that any pointer expression
 * (not just a plain identifier) can be passed.
 */
#define LOOP_VH(s, mask, idx)       if ((s)->use_vh) LOOP(mask, idx)
#define LOOP_MASK_VH(s, p, idx)     if ((s)->use_vh) LOOP_MASK(p, idx)
#define LOOP_MASK_BWD_VH(s, p, idx) if ((s)->use_vh) LOOP_MASK_BWD(p, idx)
182 
183 /* Inline rasm comments. */
184 #define CMT(comment) rasm_annotate(r, comment)
185 #define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__)
186 
187 /* Reshape all vector registers for current SwsOp. */
188 static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
189 {
190  s->vl[ 0] = a64op_make_vec( 0, el_count, el_size);
191  s->vl[ 1] = a64op_make_vec( 1, el_count, el_size);
192  s->vl[ 2] = a64op_make_vec( 2, el_count, el_size);
193  s->vl[ 3] = a64op_make_vec( 3, el_count, el_size);
194  s->vh[ 0] = a64op_make_vec( 4, el_count, el_size);
195  s->vh[ 1] = a64op_make_vec( 5, el_count, el_size);
196  s->vh[ 2] = a64op_make_vec( 6, el_count, el_size);
197  s->vh[ 3] = a64op_make_vec( 7, el_count, el_size);
198  s->vt[ 0] = a64op_make_vec(16, el_count, el_size);
199  s->vt[ 1] = a64op_make_vec(17, el_count, el_size);
200  s->vt[ 2] = a64op_make_vec(18, el_count, el_size);
201  s->vt[ 3] = a64op_make_vec(19, el_count, el_size);
202  s->vt[ 4] = a64op_make_vec(20, el_count, el_size);
203  s->vt[ 5] = a64op_make_vec(21, el_count, el_size);
204  s->vt[ 6] = a64op_make_vec(22, el_count, el_size);
205  s->vt[ 7] = a64op_make_vec(23, el_count, el_size);
206  s->vt[ 8] = a64op_make_vec(24, el_count, el_size);
207  s->vt[ 9] = a64op_make_vec(25, el_count, el_size);
208  s->vt[10] = a64op_make_vec(26, el_count, el_size);
209  s->vt[11] = a64op_make_vec(27, el_count, el_size);
210 }
211 
212 /*********************************************************************/
213 /* Function frame */
214 
/**
 * Number of stack bytes used to save n registers.
 *
 * Registers are counted in pairs of 16 bytes each; an odd count is rounded
 * up to the next full pair.
 *
 * @param n number of registers to save
 * @return frame size in bytes (0 when n is 0)
 */
static unsigned clobbered_frame_size(unsigned n)
{
    const unsigned pairs = n / 2 + (n & 1);

    return pairs * 16;
}
219 
220 static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
221 {
222  RasmContext *r = s->rctx;
223  RasmOp sp = a64op_sp();
224  unsigned frame_size = clobbered_frame_size(n);
225  RasmOp sp_pre = a64op_pre(sp, -frame_size);
226 
227  rasm_add_comment(r, "prologue");
228  if (n == 0) {
229  /* no-op */
230  } else if (n == 1) {
231  i_str(r, regs[0], sp_pre);
232  } else {
233  i_stp(r, regs[0], regs[1], sp_pre);
234  for (unsigned i = 2; i + 1 < n; i += 2)
235  i_stp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
236  if (n & 1)
237  i_str(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
238  }
239 }
240 
241 static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
242 {
243  RasmContext *r = s->rctx;
244  RasmOp sp = a64op_sp();
245  unsigned frame_size = clobbered_frame_size(n);
246  RasmOp sp_post = a64op_post(sp, frame_size);
247 
248  rasm_add_comment(r, "epilogue");
249  if (n == 0) {
250  /* no-op */
251  } else if (n == 1) {
252  i_ldr(r, regs[0], sp_post);
253  } else {
254  if (n & 1)
255  i_ldr(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
256  for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2)
257  i_ldp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
258  i_ldp(r, regs[0], regs[1], sp_post);
259  }
260 }
261 
262 /*********************************************************************/
263 /* Callee-saved registers (r19-r28, fp, and lr). */
264 #define MAX_SAVED_REGS 12
265 
266 static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
267  RasmOp gpr)
268 {
269  const int n = a64op_gpr_n(gpr);
270  if (n >= 19 && n <= 30)
271  regs[(*count)++] = gpr;
272 }
273 
274 static unsigned clobbered_gprs(const SwsAArch64Context *s,
276  RasmOp regs[MAX_SAVED_REGS])
277 {
278  unsigned count = 0;
279  clobber_gpr(regs, &count, a64op_lr());
280  LOOP(mask, i) {
281  clobber_gpr(regs, &count, s->in[i]);
282  clobber_gpr(regs, &count, s->out[i]);
283  clobber_gpr(regs, &count, s->in_bump[i]);
284  clobber_gpr(regs, &count, s->out_bump[i]);
285  }
286  return count;
287 }
288 
290 {
291  RasmContext *r = s->rctx;
292  char func_name[128];
293  char buf[64];
294 
295  /**
296  * The process function for aarch64 works similarly to the x86 backend.
297  * The description in x86/ops_include.asm mostly holds as well here.
298  */
299 
300  snprintf(func_name, sizeof(func_name), "ff_sws_process_%04x_neon", mask);
301 
302  rasm_func_begin(r, func_name, true, false);
303 
304  /* Function prologue */
305  RasmOp saved_regs[MAX_SAVED_REGS];
306  unsigned nsaved = clobbered_gprs(s, mask, saved_regs);
307  if (nsaved)
308  asmgen_prologue(s, saved_regs, nsaved);
309 
310  /* Load values from impl. */
311  i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr op0_func = impl->cont;");
312  i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl)); CMT("SwsOpImpl *op1_impl = impl + 1;");
313 
314  /* Load values from exec. */
315  LOOP(mask, i) {
316  rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, i);
317  i_ldr(r, s->in[i], a64op_off(s->exec, offsetof_exec_in + (i * sizeof(uint8_t *))));
318  }
319  LOOP(mask, i) {
320  rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", i, i);
321  i_ldr(r, s->out[i], a64op_off(s->exec, offsetof_exec_out + (i * sizeof(uint8_t *))));
322  }
323  LOOP(mask, i) {
324  rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = exec->in_bump[%u];", i, i);
325  i_ldr(r, s->in_bump[i], a64op_off(s->exec, offsetof_exec_in_bump + (i * sizeof(ptrdiff_t))));
326  }
327  LOOP(mask, i) {
328  rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = exec->out_bump[%u];", i, i);
329  i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
330  }
331 
332  int first_row = rasm_new_label(r, NULL);
333  int next_row = rasm_new_label(r, NULL);
334  int next_block = rasm_new_label(r, NULL);
335 
336  /* Jump to first row (skips padding). */
337  i_b (r, rasm_op_label(first_row)); CMT("goto first_row;");
338 
339  /* Perform padding, preparing for next row. */
340  rasm_add_label(r, next_row); CMT("next_row:");
341  LOOP(mask, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
342  LOOP(mask, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
343 
344  /* First row (reset x). */
345  rasm_add_label(r, first_row); CMT("first_row:");
346  i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
347 
348  /* Reset impl and call first kernel. */
349  rasm_add_label(r, next_block); CMT("next_block:");
350  i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
351  i_blr(r, s->op0_func); CMT("op0_func();");
352 
353  /* Perform horizontal loop. */
354  i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
355  i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
356  i_bne(r, next_block); CMT(" goto next_block;");
357 
358  /* Perform vertical loop. */
359  i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
360  i_cmp(r, s->y, s->y_end); CMT("if (y != y_end)");
361  i_bne(r, next_row); CMT(" goto next_row;");
362 
363  /* Function epilogue */
364  if (nsaved)
365  asmgen_epilogue(s, saved_regs, nsaved);
366 
367  i_ret(r);
368 }
369 
370 /*********************************************************************/
371 /**
372  * Set node where the continuation address will be loaded and impl will
373  * be incremented. This should be done right after impl->priv has been
374  * used.
375  */
377 {
378  RasmContext *r = s->rctx;
379  s->load_cont_node = rasm_get_current_node(r);
380 }
381 
382 /*********************************************************************/
383 /* gather raw pixels from planes */
384 /* AARCH64_SWS_OP_READ_BIT */
385 /* AARCH64_SWS_OP_READ_NIBBLE */
386 /* AARCH64_SWS_OP_READ_PACKED */
387 /* AARCH64_SWS_OP_READ_PLANAR */
388 
390 {
391  RasmContext *r = s->rctx;
392  RasmOp bitmask_vec = s->vt[1];
393  RasmOp wtmp = a64op_w(s->tmp0);
394  AArch64VecViews vl[1];
395  AArch64VecViews vtmp;
396  AArch64VecViews shift_vec;
397 
398  a64op_vec_views(s->vt[0], &shift_vec);
399  a64op_vec_views(s->vl[0], &vl[0]);
400  a64op_vec_views(s->vt[2], &vtmp);
401 
402  /* Note that shift_vec has negative values, so that using it with
403  * ushl actually performs a right shift. */
404  rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
405  i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
407 
408  if (p->block_size == 16) {
409  i_ldrh(r, wtmp, a64op_post(s->in[0], 2)); CMT("uint16_t tmp = *in[0]++;");
410  i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 16 times>};");
411  i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
412  i_lsr (r, wtmp, wtmp, IMM(8)); CMT("tmp >>= 8;");
413  i_dup (r, vtmp.b8, wtmp); CMT("vtmp.lo = broadcast(tmp);");
414  i_ins (r, vl[0].de[1], vtmp.de[0]); CMT("vl[0].hi = vtmp.lo;");
415  i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
416  i_and (r, vl[0].b16, vl[0].b16, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
417  } else {
418  i_ldrb(r, wtmp, a64op_post(s->in[0], 1)); CMT("uint8_t tmp = *in[0]++;");
419  i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};");
420  i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
421  i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
422  i_and (r, vl[0].b8, vl[0].b8, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
423  }
424 }
425 
427 {
428  RasmContext *r = s->rctx;
429  RasmOp nibble_mask = v_8b(s->vt[0]);
430  AArch64VecViews vl[1];
431  AArch64VecViews vtmp;
432 
433  a64op_vec_views(s->vl[0], &vl[0]);
434  a64op_vec_views(s->vt[1], &vtmp);
435 
436  rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 <repeats 8 times>};");
437  i_movi(r, nibble_mask, IMM(0x0f));
438 
439  if (p->block_size == 8) {
440  i_ldr (r, vl[0].s, a64op_post(s->in[0], 4)); CMT("vl[0] = *in[0]++;");
441  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
442  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
443  i_zip1(r, vl[0].b8, vtmp.b8, vl[0].b8); CMT("interleave");
444  } else {
445  i_ldr (r, vl[0].d, a64op_post(s->in[0], 8)); CMT("vl[0] = *in[0]++;");
446  i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
447  i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
448  i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16); CMT("interleave");
449  }
450 }
451 
453 {
454  RasmContext *r = s->rctx;
455  AArch64VecViews vl[1];
456  AArch64VecViews vh[1];
457 
458  a64op_vec_views(s->vl[0], &vl[0]);
459  a64op_vec_views(s->vh[0], &vh[0]);
460 
461  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
462  case 0x008: i_ldr(r, vl[0].d, a64op_post(s->in[0], s->vec_size * 1)); break;
463  case 0x010: i_ldr(r, vl[0].q, a64op_post(s->in[0], s->vec_size * 1)); break;
464  case 0x108: i_ldp(r, vl[0].d, vh[0].d, a64op_post(s->in[0], s->vec_size * 2)); break;
465  case 0x110: i_ldp(r, vl[0].q, vh[0].q, a64op_post(s->in[0], s->vec_size * 2)); break;
466  }
467 }
468 
470 {
471  RasmContext *r = s->rctx;
472 
473  switch (p->mask) {
474  case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]), a64op_post(s->in[0], s->vec_size * 2)); break;
475  case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->in[0], s->vec_size * 3)); break;
476  case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->in[0], s->vec_size * 4)); break;
477  }
478 }
479 
481 {
482  if (p->mask == 0x0001) {
484  } else {
485  asmgen_op_read_packed_n(s, p, s->vl);
486  if (s->use_vh)
487  asmgen_op_read_packed_n(s, p, s->vh);
488  }
489 }
490 
492 {
493  RasmContext *r = s->rctx;
494  AArch64VecViews vl[4];
495  AArch64VecViews vh[4];
496 
497  for (int i = 0; i < 4; i++) {
498  a64op_vec_views(s->vl[i], &vl[i]);
499  a64op_vec_views(s->vh[i], &vh[i]);
500  }
501 
502  LOOP_MASK(p, i) {
503  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
504  case 0x008: i_ldr(r, vl[i].d, a64op_post(s->in[i], s->vec_size * 1)); break;
505  case 0x010: i_ldr(r, vl[i].q, a64op_post(s->in[i], s->vec_size * 1)); break;
506  case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], s->vec_size * 2)); break;
507  case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], s->vec_size * 2)); break;
508  }
509  }
510 }
511 
512 /*********************************************************************/
513 /* write raw pixels to planes */
514 /* AARCH64_SWS_OP_WRITE_BIT */
515 /* AARCH64_SWS_OP_WRITE_NIBBLE */
516 /* AARCH64_SWS_OP_WRITE_PACKED */
517 /* AARCH64_SWS_OP_WRITE_PLANAR */
518 
520 {
521  RasmContext *r = s->rctx;
522  AArch64VecViews vl[1];
523  AArch64VecViews shift_vec;
524  AArch64VecViews vtmp0;
525  AArch64VecViews vtmp1;
526 
527  a64op_vec_views(s->vl[0], &vl[0]);
528  a64op_vec_views(s->vt[0], &shift_vec);
529  a64op_vec_views(s->vt[1], &vtmp0);
530  a64op_vec_views(s->vt[2], &vtmp1);
531 
532  rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
533  i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
535 
536  if (p->block_size == 8) {
537  i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
538  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
539  i_str (r, vtmp0.b, a64op_post(s->out[0], 1)); CMT("*out[0]++ = vtmp0;");
540  } else {
541  i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
542  i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
543  i_ins (r, vtmp1.de[0], vl[0].de[1]); CMT("vtmp1.lo = vl[0].hi;");
544  i_addv(r, vtmp1.b, vtmp1.b8); CMT("vtmp1[0] = add_across(vtmp1);");
545  i_ins (r, vtmp0.be[1], vtmp1.be[0]); CMT("vtmp0[1] = vtmp1[0];");
546  i_str (r, vtmp0.h, a64op_post(s->out[0], 2)); CMT("*out[0]++ = vtmp0;");
547  }
548 }
549 
551 {
552  RasmContext *r = s->rctx;
553  AArch64VecViews vl[4];
554  AArch64VecViews vtmp0;
555  AArch64VecViews vtmp1;
556 
557  for (int i = 0; i < 4; i++)
558  a64op_vec_views(s->vl[i], &vl[i]);
559  a64op_vec_views(s->vt[0], &vtmp0);
560  a64op_vec_views(s->vt[1], &vtmp1);
561 
562  if (p->block_size == 8) {
563  i_shl (r, vtmp0.h4, vl[0].h4, IMM(4));
564  i_ushr(r, vtmp1.h4, vl[0].h4, IMM(8));
565  i_orr (r, vl[0].b8, vtmp0.b8, vtmp1.b8);
566  i_xtn (r, vtmp0.b8, vl[0].h8);
567  i_str (r, vtmp0.s, a64op_post(s->out[0], 4));
568  } else {
569  i_shl (r, vtmp0.h8, vl[0].h8, IMM(4));
570  i_ushr(r, vtmp1.h8, vl[0].h8, IMM(8));
571  i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16);
572  i_xtn (r, vtmp0.b8, vl[0].h8);
573  i_str (r, vtmp0.d, a64op_post(s->out[0], 8));
574  }
575 }
576 
578 {
579  RasmContext *r = s->rctx;
580  AArch64VecViews vl[1];
581  AArch64VecViews vh[1];
582 
583  a64op_vec_views(s->vl[0], &vl[0]);
584  a64op_vec_views(s->vh[0], &vh[0]);
585 
586  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
587  case 0x008: i_str(r, vl[0].d, a64op_post(s->out[0], s->vec_size * 1)); break;
588  case 0x010: i_str(r, vl[0].q, a64op_post(s->out[0], s->vec_size * 1)); break;
589  case 0x108: i_stp(r, vl[0].d, vh[0].d, a64op_post(s->out[0], s->vec_size * 2)); break;
590  case 0x110: i_stp(r, vl[0].q, vh[0].q, a64op_post(s->out[0], s->vec_size * 2)); break;
591  }
592 }
593 
595 {
596  RasmContext *r = s->rctx;
597 
598  switch (p->mask) {
599  case 0x0011: i_st2(r, vv_2(vx[0], vx[1]), a64op_post(s->out[0], s->vec_size * 2)); break;
600  case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->out[0], s->vec_size * 3)); break;
601  case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->out[0], s->vec_size * 4)); break;
602  }
603 }
604 
606 {
607  if (p->mask == 0x0001) {
609  } else {
610  asmgen_op_write_packed_n(s, p, s->vl);
611  if (s->use_vh)
612  asmgen_op_write_packed_n(s, p, s->vh);
613  }
614 }
615 
617 {
618  RasmContext *r = s->rctx;
619  AArch64VecViews vl[4];
620  AArch64VecViews vh[4];
621 
622  for (int i = 0; i < 4; i++) {
623  a64op_vec_views(s->vl[i], &vl[i]);
624  a64op_vec_views(s->vh[i], &vh[i]);
625  }
626 
627  LOOP_MASK(p, i) {
628  switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
629  case 0x008: i_str(r, vl[i].d, a64op_post(s->out[i], s->vec_size * 1)); break;
630  case 0x010: i_str(r, vl[i].q, a64op_post(s->out[i], s->vec_size * 1)); break;
631  case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], s->vec_size * 2)); break;
632  case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], s->vec_size * 2)); break;
633  }
634  }
635 }
636 
637 /*********************************************************************/
638 /* swap byte order (for differing endianness) */
639 /* AARCH64_SWS_OP_SWAP_BYTES */
640 
642 {
643  RasmContext *r = s->rctx;
644  AArch64VecViews vl[4];
645  AArch64VecViews vh[4];
646 
647  for (int i = 0; i < 4; i++) {
648  a64op_vec_views(s->vl[i], &vl[i]);
649  a64op_vec_views(s->vh[i], &vh[i]);
650  }
651 
652  switch (aarch64_pixel_size(p->type)) {
653  case sizeof(uint16_t):
654  LOOP_MASK (p, i) i_rev16(r, vl[i].b16, vl[i].b16);
655  LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16);
656  break;
657  case sizeof(uint32_t):
658  LOOP_MASK (p, i) i_rev32(r, vl[i].b16, vl[i].b16);
659  LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16);
660  break;
661  }
662 }
663 
664 /*********************************************************************/
665 /* rearrange channel order, or duplicate channels */
666 /* AARCH64_SWS_OP_SWIZZLE */
667 
/* Pseudo component index that designates the temporary vector. */
#define SWIZZLE_TMP 0xf

/**
 * Format the display name of a swizzle operand.
 *
 * @param buf destination buffer of 8 bytes
 * @param n   component index, or SWIZZLE_TMP for the temporary
 * @param vh  non-zero for the high bank ('h'), zero for the low bank ('l')
 * @return buf, holding "vtmpl"/"vtmph" for the temporary, or "vl[n]"/"vh[n]"
 *         otherwise
 */
static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
{
    const char bank = vh ? 'h' : 'l';

    if (n != SWIZZLE_TMP)
        snprintf(buf, sizeof(char[8]), "v%c[%u]", bank, n);
    else
        snprintf(buf, sizeof(char[8]), "vtmp%c", bank);

    return buf;
}
/* Same as print_swizzle_v(), formatting into an unnamed 8-byte buffer. */
#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh)
679 
680 static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
681 {
682  if (n == SWIZZLE_TMP)
683  return s->vt[vh];
684  return vh ? s->vh[n] : s->vl[n];
685 }
686 
687 static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
688 {
689  RasmContext *r = s->rctx;
690  RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) };
691  RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) };
692 
693  i_mov (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 0), PRINT_SWIZZLE_V(src, 0));
694  if (s->use_vh) {
695  i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 1), PRINT_SWIZZLE_V(src, 1));
696  }
697 }
698 
700 {
701  /* Compute used vectors (src and dst) */
702  uint8_t src_used[4] = { 0 };
703  bool done[4] = { true, true, true, true };
704  LOOP_MASK(p, dst) {
705  uint8_t src = MASK_GET(p->swizzle, dst);
706  src_used[src]++;
707  done[dst] = false;
708  }
709 
710  /* First perform unobstructed copies. */
711  for (bool progress = true; progress; ) {
712  progress = false;
713  for (int dst = 0; dst < 4; dst++) {
714  if (done[dst] || src_used[dst])
715  continue;
716  uint8_t src = MASK_GET(p->swizzle, dst);
717  swizzle_emit(s, dst, src);
718  src_used[src]--;
719  done[dst] = true;
720  progress = true;
721  }
722  }
723 
724  /* Then swap and rotate remaining operations. */
725  for (int dst = 0; dst < 4; dst++) {
726  if (done[dst])
727  continue;
728 
730 
731  uint8_t cur_dst = dst;
732  uint8_t src = MASK_GET(p->swizzle, cur_dst);
733  while (src != dst) {
734  swizzle_emit(s, cur_dst, src);
735  done[cur_dst] = true;
736  cur_dst = src;
737  src = MASK_GET(p->swizzle, cur_dst);
738  }
739 
740  swizzle_emit(s, cur_dst, SWIZZLE_TMP);
741  done[cur_dst] = true;
742  }
743 }
744 
745 #undef SWIZZLE_TMP
746 
747 /*********************************************************************/
748 /* split tightly packed data into components */
749 /* AARCH64_SWS_OP_UNPACK */
750 
752 {
753  RasmContext *r = s->rctx;
754  RasmOp *vl = s->vl;
755  RasmOp *vh = s->vh;
756  RasmOp *vt = s->vt;
757  RasmOp mask_gpr = a64op_w(s->tmp0);
758  uint32_t mask_val[4] = { 0 };
759  uint8_t mask_idx[4] = { 0 };
760  uint8_t cur_vt = 0;
761 
762  const int offsets[4] = {
763  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
764  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
765  MASK_GET(p->pack, 3),
766  0
767  };
768 
769  /* Generate masks. */
770  rasm_add_comment(r, "generate masks");
771  LOOP_MASK(p, i) {
772  uint32_t val = (1u << MASK_GET(p->pack, i)) - 1;
773  for (int j = 0; j < 4; j++) {
774  if (mask_val[j] == val) {
775  mask_val[i] = mask_val[j];
776  mask_idx[i] = mask_idx[j];
777  break;
778  }
779  }
780  if (!mask_val[i]) {
781  /**
782  * All-one values in movi only work up to 8-bit, and then
783  * at full 16- or 32-bit, but not for intermediate values
784  * like 10-bit. In those cases, we use mov + dup instead.
785  */
786  if (val <= 0xff || val == 0xffff) {
787  i_movi(r, vt[cur_vt], IMM(val));
788  } else {
789  i_mov (r, mask_gpr, IMM(val));
790  i_dup (r, vt[cur_vt], mask_gpr);
791  }
792  mask_val[i] = val;
793  mask_idx[i] = cur_vt++;
794  }
795  }
796 
797  /* Loop backwards to avoid clobbering component 0. */
798  LOOP_MASK_BWD (p, i) {
799  if (offsets[i]) {
800  i_ushr (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", i, offsets[i]);
801  } else if (i) {
802  i_mov16b(r, vl[i], vl[0]); CMTF("vl[%u] = vl[0];", i);
803  }
804  }
805  LOOP_MASK_BWD_VH(s, p, i) {
806  if (offsets[i]) {
807  i_ushr (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", i, offsets[i]);
808  } else if (i) {
809  i_mov16b(r, vh[i], vh[0]); CMTF("vh[%u] = vh[0];", i);
810  }
811  }
812 
813  /* Apply masks. */
814  reshape_all_vectors(s, 16, 1);
815  LOOP_MASK_BWD (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); }
816  LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); }
817 }
818 
819 /*********************************************************************/
820 /* compress components into tightly packed data */
821 /* AARCH64_SWS_OP_PACK */
822 
824 {
825  RasmContext *r = s->rctx;
826  RasmOp *vl = s->vl;
827  RasmOp *vh = s->vh;
828 
829  const int offsets[4] = {
830  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
831  MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
832  MASK_GET(p->pack, 3),
833  0
834  };
835  uint16_t offset_mask = 0;
836  LOOP_MASK(p, i) {
837  if (offsets[i])
838  MASK_SET(offset_mask, i, 1);
839  }
840 
841  /* Perform left shift. */
842  LOOP (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); }
843  LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); }
844 
845  /* Combine components. */
846  reshape_all_vectors(s, 16, 1);
847  LOOP_MASK (p, i) {
848  if (i != 0) {
849  i_orr (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i);
850  if (s->use_vh) {
851  i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i);
852  }
853  }
854  }
855 }
856 
857 /*********************************************************************/
858 /* logical left shift of raw pixel values */
859 /* AARCH64_SWS_OP_LSHIFT */
860 
862 {
863  RasmContext *r = s->rctx;
864  RasmOp *vl = s->vl;
865  RasmOp *vh = s->vh;
866 
867  LOOP_MASK (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] <<= %u;", i, p->shift); }
868  LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] <<= %u;", i, p->shift); }
869 }
870 
871 /*********************************************************************/
872 /* right shift of raw pixel values */
873 /* AARCH64_SWS_OP_RSHIFT */
874 
876 {
877  RasmContext *r = s->rctx;
878  RasmOp *vl = s->vl;
879  RasmOp *vh = s->vh;
880 
881  LOOP_MASK (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] >>= %u;", i, p->shift); }
882  LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] >>= %u;", i, p->shift); }
883 }
884 
885 /*********************************************************************/
886 /* clear pixel values */
887 /* AARCH64_SWS_OP_CLEAR */
888 
890 {
891  RasmContext *r = s->rctx;
892  RasmOp *vl = s->vl;
893  RasmOp *vh = s->vh;
894  RasmOp clear_vec = s->vt[0];
895 
896  /**
897  * TODO
898  * - pack elements in impl->priv and perform smaller loads
899  * - if only 1 element and not vh, load directly with ld1r
900  */
901 
902  i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 clear_vec = impl->priv.v128;");
904 
905  LOOP_MASK (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i)); CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
906  LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i)); CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
907 }
908 
909 /*********************************************************************/
910 /* convert (cast) between formats */
911 /* AARCH64_SWS_OP_CONVERT */
912 
914 {
915  RasmContext *r = s->rctx;
916  AArch64VecViews vl[4];
917  AArch64VecViews vh[4];
918 
919  /**
920  * Since each instruction in the convert operation needs specific
921  * element types, it is simpler to use arrangement specifiers for
922  * each operand instead of reshaping all vectors.
923  */
924 
925  for (int i = 0; i < 4; i++) {
926  a64op_vec_views(s->vl[i], &vl[i]);
927  a64op_vec_views(s->vh[i], &vh[i]);
928  }
929 
930  size_t src_el_size = s->el_size;
931  size_t dst_el_size = aarch64_pixel_size(p->to_type);
932 
933  /**
934  * This function assumes block_size is either 8 or 16, and that
935  * we're always using the most amount of vector registers possible.
936  * Therefore, u32 always uses the high vector bank.
937  */
938  if (p->type == AARCH64_PIXEL_F32) {
939  rasm_add_comment(r, "f32 -> u32");
940  LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4);
941  LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4);
942  }
943 
944  if (p->block_size == 8) {
945  if (src_el_size == 1 && dst_el_size > src_el_size) {
946  rasm_add_comment(r, "u8 -> u16");
947  LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
948  src_el_size = 2;
949  } else if (src_el_size == 4 && dst_el_size < src_el_size) {
950  rasm_add_comment(r, "u32 -> u16");
951  LOOP_MASK(p, i) i_xtn (r, vl[i].h4, vl[i].s4);
952  LOOP_MASK(p, i) i_xtn (r, vh[i].h4, vh[i].s4);
953  LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
954  src_el_size = 2;
955  }
956  if (src_el_size == 2 && dst_el_size == 4) {
957  rasm_add_comment(r, "u16 -> u32");
958  LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4, vl[i].h8);
959  LOOP_MASK(p, i) i_uxtl (r, vl[i].s4, vl[i].h4);
960  src_el_size = 4;
961  } else if (src_el_size == 2 && dst_el_size == 1) {
962  rasm_add_comment(r, "u16 -> u8");
963  LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
964  src_el_size = 1;
965  }
966  } else /* if (p->block_size == 16) */ {
967  if (src_el_size == 1 && dst_el_size == 2) {
968  rasm_add_comment(r, "u8 -> u16");
969  LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8, vl[i].b16);
970  LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
971  } else if (src_el_size == 2 && dst_el_size == 1) {
972  rasm_add_comment(r, "u16 -> u8");
973  LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
974  LOOP_MASK(p, i) i_xtn (r, vh[i].b8, vh[i].h8);
975  LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
976  }
977  }
978 
979  /* See comment above for high vector bank usage for u32. */
980  if (p->to_type == AARCH64_PIXEL_F32) {
981  rasm_add_comment(r, "u32 -> f32");
982  LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4);
983  LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4);
984  }
985 }
986 
987 /*********************************************************************/
988 /* expand integers to the full range */
989 /* AARCH64_SWS_OP_EXPAND */
990 
992 {
993  RasmContext *r = s->rctx;
994  RasmOp *vl = s->vl;
995  RasmOp *vh = s->vh;
996 
997  size_t src_el_size = s->el_size;
998  size_t dst_el_size = aarch64_pixel_size(p->to_type);
999  size_t dst_total_size = p->block_size * dst_el_size;
1000  size_t dst_vec_size = FFMIN(dst_total_size, 16);
1001 
1002  if (!s->use_vh)
1003  s->use_vh = (dst_vec_size != dst_total_size);
1004 
1005  if (src_el_size == 1) {
1006  rasm_add_comment(r, "u8 -> u16");
1007  reshape_all_vectors(s, 16, 1);
1008  LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
1009  LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
1010  }
1011  if (dst_el_size == 4) {
1012  rasm_add_comment(r, "u16 -> u32");
1013  reshape_all_vectors(s, 8, 2);
1014  LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
1015  LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
1016  }
1017 }
1018 
1019 /*********************************************************************/
1020 /* numeric minimum */
1021 /* AARCH64_SWS_OP_MIN */
1022 
1024 {
1025  RasmContext *r = s->rctx;
1026  RasmOp *vl = s->vl;
1027  RasmOp *vh = s->vh;
1028  RasmOp *vt = s->vt;
1029  RasmOp min_vec = s->vt[4];
1030 
1031  i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 min_vec = impl->priv.v128;");
1033  LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i)); CMTF("v128 vmin%u = min_vec[%u];", i, i); }
1034 
1035  if (p->type == AARCH64_PIXEL_F32) {
1036  LOOP_MASK (p, i) { i_fmin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
1037  LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
1038  } else {
1039  LOOP_MASK (p, i) { i_umin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
1040  LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
1041  }
1042 }
1043 
1044 /*********************************************************************/
1045 /* numeric maximum */
1046 /* AARCH64_SWS_OP_MAX */
1047 
1049 {
1050  RasmContext *r = s->rctx;
1051  RasmOp *vl = s->vl;
1052  RasmOp *vh = s->vh;
1053  RasmOp *vt = s->vt;
1054  RasmOp max_vec = s->vt[4];
1055 
1056  i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 max_vec = impl->priv.v128;");
1058  LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i)); CMTF("v128 vmax%u = max_vec[%u];", i, i); }
1059 
1060  if (p->type == AARCH64_PIXEL_F32) {
1061  LOOP_MASK (p, i) { i_fmax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
1062  LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
1063  } else {
1064  LOOP_MASK (p, i) { i_umax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
1065  LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
1066  }
1067 }
1068 
1069 /*********************************************************************/
1070 /* multiplication by scalar */
1071 /* AARCH64_SWS_OP_SCALE */
1072 
1074 {
1075  RasmContext *r = s->rctx;
1076  RasmOp *vl = s->vl;
1077  RasmOp *vh = s->vh;
1078  RasmOp priv_ptr = s->tmp0;
1079  RasmOp scale_vec = s->vt[0];
1080 
1081  i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv)); CMT("v128 *scale_vec_ptr = &impl->priv;");
1083  i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr)); CMT("v128 scale_vec = broadcast(*scale_vec_ptr);");
1084 
1085  if (p->type == AARCH64_PIXEL_F32) {
1086  LOOP_MASK (p, i) { i_fmul(r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
1087  LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
1088  } else {
1089  LOOP_MASK (p, i) { i_mul (r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
1090  LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
1091  }
1092 }
1093 
1094 /*********************************************************************/
1095 /* generalized linear affine transform */
1096 /* AARCH64_SWS_OP_LINEAR */
1097 
1098 /**
1099  * Performs one pass of the linear transform over a single vector bank
1100  * (low or high).
1101  */
1103  RasmOp *vt, RasmOp *vc,
1104  int save_mask, bool vh_pass)
1105 {
1106  RasmContext *r = s->rctx;
1107  /**
1108  * The intermediate registers for fmul+fadd (for when SWS_BITEXACT
1109  * is set) start from temp vector 4.
1110  */
1111  RasmOp *vtmp = &vt[4];
1112  RasmOp *vx = vh_pass ? s->vh : s->vl;
1113  char cvh = vh_pass ? 'h' : 'l';
1114 
1115  if (vh_pass && !s->use_vh)
1116  return;
1117 
1118  /**
1119  * Save rows that need to be used as input after they have been already
1120  * written to.
1121  */
1122  RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] };
1123  if (save_mask) {
1124  for (int i = 0; i < 4; i++) {
1125  if (MASK_GET(save_mask, i)) {
1126  src_vx[i] = vt[i];
1127  i_mov16b(r, vt[i], vx[i]); CMTF("vsrc[%u] = v%c[%u];", i, cvh, i);
1128  }
1129  }
1130  }
1131 
1132  /**
1133  * The non-zero coefficients have been packed in aarch64_setup_linear()
1134  * in sequential order into the individual lanes of the coefficient
1135  * vector registers. We must follow the same order of execution here.
1136  */
1137  int i_coeff = 0;
1138  LOOP_MASK(p, i) {
1139  bool first = true;
1140  RasmNode *pre_mul = rasm_get_current_node(r);
1141  for (int j = 0; j < 5; j++) {
1142  if (!LINEAR_MASK_GET(p->linear.mask, i, j))
1143  continue;
1144  bool is_offset = linear_index_is_offset(j);
1145  int src_j = linear_index_to_vx(j);
1146  RasmOp vsrc = src_vx[src_j];
1147  uint8_t vc_i = i_coeff / 4;
1148  uint8_t vc_j = i_coeff & 3;
1149  RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j);
1150  i_coeff++;
1151  if (first && is_offset) {
1152  i_dup (r, vx[i], vcoeff); CMTF("v%c[%u] = broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j);
1153  } else if (first && !is_offset) {
1154  if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) {
1155  i_mov16b(r, vx[i], vsrc); CMTF("v%c[%u] = vsrc[%u];", cvh, i, src_j);
1156  } else {
1157  i_fmul (r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] = vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1158  }
1159  } else if (!p->linear.fmla) {
1160  /**
1161  * Split the multiply-accumulate into fmul+fadd. All
1162  * multiplications are performed first into temporary
1163  * registers, and only then added to the destination,
1164  * to reduce the dependency chain.
1165  * There is no need to perform multiplications by 1.
1166  */
1167  if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) {
1168  pre_mul = rasm_set_current_node(r, pre_mul);
1169  i_fmul(r, vtmp[vc_j], vsrc, vcoeff); CMTF("vtmp[%u] = vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j);
1170  pre_mul = rasm_set_current_node(r, pre_mul);
1171  i_fadd(r, vx[i], vx[i], vtmp[vc_j]); CMTF("v%c[%u] += vtmp[%u];", cvh, i, vc_j);
1172  } else {
1173  i_fadd(r, vx[i], vx[i], vsrc); CMTF("v%c[%u] += vsrc[%u];", cvh, i, vc_j);
1174  }
1175  } else {
1176  /**
1177  * Most modern aarch64 cores have a fastpath for sequences
1178  * of fmla instructions. This means that even if the coefficient
1179  * is 1, it is still faster to use fmla by 1 instead of fadd.
1180  */
1181  i_fmla(r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] += vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
1182  }
1183  first = false;
1184  }
1185  }
1186 }
1187 
1189 {
1190  RasmContext *r = s->rctx;
1191  RasmOp *vt = s->vt;
1192  RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */
1193  RasmOp ptr = s->tmp0;
1194  RasmOp coeff_veclist;
1195 
1196  /* Preload coefficients from impl->priv. */
1197  const int num_vregs = linear_num_vregs(p);
1198  av_assert0(num_vregs <= 4);
1199  switch (num_vregs) {
1200  case 1: coeff_veclist = vv_1(vc[0]); break;
1201  case 2: coeff_veclist = vv_2(vc[0], vc[1]); break;
1202  case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]); break;
1203  case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
1204  }
1205  i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 *vcoeff_ptr = impl->priv.ptr;");
1207  i_ld1(r, coeff_veclist, a64op_base(ptr)); CMT("coeff_veclist = *vcoeff_ptr;");
1208 
1209  /* Compute mask for rows that must be saved before being overwritten. */
1210  uint16_t save_mask = 0;
1211  bool overwritten[4] = { false, false, false, false };
1212  LOOP_MASK(p, i) {
1213  for (int j = 0; j < 5; j++) {
1214  if (!LINEAR_MASK_GET(p->linear.mask, i, j))
1215  continue;
1216  bool is_offset = linear_index_is_offset(j);
1217  int src_j = linear_index_to_vx(j);
1218  if (!is_offset && overwritten[src_j])
1219  MASK_SET(save_mask, j - 1, 1);
1220  overwritten[i] = true;
1221  }
1222  }
1223 
1224  /* Perform linear passes for low and high vector banks. */
1225  linear_pass(s, p, vt, vc, save_mask, false);
1226  linear_pass(s, p, vt, vc, save_mask, true);
1227 }
1228 
1229 /*********************************************************************/
1230 /* add dithering noise */
1231 /* AARCH64_SWS_OP_DITHER */
1232 
1234 {
1235  RasmContext *r = s->rctx;
1236  RasmOp *vl = s->vl;
1237  RasmOp *vh = s->vh;
1238  RasmOp ptr = s->tmp0;
1239  RasmOp tmp1 = s->tmp1;
1240  RasmOp wtmp1 = a64op_w(tmp1);
1241  RasmOp dither_vl = s->vt[0];
1242  RasmOp dither_vh = s->vt[1];
1243  RasmOp bx64 = a64op_x(s->bx);
1244  RasmOp y64 = a64op_x(s->y);
1245 
1246  /**
1247  * For a description of the matrix buffer layout, read the comments
1248  * in aarch64_setup_dither() in aarch64/ops.c.
1249  */
1250 
1251  /**
1252  * Sort components by y_offset value so that we can start dithering
1253  * with the smallest value, and increment the pointer upwards for
1254  * each new offset. The dither matrix is over-allocated and may be
1255  * over-read at the top, but it cannot be over-read before the start
1256  * of the buffer. Since we only mask the y offset once, this would
1257  * be an issue if we tried to subtract a value larger than the
1258  * initial y_offset.
1259  */
1260  int sorted[4];
1261  int n_comps = 0;
1262  /* Very cheap bucket sort. */
1263  int max_offset = 0;
1264  LOOP_MASK(p, i)
1265  max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i));
1266  for (int y_off = 0; y_off <= max_offset; y_off++) {
1267  LOOP_MASK(p, i) {
1268  if (MASK_GET(p->dither.y_offset, i) == y_off)
1269  sorted[n_comps++] = i;
1270  }
1271  }
1272 
1273  i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("void *ptr = impl->priv.ptr;");
1275 
1276  /**
1277  * We use ubfiz to mask and shift left in one single instruction:
1278  * ubfiz <Wd>, <Wn>, #<lsb>, #<width>
1279  * Wd = (Wn & ((1 << width) - 1)) << lsb;
1280  *
1281  * Given:
1282  * block_size = 8, log2(block_size) = 3
1283  * dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111
1284  * sizeof(float) = 4, log2(sizeof(float)) = 2
1285  *
1286  * Suppose we have bx = 0bvvvv. To get x, we left shift by
1287  * log2(block_size) and end up with 0bvvvv000. Then we mask against
1288  * dither_mask, and end up with 0bv000. Finally we multiply by
1289  * sizeof(float), which is the same as shifting left by
1290  * log2(sizeof(float)). The result is 0bv00000.
1291  *
1292  * Therefore:
1293  * width = log2(dither_size) - log2(block_size)
1294  * lsb = log2(block_size) + log2(sizeof(float))
1295  */
1296  const int block_size_log2 = (p->block_size == 16) ? 4 : 3;
1297  const int dither_size_log2 = p->dither.size_log2;
1298  const int sizeof_float_log2 = 2;
1299  if (dither_size_log2 != block_size_log2) {
1300  RasmOp lsb = IMM(block_size_log2 + sizeof_float_log2);
1301  RasmOp width = IMM(dither_size_log2 - block_size_log2);
1302  i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);");
1303  i_add (r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
1304  }
1305 
1306  int last_y_off = -1;
1307  int prev_i = 0;
1308  for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) {
1309  int i = sorted[sorted_i];
1310  uint8_t y_off = MASK_GET(p->dither.y_offset, i);
1311  bool do_load = (y_off != last_y_off);
1312 
1313  if (last_y_off < 0) {
1314  /* On the first run, calculate pointer inside dither_matrix. */
1315  RasmOp lsb = IMM(dither_size_log2 + sizeof_float_log2);
1316  RasmOp width = IMM(dither_size_log2);
1317  /**
1318  * The ubfiz instruction for the y offset performs masking
1319  * by the dither matrix size and shifts by the stride.
1320  */
1321  if (y_off == 0) {
1322  i_ubfiz(r, tmp1, y64, lsb, width); CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);");
1323  } else {
1324  i_add (r, wtmp1, s->y, IMM(y_off)); CMTF("tmp1 = y + y_off[%u];", i);
1325  i_ubfiz(r, tmp1, tmp1, lsb, width); CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);");
1326  }
1327  i_add(r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
1328  } else if (do_load) {
1329  /**
1330  * On subsequent runs, just increment the pointer.
1331  * The matrix is over-allocated, so we don't risk
1332  * overreading.
1333  */
1334  int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float);
1335  i_add(r, ptr, ptr, IMM(delta)); CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i);
1336  }
1337 
1338  if (do_load) {
1339  RasmOp dither_vlq = v_q(dither_vl);
1340  RasmOp dither_vhq = v_q(dither_vh);
1341  i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;");
1342  }
1343 
1344  i_fadd (r, vl[i], vl[i], dither_vl); CMTF("vl[%u] += vditherl;", i);
1345  if (s->use_vh) {
1346  i_fadd(r, vh[i], vh[i], dither_vh); CMTF("vh[%u] += vditherh;", i);
1347  }
1348 
1349  last_y_off = y_off;
1350  prev_i = i;
1351  }
1352 }
1353 
1354 /*********************************************************************/
1356 {
1357  RasmContext *r = s->rctx;
1358 
1359  bool is_read = false;
1360  bool is_write = false;
1361  switch (p->op) {
1366  is_read = true;
1367  break;
1372  is_write = true;
1373  break;
1374  default:
1375  break;
1376  }
1377 
1378  char func_name[128];
1379  aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
1380  rasm_func_begin(r, func_name, true, !is_read);
1381 
1382  /**
1383  * Set up vector register dimensions and reshape all vectors
1384  * accordingly.
1385  */
1386  size_t el_size = aarch64_pixel_size(p->type);
1387  size_t total_size = p->block_size * el_size;
1388 
1389  s->vec_size = FFMIN(total_size, 16);
1390  s->use_vh = (s->vec_size != total_size);
1391 
1392  s->el_size = el_size;
1393  s->el_count = s->vec_size / el_size;
1394  reshape_all_vectors(s, s->el_count, el_size);
1395 
1396  /* Common start for continuation-passing style (CPS) functions. */
1398 
1399  switch (p->op) {
1410  case AARCH64_SWS_OP_UNPACK: asmgen_op_unpack(s, p); break;
1411  case AARCH64_SWS_OP_PACK: asmgen_op_pack(s, p); break;
1412  case AARCH64_SWS_OP_LSHIFT: asmgen_op_lshift(s, p); break;
1413  case AARCH64_SWS_OP_RSHIFT: asmgen_op_rshift(s, p); break;
1414  case AARCH64_SWS_OP_CLEAR: asmgen_op_clear(s, p); break;
1416  case AARCH64_SWS_OP_EXPAND: asmgen_op_expand(s, p); break;
1417  case AARCH64_SWS_OP_MIN: asmgen_op_min(s, p); break;
1418  case AARCH64_SWS_OP_MAX: asmgen_op_max(s, p); break;
1419  case AARCH64_SWS_OP_SCALE: asmgen_op_scale(s, p); break;
1420  case AARCH64_SWS_OP_LINEAR: asmgen_op_linear(s, p); break;
1421  case AARCH64_SWS_OP_DITHER: asmgen_op_dither(s, p); break;
1422  /* TODO implement AARCH64_SWS_OP_SHUFFLE */
1423  default:
1424  break;
1425  }
1426 
1427  if (is_write) {
1428  /* Write functions return directly. */
1429  i_ret(r);
1430  } else {
1431  /* Load continuation address and increment impl pointer. */
1432  RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
1433  RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
1434  i_ldr(r, s->cont, impl_post); CMT("SwsFuncPtr cont = (impl++)->cont;");
1435  rasm_set_current_node(r, node);
1436  /* Common end for remaining CPS functions. */
1437  i_br (r, s->cont); CMT("jump to cont");
1438  }
1439 }
1440 
1441 /*********************************************************************/
1442 static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params,
1443  const SwsAArch64OpImplParams *prev, const char *p_str)
1444 {
1445  int first_diff = 0;
1446  int prev_levels = 0;
1447  int levels = 0;
1448 
1449  /* Compute number of current levels. */
1450  if (params) {
1451  const ParamField **fields = op_fields[params->op];
1452  while (fields[levels])
1453  levels++;
1454  }
1455 
1456  /* Compute number of previous levels. */
1457  if (prev) {
1458  const ParamField **prev_fields = op_fields[prev->op];
1459  while (prev_fields[prev_levels])
1460  prev_levels++;
1461  }
1462 
1463  /* Walk up and check the conditions that match. */
1464  if (params && prev) {
1465  const ParamField **fields = op_fields[params->op];
1466  first_diff = -1;
1467  for (int i = 0; fields[i]; i++) {
1468  const ParamField *field = fields[i];
1469  if (first_diff < 0) {
1470  int diff = field->cmp_val((void *) (((uintptr_t) params) + field->offset),
1471  (void *) (((uintptr_t) prev) + field->offset));
1472  if (diff)
1473  first_diff = i;
1474  }
1475  }
1476  }
1477 
1478  /* Walk back closing conditions. */
1479  if (prev) {
1480  for (int i = prev_levels - 1; i > first_diff; i--) {
1481  buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), "");
1482  buf_appendf(&buf, &size, "%*s}\n", 4 * i, "");
1483  }
1484  }
1485 
1486  /* Walk up adding conditions to return current function. */
1487  if (params) {
1488  const ParamField **fields = op_fields[params->op];
1489  for (int i = first_diff; i < levels; i++) {
1490  const ParamField *field = fields[i];
1491  void *p = (void *) (((uintptr_t) params) + field->offset);
1492  buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name);
1493  field->print_val(&buf, &size, p);
1494  buf_appendf(&buf, &size, ")");
1495  if (i == (levels - 1)) {
1496  buf_appendf(&buf, &size, " return ");
1497  impl_func_name(&buf, &size, params);
1498  buf_appendf(&buf, &size, ";\n");
1499  } else {
1500  buf_appendf(&buf, &size, " {\n");
1501  }
1502  }
1503  }
1504 
1505  av_assert0(size && "string buffer exhausted");
1506 }
1507 
1508 static int lookup_gen(void)
1509 {
1510  char buf[1024];
1511 
1512  /**
1513  * The lookup function matches the SwsAArch64OpImplParams from
1514  * ops_entries.c to the exported functions generated by asmgen_op().
1515  * Each call to aarch64_op_impl_lookup_str() generates a code
1516  * fragment to uniquely detect the current function, opening and/or
1517  * closing conditions depending on the parameters of the previous
1518  * function.
1519  */
1520 
1521  /* External function declarations. */
1522  printf("#include \"libswscale/aarch64/ops_lookup.h\"\n");
1523  printf("\n");
1524  for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
1525  aarch64_op_impl_func_name(buf, sizeof(buf), p);
1526  printf("extern void %s(void);\n", buf);
1527  }
1528  printf("\n");
1529 
1530  /* Lookup function. */
1531  printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n");
1532  printf("{\n");
1533  const SwsAArch64OpImplParams *prev = NULL;
1534  for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
1535  aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->");
1536  printf("%s", buf);
1537  prev = p;
1538  }
1539  aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->");
1540  printf("%s", buf);
1541  printf(" return NULL;\n");
1542  printf("}\n");
1543 
1544  return 0;
1545 }
1546 
1547 /*********************************************************************/
1548 
1549 /* Generate all functions described by ops_entries.c */
1550 static int asmgen(void)
1551 {
1552  RasmContext *rctx = rasm_alloc();
1553  if (!rctx)
1554  return AVERROR(ENOMEM);
1555 
1556  SwsAArch64Context s = { .rctx = rctx };
1557  int ret;
1558 
1559  /**
1560  * The entry point of the SwsOpFunc is the `process` function. The
1561  * first kernel function is called from `process`, and subsequent
1562  * kernel functions are chained by directly branching to the next
1563  * operation, using a continuation-passing style design. The last
1564  * operation must be a write operation, which returns from the call
1565  * to the `process` function.
1566  *
1567  * The GPRs used by the entire call-chain are listed below.
1568  *
1569  * Function arguments are passed in r0-r5. After the parameters
1570  * from `exec` have been read, r0 is reused to branch to the
1571  * continuation functions. After the original parameters from
1572  * `impl` have been computed, r1 is reused as the `impl` pointer
1573  * for each operation.
1574  *
1575  * Loop iterators are r6 for `bx` and r3 for `y`, reused from
1576  * `y_start`, which doesn't need to be preserved.
1577  *
1578  * The intra-procedure-call temporary registers (r16 and r17) are
1579  * used as scratch registers. They may be used by call veneers and
1580  * PLT code inserted by the linker, so we cannot expect them to
1581  * persist across branches between functions.
1582  *
1583  * The Platform Register (r18) is not used.
1584  *
1585  * The read/write data pointers and padding values first use up the
1586  * remaining free caller-saved registers, and only then are the
1587  * caller-saved registers (r19-r28) used.
1588  *
1589  * The Link Register (r30) is used when calling the first kernel,
1590  * so it must be saved.
1591  */
1592 
1593  /* SwsOpFunc arguments. */
1594  s.exec = a64op_gpx(0); // const SwsOpExec *exec
1595  s.impl = a64op_gpx(1); // const void *priv
1596  s.bx_start = a64op_gpw(2); // int bx_start
1597  s.y_start = a64op_gpw(3); // int y_start
1598  s.bx_end = a64op_gpw(4); // int bx_end
1599  s.y_end = a64op_gpw(5); // int y_end
1600 
1601  /* Loop iterator variables. */
1602  s.bx = a64op_gpw(6);
1603  s.y = s.y_start; /* Reused from SwsOpFunc argument. */
1604 
1605  /* Scratch registers. */
1606  s.tmp0 = a64op_gpx(16); /* IP0 */
1607  s.tmp1 = a64op_gpx(17); /* IP1 */
1608 
1609  /* CPS-related variables. */
1610  s.op0_func = a64op_gpx(7);
1611  s.op1_impl = a64op_gpx(8);
1612  s.cont = s.exec; /* Reused from SwsOpFunc argument. */
1613 
1614  /* Read/Write data pointers and padding. */
1615  s.in [0] = a64op_gpx(9);
1616  s.out [0] = a64op_gpx(10);
1617  s.in_bump [0] = a64op_gpx(11);
1618  s.out_bump[0] = a64op_gpx(12);
1619  s.in [1] = a64op_gpx(13);
1620  s.out [1] = a64op_gpx(14);
1621  s.in_bump [1] = a64op_gpx(15);
1622  s.out_bump[1] = a64op_gpx(19);
1623  s.in [2] = a64op_gpx(20);
1624  s.out [2] = a64op_gpx(21);
1625  s.in_bump [2] = a64op_gpx(22);
1626  s.out_bump[2] = a64op_gpx(23);
1627  s.in [3] = a64op_gpx(24);
1628  s.out [3] = a64op_gpx(25);
1629  s.in_bump [3] = a64op_gpx(26);
1630  s.out_bump[3] = a64op_gpx(27);
1631 
1632  /* Generate all process functions using rasm. */
1633  asmgen_process(&s, 0x0001);
1634  asmgen_process(&s, 0x0011);
1635  asmgen_process(&s, 0x0111);
1636  asmgen_process(&s, 0x1111);
1637 
1638  /* Generate all functions from ops_entries.c using rasm. */
1639  const SwsAArch64OpImplParams *params = impl_params;
1640  while (params->op) {
1641  asmgen_op_cps(&s, params++);
1642  if (rctx->error) {
1643  ret = rctx->error;
1644  goto error;
1645  }
1646  }
1647 
1648  /* Print all rasm functions to stdout. */
1649  printf("#include \"libavutil/aarch64/asm.S\"\n");
1650  printf("\n");
1651  ret = rasm_print(s.rctx, stdout);
1652 
1653 error:
1654  rasm_free(&s.rctx);
1655  return ret;
1656 }
1657 
1658 /*********************************************************************/
/**
 * Entry point of the generator tool.
 *
 * Exactly one of the following options must be given:
 *   -ops     print the generated assembly functions to stdout
 *   -lookup  print the C source of the lookup function to stdout
 *
 * @return the result of the selected generator, or -1 on invalid usage
 *         or if the output could not be written to stdout
 */
int main(int argc, char *argv[])
{
    bool lookup = false;
    bool ops    = false;

#ifdef _WIN32
    _setmode(_fileno(stdout), _O_BINARY);
#endif

    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-ops"))
            ops = true;
        else if (!strcmp(argv[i], "-lookup"))
            lookup = true;
    }
    /* Both set or both unset: the mode is ambiguous. */
    if (lookup == ops) {
        fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n");
        return -1;
    }

    int ret = lookup ? lookup_gen() : asmgen();

    /**
     * All output goes through stdout. Flush it explicitly and check the
     * stream error indicator, so that a failed write (e.g. a full disk
     * when stdout is redirected to a file) is reported instead of
     * silently leaving truncated output behind.
     */
    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "Error writing output to stdout.\n");
        return -1;
    }

    return ret;
}
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
asmgen_op_write_planar
static void asmgen_op_write_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:616
AARCH64_SWS_OP_MIN
@ AARCH64_SWS_OP_MIN
Definition: ops_impl.h:57
AArch64VecViews::h8
RasmOp h8
Definition: rasm.h:466
linear_index_to_vx
static int linear_index_to_vx(int idx)
Definition: ops_impl.h:155
rasm_print.c
FF_DYNARRAY_ADD
#define FF_DYNARRAY_ADD(av_size_max, av_elt_size, av_array, av_size, av_success, av_failure)
Add an element to a dynamic array.
Definition: dynarray.h:45
LINEAR_MASK_GET
#define LINEAR_MASK_GET(mask, idx, jdx)
Definition: ops_impl.h:122
SwsAArch64Context::vh
RasmOp vh[4]
Definition: ops_asmgen.c:159
rasm_alloc
RasmContext * rasm_alloc(void)
Definition: rasm.c:32
r
const char * r
Definition: vf_curves.c:127
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
printf
__device__ int printf(const char *,...)
main
int main(int argc, char *argv[])
Definition: ops_asmgen.c:1659
LOOP_MASK_BWD_VH
#define LOOP_MASK_BWD_VH(s, p, idx)
Definition: ops_asmgen.c:181
i_ld1
#define i_ld1(rctx, op0, op1)
Definition: rasm.h:557
LOOP_MASK_BWD
#define LOOP_MASK_BWD(p, idx)
Definition: ops_impl.h:120
ParamField
The following structure is used to describe one field from SwsAArch64OpImplParams.
Definition: ops_impl.c:186
reshape_all_vectors
static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
Definition: ops_asmgen.c:188
a64op_base
static RasmOp a64op_base(RasmOp op)
Definition: rasm.h:498
AArch64VecViews::b16
RasmOp b16
Definition: rasm.h:464
i_zip1
#define i_zip1(rctx, op0, op1, op2)
Definition: rasm.h:596
AArch64VecViews::d
RasmOp d
Definition: rasm.h:460
i_ld4
#define i_ld4(rctx, op0, op1)
Definition: rasm.h:561
i_mul
#define i_mul(rctx, op0, op1, op2)
Definition: rasm.h:569
a64op_gpx
static RasmOp a64op_gpx(uint8_t n)
Definition: rasm.h:354
av_dynarray2_add
static void * av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size, const uint8_t *elem_data)
Definition: ops_asmgen.c:65
asmgen_op_read_packed
static void asmgen_op_read_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:480
a64op_w
static RasmOp a64op_w(RasmOp op)
Definition: rasm.h:359
RasmContext::error
int error
Definition: rasm.h:191
clobber_gpr
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count, RasmOp gpr)
Definition: ops_asmgen.c:266
mask
int mask
Definition: mediacodecdec_common.c:154
SwsAArch64Context::out_bump
RasmOp out_bump[4]
Definition: ops_asmgen.c:166
rasm_free
void rasm_free(RasmContext **prctx)
Definition: rasm.c:37
i_blr
#define i_blr(rctx, op0)
Definition: rasm.h:545
rasm_set_current_node
RasmNode * rasm_set_current_node(RasmContext *rctx, RasmNode *node)
Definition: rasm.c:199
i_st4
#define i_st4(rctx, op0, op1)
Definition: rasm.h:578
u
#define u(width, name, range_min, range_max)
Definition: cbs_apv.c:68
asmgen_op_max
static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1048
AArch64VecViews
This helper structure is used to mimic the assembler syntax for vector register modifiers.
Definition: rasm.h:455
SwsAArch64Context::rctx
RasmContext * rctx
Definition: ops_asmgen.c:133
rasm_get_current_node
RasmNode * rasm_get_current_node(RasmContext *rctx)
Definition: rasm.c:194
asmgen_op_write_bit
static void asmgen_op_write_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:519
a64op_gpw
static RasmOp a64op_gpw(uint8_t n)
Definition: rasm.h:353
i_ld3
#define i_ld3(rctx, op0, op1)
Definition: rasm.h:560
vv_2
static RasmOp vv_2(RasmOp op0, RasmOp op1)
Definition: rasm.h:446
vv_3
static RasmOp vv_3(RasmOp op0, RasmOp op1, RasmOp op2)
Definition: rasm.h:447
CMTF
#define CMTF(fmt,...)
Definition: ops_asmgen.c:185
PRINT_SWIZZLE_V
#define PRINT_SWIZZLE_V(n, vh)
Definition: ops_asmgen.c:678
AARCH64_SWS_OP_SWIZZLE
@ AARCH64_SWS_OP_SWIZZLE
Definition: ops_impl.h:49
i_dup
#define i_dup(rctx, op0, op1)
Definition: rasm.h:549
asmgen_op_dither
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1233
SwsAArch64Context::y
RasmOp y
Definition: ops_asmgen.c:145
i_bne
#define i_bne(rctx, id)
Definition: rasm.h:601
rasm_print
void int rasm_print(RasmContext *rctx, FILE *fp)
Definition: rasm_print.c:432
SwsAArch64Context::out
RasmOp out[4]
Definition: ops_asmgen.c:164
i_ld2
#define i_ld2(rctx, op0, op1)
Definition: rasm.h:559
i_fmla
#define i_fmla(rctx, op0, op1, op2)
Definition: rasm.h:554
asmgen_op_read_bit
static void asmgen_op_read_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:389
RasmNode
Definition: rasm.h:144
i_rev16
#define i_rev16(rctx, op0, op1)
Definition: rasm.h:572
ops_impl.c
asmgen_op_write_nibble
static void asmgen_op_write_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:550
IMM
#define IMM(val)
Definition: rasm.h:91
AARCH64_SWS_OP_CLEAR
@ AARCH64_SWS_OP_CLEAR
Definition: ops_impl.h:54
SwsAArch64Context::el_size
size_t el_size
Definition: ops_asmgen.c:169
clobbered_gprs
static unsigned clobbered_gprs(const SwsAArch64Context *s, SwsAArch64OpMask mask, RasmOp regs[MAX_SAVED_REGS])
Definition: ops_asmgen.c:274
i_fmin
#define i_fmin(rctx, op0, op1, op2)
Definition: rasm.h:553
SwsAArch64Context::bx_start
RasmOp bx_start
Definition: ops_asmgen.c:138
AARCH64_SWS_OP_NONE
@ AARCH64_SWS_OP_NONE
Definition: ops_impl.h:39
LOOP_MASK_VH
#define LOOP_MASK_VH(s, p, idx)
Definition: ops_asmgen.c:180
a64op_vec_views
void a64op_vec_views(RasmOp op, AArch64VecViews *out)
Definition: rasm.c:330
SwsAArch64Context::bx_end
RasmOp bx_end
Definition: ops_asmgen.c:140
AARCH64_SWS_OP_READ_NIBBLE
@ AARCH64_SWS_OP_READ_NIBBLE
Definition: ops_impl.h:41
SwsAArch64Context
Definition: ops_asmgen.c:132
AArch64VecViews::s
RasmOp s
Definition: rasm.h:459
AARCH64_SWS_OP_PACK
@ AARCH64_SWS_OP_PACK
Definition: ops_impl.h:51
AARCH64_SWS_OP_SWAP_BYTES
@ AARCH64_SWS_OP_SWAP_BYTES
Definition: ops_impl.h:48
AARCH64_SWS_OP_READ_BIT
@ AARCH64_SWS_OP_READ_BIT
Definition: ops_impl.h:40
i_st2
#define i_st2(rctx, op0, op1)
Definition: rasm.h:576
SwsAArch64Context::impl
RasmOp impl
Definition: ops_asmgen.c:137
i_st3
#define i_st3(rctx, op0, op1)
Definition: rasm.h:577
i_ushr
#define i_ushr(rctx, op0, op1, op2)
Definition: rasm.h:592
RasmOp
Runtime assembler for AArch64.
Definition: rasm.h:43
AARCH64_SWS_OP_MAX
@ AARCH64_SWS_OP_MAX
Definition: ops_impl.h:58
asmgen_op_read_nibble
static void asmgen_op_read_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:426
i_addv
#define i_addv(rctx, op0, op1)
Definition: rasm.h:540
asmgen_op_clear
static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:889
swizzle_a64op
static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:680
val
static double val(void *priv, double ch)
Definition: aeval.c:77
rasm_add_label
RasmNode * rasm_add_label(RasmContext *rctx, int id)
Definition: rasm.c:146
i_fadd
#define i_fadd(rctx, op0, op1, op2)
Definition: rasm.h:550
asmgen_op_pack
static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:823
LINEAR_MASK_1
#define LINEAR_MASK_1
Definition: ops_impl.h:127
i_ld1r
#define i_ld1r(rctx, op0, op1)
Definition: rasm.h:558
SwsAArch64Context::y_start
RasmOp y_start
Definition: ops_asmgen.c:139
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
SwsAArch64Context::exec
RasmOp exec
Definition: ops_asmgen.c:136
vv_1
static RasmOp vv_1(RasmOp op0)
Definition: rasm.h:445
a64op_elem
static RasmOp a64op_elem(RasmOp op, uint8_t idx)
Definition: rasm.h:421
FFMIN
#define FFMIN(a, b)
Definition: ops_asmgen.c:50
float
float
Definition: af_crystalizer.c:122
AArch64VecViews::be
RasmOp be[2]
Definition: rasm.h:471
AARCH64_SWS_OP_WRITE_NIBBLE
@ AARCH64_SWS_OP_WRITE_NIBBLE
Definition: ops_impl.h:45
AArch64VecViews::b
RasmOp b
Definition: rasm.h:457
SwsAArch64OpMask
uint16_t SwsAArch64OpMask
Definition: ops_impl.h:66
AARCH64_SWS_OP_DITHER
@ AARCH64_SWS_OP_DITHER
Definition: ops_impl.h:61
AARCH64_SWS_OP_RSHIFT
@ AARCH64_SWS_OP_RSHIFT
Definition: ops_impl.h:53
s
#define s(width, name)
Definition: cbs_vp9.c:198
offsets
static const int offsets[]
Definition: hevc_pel.c:34
SwsAArch64Context::op1_impl
RasmOp op1_impl
Definition: ops_asmgen.c:153
impl_func_name
static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:113
AARCH64_SWS_OP_LINEAR
@ AARCH64_SWS_OP_LINEAR
Definition: ops_impl.h:60
LOOP
#define LOOP(mask, idx)
Definition: ops_impl.h:112
frame_size
int frame_size
Definition: mxfenc.c:2489
asmgen_op_cps
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1355
FFMAX
#define FFMAX(a, b)
Definition: ops_asmgen.c:49
SwsAArch64Context::vl
RasmOp vl[4]
Definition: ops_asmgen.c:158
offsetof_impl_cont
#define offsetof_impl_cont
Definition: ops_impl.h:173
CMT
#define CMT(comment)
Definition: ops_asmgen.c:184
linear_pass
static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vt, RasmOp *vc, int save_mask, bool vh_pass)
Performs one pass of the linear transform over a single vector bank (low or high).
Definition: ops_asmgen.c:1102
ops_entries.c
AARCH64_SWS_OP_CONVERT
@ AARCH64_SWS_OP_CONVERT
Definition: ops_impl.h:55
limits.h
i_ins
#define i_ins(rctx, op0, op1)
Definition: rasm.h:556
asmgen_op_convert
static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:913
field
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this field
Definition: writing_filters.txt:78
v_8b
static RasmOp v_8b(RasmOp op)
Definition: rasm.h:436
i_ldr
#define i_ldr(rctx, op0, op1)
Definition: rasm.h:563
a64op_make_vec
static RasmOp a64op_make_vec(uint8_t n, uint8_t el_count, uint8_t el_size)
Definition: rasm.h:365
op_fields
static const ParamField * op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS]
Definition: ops_impl.c:323
AARCH64_PIXEL_F32
@ AARCH64_PIXEL_F32
Definition: ops_impl.h:33
asmgen_op_scale
static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1073
asmgen
static int asmgen(void)
Definition: ops_asmgen.c:1550
fields
the definition of that something depends on the semantic of the filter The callback must examine the status of the filter s links and proceed accordingly The status of output links is stored in the status_in and status_out fields and tested by the then the processing requires a frame on this link and the filter is expected to make efforts in that direction The status of input links is stored by the fifo and status_out fields
Definition: filter_design.txt:155
aarch64_op_impl_func_name
void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
Definition: ops_asmgen.c:125
aarch64_op_impl_lookup_str
static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params, const SwsAArch64OpImplParams *prev, const char *p_str)
Definition: ops_asmgen.c:1442
i_br
#define i_br(rctx, op0)
Definition: rasm.h:546
i_cmp
#define i_cmp(rctx, op0, op1)
Definition: rasm.h:547
AARCH64_SWS_OP_SCALE
@ AARCH64_SWS_OP_SCALE
Definition: ops_impl.h:59
asmgen_op_read_packed_1
static void asmgen_op_read_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:452
offsetof_exec_out_bump
#define offsetof_exec_out_bump
Definition: ops_impl.h:172
NULL
#define NULL
Definition: coverity.c:32
aarch64_pixel_size
static size_t aarch64_pixel_size(SwsAArch64PixelType fmt)
Definition: ops_asmgen.c:99
impl_params
static const SwsAArch64OpImplParams impl_params[]
Implementation parameters for all exported functions.
Definition: ops_asmgen.c:93
asmgen_op_unpack
static void asmgen_op_unpack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:751
i_fmul
#define i_fmul(rctx, op0, op1, op2)
Definition: rasm.h:555
asmgen_op_write_packed_n
static void asmgen_op_write_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:594
a64op_post
static RasmOp a64op_post(RasmOp op, int16_t imm)
Definition: rasm.h:501
AARCH64_SWS_OP_READ_PACKED
@ AARCH64_SWS_OP_READ_PACKED
Definition: ops_impl.h:42
i_umin
#define i_umin(rctx, op0, op1, op2)
Definition: rasm.h:587
MAX_SAVED_REGS
#define MAX_SAVED_REGS
Definition: ops_asmgen.c:264
LOOP_VH
#define LOOP_VH(s, mask, idx)
Definition: ops_asmgen.c:179
offsetof_exec_out
#define offsetof_exec_out
Definition: ops_impl.h:170
sizeof_impl
#define sizeof_impl
Definition: ops_impl.h:175
i_add
#define i_add(rctx, op0, op1, op2)
Definition: rasm.h:539
asmgen_op_read_planar
static void asmgen_op_read_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:491
rasm.c
asmgen_process
static void asmgen_process(SwsAArch64Context *s, SwsAArch64OpMask mask)
Definition: ops_asmgen.c:289
rasm_new_label
int rasm_new_label(RasmContext *rctx, const char *name)
Allocate a new label ID with the given name.
Definition: rasm.c:282
SwsAArch64Context::tmp0
RasmOp tmp0
Definition: ops_asmgen.c:148
a64op_sp
static RasmOp a64op_sp(void)
Definition: rasm.h:356
SwsAArch64Context::vt
RasmOp vt[12]
Definition: ops_asmgen.c:160
AARCH64_SWS_OP_WRITE_PLANAR
@ AARCH64_SWS_OP_WRITE_PLANAR
Definition: ops_impl.h:47
i_uxtl
#define i_uxtl(rctx, op0, op1)
Definition: rasm.h:593
SwsAArch64Context::use_vh
bool use_vh
Definition: ops_asmgen.c:172
LOOP_MASK
#define LOOP_MASK(p, idx)
Definition: ops_impl.h:119
AARCH64_SWS_OP_LSHIFT
@ AARCH64_SWS_OP_LSHIFT
Definition: ops_impl.h:52
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
i_ldrb
#define i_ldrb(rctx, op0, op1)
Definition: rasm.h:564
i_shl
#define i_shl(rctx, op0, op1, op2)
Definition: rasm.h:574
i_fmax
#define i_fmax(rctx, op0, op1, op2)
Definition: rasm.h:552
size
int size
Definition: twinvq_data.h:10344
AArch64VecViews::h
RasmOp h
Definition: rasm.h:458
SwsAArch64Context::load_cont_node
RasmNode * load_cont_node
Definition: ops_asmgen.c:155
i_zip2
#define i_zip2(rctx, op0, op1, op2)
Definition: rasm.h:597
SwsAArch64OpImplParams::op
SwsAArch64OpType op
Definition: ops_impl.h:93
SwsAArch64Context::vec_size
size_t vec_size
Definition: ops_asmgen.c:171
i_fcvtzu
#define i_fcvtzu(rctx, op0, op1)
Definition: rasm.h:551
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:166
asmgen_op_read_packed_n
static void asmgen_op_read_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
Definition: ops_asmgen.c:469
i_ucvtf
#define i_ucvtf(rctx, op0, op1)
Definition: rasm.h:585
AARCH64_SWS_OP_WRITE_BIT
@ AARCH64_SWS_OP_WRITE_BIT
Definition: ops_impl.h:44
a64op_off
static RasmOp a64op_off(RasmOp op, int16_t imm)
Definition: rasm.h:499
AARCH64_SWS_OP_READ_PLANAR
@ AARCH64_SWS_OP_READ_PLANAR
Definition: ops_impl.h:43
av_freep
static void av_freep(void *ptr)
Definition: ops_asmgen.c:52
AARCH64_SWS_OP_EXPAND
@ AARCH64_SWS_OP_EXPAND
Definition: ops_impl.h:56
i_uxtl2
#define i_uxtl2(rctx, op0, op1)
Definition: rasm.h:594
AArch64VecViews::de
RasmOp de[2]
Definition: rasm.h:472
SwsAArch64Context::tmp1
RasmOp tmp1
Definition: ops_asmgen.c:149
AARCH64_SWS_OP_UNPACK
@ AARCH64_SWS_OP_UNPACK
Definition: ops_impl.h:50
vv_4
static RasmOp vv_4(RasmOp op0, RasmOp op1, RasmOp op2, RasmOp op3)
Definition: rasm.h:448
i_lsr
#define i_lsr(rctx, op0, op1, op2)
Definition: rasm.h:566
rasm_annotate_next
void rasm_annotate_next(RasmContext *rctx, const char *comment)
Definition: rasm.c:263
clobbered_frame_size
static unsigned clobbered_frame_size(unsigned n)
Definition: ops_asmgen.c:215
RasmContext
Definition: rasm.h:184
lookup
int lookup
Definition: vorbis_enc_data.h:428
i_ldp
#define i_ldp(rctx, op0, op1, op2)
Definition: rasm.h:562
delta
float delta
Definition: vorbis_enc_data.h:430
asmgen_op_swap_bytes
static void asmgen_op_swap_bytes(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:641
AArch64VecViews::q
RasmOp q
Definition: rasm.h:461
offsetof_impl_priv
#define offsetof_impl_priv
Definition: ops_impl.h:174
asmgen_set_load_cont_node
static void asmgen_set_load_cont_node(SwsAArch64Context *s)
Set node where the continuation address will be loaded and impl will be incremented.
Definition: ops_asmgen.c:376
asmgen_op_write_packed
static void asmgen_op_write_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:605
AArch64VecViews::h4
RasmOp h4
Definition: rasm.h:465
i_xtn
#define i_xtn(rctx, op0, op1)
Definition: rasm.h:595
SwsAArch64Context::in
RasmOp in[4]
Definition: ops_asmgen.c:163
av_assert0
#define av_assert0(cond)
Definition: ops_asmgen.c:43
a64op_pre
static RasmOp a64op_pre(RasmOp op, int16_t imm)
Definition: rasm.h:500
SWIZZLE_TMP
#define SWIZZLE_TMP
Definition: ops_asmgen.c:668
asmgen_op_lshift
static void asmgen_op_lshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:861
SwsAArch64Context::el_count
size_t el_count
Definition: ops_asmgen.c:170
rasm_annotate_nextf
void rasm_annotate_nextf(RasmContext *rctx, char *s, size_t n, const char *fmt,...)
Definition: rasm.c:273
rasm_op_label
static RasmOp rasm_op_label(int id)
Definition: rasm.h:95
i_umax
#define i_umax(rctx, op0, op1, op2)
Definition: rasm.h:586
ret
ret
Definition: filter_design.txt:187
offsetof_exec_in
#define offsetof_exec_in
These values will be used by ops_asmgen to access fields inside of SwsOpExec and SwsOpImpl.
Definition: ops_impl.h:169
MASK_SET
#define MASK_SET(mask, idx, val)
Definition: ops_impl.h:110
rasm_func_begin
int rasm_func_begin(RasmContext *rctx, const char *name, bool export, bool jumpable)
Definition: rasm.c:209
i_mov16b
#define i_mov16b(rctx, op0, op1)
Definition: rasm.h:618
AARCH64_PIXEL_U8
@ AARCH64_PIXEL_U8
Definition: ops_impl.h:30
SwsAArch64Context::cont
RasmOp cont
Definition: ops_asmgen.c:154
i_b
#define i_b(rctx, op0)
Definition: rasm.h:543
lookup_gen
static int lookup_gen(void)
Definition: ops_asmgen.c:1508
SwsAArch64Context::in_bump
RasmOp in_bump[4]
Definition: ops_asmgen.c:165
AARCH64_PIXEL_U32
@ AARCH64_PIXEL_U32
Definition: ops_impl.h:32
MASK_GET
#define MASK_GET(mask, idx)
Definition: ops_impl.h:109
i_str
#define i_str(rctx, op0, op1)
Definition: rasm.h:580
dynarray.h
swizzle_emit
static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
Definition: ops_asmgen.c:687
i_and
#define i_and(rctx, op0, op1, op2)
Definition: rasm.h:542
i_ldrh
#define i_ldrh(rctx, op0, op1)
Definition: rasm.h:565
asmgen_op_write_packed_1
static void asmgen_op_write_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:577
SwsAArch64Context::op0_func
RasmOp op0_func
Definition: ops_asmgen.c:152
asmgen_prologue
static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:220
asmgen_op_expand
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:991
i_ubfiz
#define i_ubfiz(rctx, op0, op1, op2, op3)
Definition: rasm.h:584
asmgen_epilogue
static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
Definition: ops_asmgen.c:241
Windows::Graphics::DirectX::Direct3D11::p
IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p
Definition: vsrc_gfxcapture_winrt.hpp:53
i_orr
#define i_orr(rctx, op0, op1, op2)
Definition: rasm.h:570
AARCH64_SWS_OP_WRITE_PACKED
@ AARCH64_SWS_OP_WRITE_PACKED
Definition: ops_impl.h:46
SwsAArch64OpImplParams
SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType operation.
Definition: ops_impl.h:92
i_rev32
#define i_rev32(rctx, op0, op1)
Definition: rasm.h:573
i_stp
#define i_stp(rctx, op0, op1, op2)
Definition: rasm.h:579
asmgen_op_min
static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1023
i_movi
#define i_movi(rctx, op0, op1)
Definition: rasm.h:568
i_ret
#define i_ret(rctx)
Definition: rasm.h:571
AARCH64_PIXEL_U16
@ AARCH64_PIXEL_U16
Definition: ops_impl.h:31
a64op_x
static RasmOp a64op_x(RasmOp op)
Definition: rasm.h:360
asmgen_op_linear
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:1188
offsetof_exec_in_bump
#define offsetof_exec_in_bump
Definition: ops_impl.h:171
SwsAArch64Context::bx
RasmOp bx
Definition: ops_asmgen.c:144
asmgen_op_rshift
static void asmgen_op_rshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:875
linear_index_is_offset
static int linear_index_is_offset(int idx)
Definition: ops_impl.h:150
linear_num_vregs
static int linear_num_vregs(const SwsAArch64OpImplParams *params)
Definition: ops_impl.h:136
SwsAArch64PixelType
SwsAArch64PixelType
Definition: ops_impl.h:29
print_swizzle_v
static const char * print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
Definition: ops_asmgen.c:670
width
#define width
Definition: dsp.h:89
asmgen_op_swizzle
static void asmgen_op_swizzle(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
Definition: ops_asmgen.c:699
SwsAArch64Context::y_end
RasmOp y_end
Definition: ops_asmgen.c:141
snprintf
#define snprintf
Definition: snprintf.h:34
i_ushl
#define i_ushl(rctx, op0, op1, op2)
Definition: rasm.h:589
rasm_add_comment
RasmNode * rasm_add_comment(RasmContext *rctx, const char *comment)
Definition: rasm.c:117
src
#define src
Definition: vp8dsp.c:248
a64op_lr
static RasmOp a64op_lr(void)
Definition: rasm.h:355
a64op_gpr_n
static uint8_t a64op_gpr_n(RasmOp op)
Definition: rasm.h:350
i_mov
#define i_mov(rctx, op0, op1)
Definition: rasm.h:567
v_q
static RasmOp v_q(RasmOp op)
Definition: rasm.h:433
AArch64VecViews::b8
RasmOp b8
Definition: rasm.h:463