FFmpeg
swscale_lasx.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "swscale_loongarch.h"
24 #include "libavutil/intreadwrite.h"
25 
/*
 * SCALE_8_16(_sh): 8-tap horizontal filter over 8-bit input, 16 outputs per
 * expansion.
 *
 * Expands inside a caller that has these names in scope: src, filter,
 * filterPos, dst, src0..src15, filter0..filter7, vmax and shuf.
 *
 * Steps, as visible in the expansion:
 *  - load the source window at src + filterPos[k] for k = 0..15 and eight
 *    coefficient vectors at byte offsets 0..224 from filter;
 *  - pair up neighbouring windows, widen the bytes to 16 bits and form dot
 *    products with the coefficients;
 *  - reduce the partial sums with the xvhaddw/xvpickev sequence, shift right
 *    by _sh, clamp from above with vmax, reorder and pack to 16 bits;
 *  - store one 256-bit vector at dst, then advance filterPos by 16, filter
 *    by 128 coefficients and dst by 16 elements.
 *
 * _sh feeds __lasx_xvsrai_w; presumably it must be a compile-time constant
 * (the callers in this file pass literals).
 */
26 #define SCALE_8_16(_sh) \
27 { \
28  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
29  src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
30  src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
31  src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
32  src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
33  src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
34  src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
35  src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
36  src8 = __lasx_xvldrepl_d(src + filterPos[8], 0); \
37  src9 = __lasx_xvldrepl_d(src + filterPos[9], 0); \
38  src10 = __lasx_xvldrepl_d(src + filterPos[10], 0); \
39  src11 = __lasx_xvldrepl_d(src + filterPos[11], 0); \
40  src12 = __lasx_xvldrepl_d(src + filterPos[12], 0); \
41  src13 = __lasx_xvldrepl_d(src + filterPos[13], 0); \
42  src14 = __lasx_xvldrepl_d(src + filterPos[14], 0); \
43  src15 = __lasx_xvldrepl_d(src + filterPos[15], 0); \
44  DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
45  filter, 96, filter0, filter1, filter2, filter3); \
46  DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160, \
47  filter, 192, filter, 224, filter4, \
48  filter5, filter6, filter7); \
49  DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
50  src5, src4, src7, src6, src0, src2, src4, src6); \
51  DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10, \
52  src13, src12, src15, src14, src8, src10, src12, src14); \
53  DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
54  src0, src2, src4, src6); \
55  DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12, \
56  src14, src8, src10, src12, src14); \
57  DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
58  filter2, src4, filter3, src6, src0, src1, src2, src3); \
59  DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10, \
60  filter6, src12, filter7, src14, src4, src5, src6, src7);\
61  src0 = __lasx_xvhaddw_d_w(src0, src0); \
62  src1 = __lasx_xvhaddw_d_w(src1, src1); \
63  src2 = __lasx_xvhaddw_d_w(src2, src2); \
64  src3 = __lasx_xvhaddw_d_w(src3, src3); \
65  src4 = __lasx_xvhaddw_d_w(src4, src4); \
66  src5 = __lasx_xvhaddw_d_w(src5, src5); \
67  src6 = __lasx_xvhaddw_d_w(src6, src6); \
68  src7 = __lasx_xvhaddw_d_w(src7, src7); \
69  DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, \
70  src5, src4, src7, src6, src0, src1, src2, src3); \
71  src0 = __lasx_xvhaddw_d_w(src0, src0); \
72  src1 = __lasx_xvhaddw_d_w(src1, src1); \
73  src2 = __lasx_xvhaddw_d_w(src2, src2); \
74  src3 = __lasx_xvhaddw_d_w(src3, src3); \
75  src0 = __lasx_xvpickev_w(src1, src0); \
76  src1 = __lasx_xvpickev_w(src3, src2); \
77  src0 = __lasx_xvsrai_w(src0, _sh); \
78  src1 = __lasx_xvsrai_w(src1, _sh); \
79  src0 = __lasx_xvmin_w(src0, vmax); \
80  src1 = __lasx_xvmin_w(src1, vmax); \
81  src0 = __lasx_xvperm_w(src0, shuf); \
82  src1 = __lasx_xvperm_w(src1, shuf); \
83  src0 = __lasx_xvpickev_h(src1, src0); \
84  src0 = __lasx_xvpermi_d(src0, 0xd8); \
85  __lasx_xvst(src0, dst, 0); \
86  filterPos += 16; \
87  filter += 128; \
88  dst += 16; \
89 }
90 
/*
 * SCALE_8_8(_sh): 8-tap horizontal filter over 8-bit input, 8 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, src0..src7, filter0..filter3, vmax and shuf
 * in the caller's scope. The shifted, clamped and permuted 32-bit results
 * are left in src0; the macro does not store anything and does not touch
 * dst, so the caller has to pack/store the result and advance dst itself.
 * filterPos is advanced by 8 and filter by 64 coefficients.
 */
91 #define SCALE_8_8(_sh) \
92 { \
93  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
94  src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
95  src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
96  src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
97  src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
98  src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
99  src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
100  src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
101  DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
102  filter, 96, filter0, filter1, filter2, filter3); \
103  filterPos += 8; \
104  filter += 64; \
105  DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
106  src5, src4, src7, src6, src0, src2, src4, src6); \
107  DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
108  src0, src2, src4, src6); \
109  DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
110  filter2, src4, filter3, src6, src0, src1, src2,src3); \
111  src0 = __lasx_xvhaddw_d_w(src0, src0); \
112  src1 = __lasx_xvhaddw_d_w(src1, src1); \
113  src2 = __lasx_xvhaddw_d_w(src2, src2); \
114  src3 = __lasx_xvhaddw_d_w(src3, src3); \
115  src0 = __lasx_xvpickev_w(src1, src0); \
116  src1 = __lasx_xvpickev_w(src3, src2); \
117  src0 = __lasx_xvhaddw_d_w(src0, src0); \
118  src1 = __lasx_xvhaddw_d_w(src1, src1); \
119  src0 = __lasx_xvpickev_w(src1, src0); \
120  src0 = __lasx_xvsrai_w(src0, _sh); \
121  src0 = __lasx_xvmin_w(src0, vmax); \
122  src0 = __lasx_xvperm_w(src0, shuf); \
123 }
124 
/*
 * SCALE_8_4(_sh): 8-tap horizontal filter over 8-bit input, 4 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, src0..src3, filter0, filter1, vmax and shuf
 * in the caller's scope. The result (shifted right by _sh, clamped with
 * vmax, permuted with shuf) is left in src0; nothing is stored and dst is
 * not advanced. filterPos is advanced by 4 and filter by 32 coefficients.
 *
 * Note that the dot products here pass (samples, coefficients) while the
 * sibling macros pass (coefficients, samples).
 */
125 #define SCALE_8_4(_sh) \
126 { \
127  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
128  src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
129  src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
130  src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
131  filter0 = __lasx_xvld(filter, 0); \
132  filter1 = __lasx_xvld(filter, 32); \
133  filterPos += 4; \
134  filter += 32; \
135  src0 = __lasx_xvilvl_d(src1, src0); \
136  src2 = __lasx_xvilvl_d(src3, src2); \
137  src0 = __lasx_vext2xv_hu_bu(src0); \
138  src2 = __lasx_vext2xv_hu_bu(src2); \
139  src0 = __lasx_xvdp2_w_h(src0, filter0); \
140  src1 = __lasx_xvdp2_w_h(src2, filter1); \
141  src0 = __lasx_xvhaddw_d_w(src0, src0); \
142  src1 = __lasx_xvhaddw_d_w(src1, src1); \
143  src0 = __lasx_xvpickev_w(src1, src0); \
144  src0 = __lasx_xvhaddw_d_w(src0, src0); \
145  src0 = __lasx_xvpickev_w(src0, src0); \
146  src0 = __lasx_xvsrai_w(src0, _sh); \
147  src0 = __lasx_xvmin_w(src0, vmax); \
148  src0 = __lasx_xvperm_w(src0, shuf); \
149 }
150 
/*
 * SCALE_8_2(_sh): 8-tap horizontal filter over 8-bit input, 2 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, dst, src0, src1, filter0 and vmax in the
 * caller's scope. After the dot product and the two horizontal-add steps the
 * sums are shifted right by _sh and clamped with vmax; 32-bit elements 0 and
 * 4 of the vector are then written to dst[0] and dst[1]. filterPos and dst
 * are advanced by 2 and filter by 16 coefficients.
 */
151 #define SCALE_8_2(_sh) \
152 { \
153  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
154  src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
155  filter0 = __lasx_xvld(filter, 0); \
156  src0 = __lasx_xvilvl_d(src1, src0); \
157  src0 = __lasx_vext2xv_hu_bu(src0); \
158  src0 = __lasx_xvdp2_w_h(filter0, src0); \
159  src0 = __lasx_xvhaddw_d_w(src0, src0); \
160  src0 = __lasx_xvhaddw_q_d(src0, src0); \
161  src0 = __lasx_xvsrai_w(src0, _sh); \
162  src0 = __lasx_xvmin_w(src0, vmax); \
163  dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
164  dst[1] = __lasx_xvpickve2gr_w(src0, 4); \
165  filterPos += 2; \
166  filter += 16; \
167  dst += 2; \
168 }
169 
/*
 * SCALE_4_16(_sh): 4-tap horizontal filter over 8-bit input, 16 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, dst, src0..src15, filter0..filter3, vmax and
 * shuf in the caller's scope. The source windows are loaded as 32-bit
 * elements from src + filterPos[k], k = 0..15, merged four to a vector,
 * widened to 16 bits and multiplied with four coefficient vectors. The sums
 * are shifted right by _sh, clamped with vmax, packed to 16 bits, permuted
 * with shuf and stored as one 256-bit vector at dst. filterPos and dst are
 * advanced by 16 and filter by 64 coefficients.
 */
170 #define SCALE_4_16(_sh) \
171 { \
172  src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
173  src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
174  src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
175  src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
176  src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
177  src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
178  src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
179  src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
180  src8 = __lasx_xvldrepl_w(src + filterPos[8], 0); \
181  src9 = __lasx_xvldrepl_w(src + filterPos[9], 0); \
182  src10 = __lasx_xvldrepl_w(src + filterPos[10], 0); \
183  src11 = __lasx_xvldrepl_w(src + filterPos[11], 0); \
184  src12 = __lasx_xvldrepl_w(src + filterPos[12], 0); \
185  src13 = __lasx_xvldrepl_w(src + filterPos[13], 0); \
186  src14 = __lasx_xvldrepl_w(src + filterPos[14], 0); \
187  src15 = __lasx_xvldrepl_w(src + filterPos[15], 0); \
188  DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
189  filter, 96, filter0, filter1, filter2, filter3); \
190  DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
191  src4, src7, src6, src0, src2, src4, src6); \
192  DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13, \
193  src12, src15, src14, src8, src10, src12, src14); \
194  DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10, \
195  src8, src14, src12, src0, src1, src2, src3); \
196  DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3, \
197  src0, src1, src2, src3); \
198  DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, \
199  filter2, src2, filter3, src3, src0, src1, src2, src3); \
200  src0 = __lasx_xvhaddw_d_w(src0, src0); \
201  src1 = __lasx_xvhaddw_d_w(src1, src1); \
202  src2 = __lasx_xvhaddw_d_w(src2, src2); \
203  src3 = __lasx_xvhaddw_d_w(src3, src3); \
204  src0 = __lasx_xvpickev_w(src1, src0); \
205  src1 = __lasx_xvpickev_w(src3, src2); \
206  src0 = __lasx_xvsrai_w(src0, _sh); \
207  src1 = __lasx_xvsrai_w(src1, _sh); \
208  src0 = __lasx_xvmin_w(src0, vmax); \
209  src1 = __lasx_xvmin_w(src1, vmax); \
210  src0 = __lasx_xvpickev_h(src1, src0); \
211  src0 = __lasx_xvperm_w(src0, shuf); \
212  __lasx_xvst(src0, dst, 0); \
213  filterPos += 16; \
214  filter += 64; \
215  dst += 16; \
216 }
217 
/*
 * SCALE_4_8(_sh): 4-tap horizontal filter over 8-bit input, 8 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, src0..src7, filter0, filter1 and vmax in the
 * caller's scope. The sums, shifted right by _sh and clamped with vmax, are
 * left in src0 as 32-bit elements. Unlike SCALE_8_8 no permutation is
 * applied here; the callers in this file reorder the elements themselves
 * before storing. Nothing is stored and dst is not advanced. filterPos is
 * advanced by 8 and filter by 32 coefficients.
 */
218 #define SCALE_4_8(_sh) \
219 { \
220  src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
221  src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
222  src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
223  src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
224  src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
225  src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
226  src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
227  src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
228  filter0 = __lasx_xvld(filter, 0); \
229  filter1 = __lasx_xvld(filter, 32); \
230  filterPos += 8; \
231  filter += 32; \
232  DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
233  src4, src7, src6, src0, src2, src4, src6); \
234  src0 = __lasx_xvilvl_d(src2, src0); \
235  src1 = __lasx_xvilvl_d(src6, src4); \
236  \
237  src0 = __lasx_vext2xv_hu_bu(src0); \
238  src1 = __lasx_vext2xv_hu_bu(src1); \
239  src0 = __lasx_xvdp2_w_h(filter0, src0); \
240  src1 = __lasx_xvdp2_w_h(filter1, src1); \
241  src0 = __lasx_xvhaddw_d_w(src0, src0); \
242  src1 = __lasx_xvhaddw_d_w(src1, src1); \
243  src0 = __lasx_xvpickev_w(src1, src0); \
244  src0 = __lasx_xvsrai_w(src0, _sh); \
245  src0 = __lasx_xvmin_w(src0, vmax); \
246 }
247 
/*
 * SCALE_4_4(_sh): 4-tap horizontal filter over 8-bit input, 4 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, src0..src3, filter0 and vmax in the caller's
 * scope. The sums are shifted right by _sh, clamped with vmax and then
 * compacted with xvpickev_w / xvpermi_d(0xd8); the result is left in src0.
 * Nothing is stored and dst is not advanced. filterPos is advanced by 4 and
 * filter by 16 coefficients.
 */
248 #define SCALE_4_4(_sh) \
249 { \
250  src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
251  src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
252  src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
253  src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
254  filter0 = __lasx_xvld(filter, 0); \
255  filterPos += 4; \
256  filter += 16; \
257  src0 = __lasx_xvilvl_w(src1, src0); \
258  src1 = __lasx_xvilvl_w(src3, src2); \
259  \
260  src0 = __lasx_xvilvl_d(src1, src0); \
261  src0 = __lasx_vext2xv_hu_bu(src0); \
262  src0 = __lasx_xvdp2_w_h(filter0, src0); \
263  src0 = __lasx_xvhaddw_d_w(src0, src0); \
264  src0 = __lasx_xvsrai_w(src0, _sh); \
265  src0 = __lasx_xvmin_w(src0, vmax); \
266  src0 = __lasx_xvpickev_w(src0, src0); \
267  src0 = __lasx_xvpermi_d(src0, 0xd8); \
268 }
269 
/*
 * SCALE_4_2(_sh): 4-tap horizontal filter over 8-bit input, 2 outputs per
 * expansion.
 *
 * Needs src, filter, filterPos, dst, src0, src1, filter0 and vmax in the
 * caller's scope. The sums are shifted right by _sh and clamped with vmax;
 * 32-bit elements 0 and 2 of the vector are written to dst[0] and dst[1].
 * filterPos and dst are advanced by 2 and filter by 8 coefficients.
 */
270 #define SCALE_4_2(_sh) \
271 { \
272  src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
273  src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
274  filter0 = __lasx_xvld(filter, 0); \
275  src0 = __lasx_xvilvl_w(src1, src0); \
276  src0 = __lasx_vext2xv_hu_bu(src0); \
277  src0 = __lasx_xvdp2_w_h(filter0, src0); \
278  src0 = __lasx_xvhaddw_d_w(src0, src0); \
279  src0 = __lasx_xvsrai_w(src0, _sh); \
280  src0 = __lasx_xvmin_w(src0, vmax); \
281  dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
282  dst[1] = __lasx_xvpickve2gr_w(src0, 2); \
283  filterPos += 2; \
284  filter += 8; \
285  dst += 2; \
286 }
287 
/*
 * SCALE_16 (8-bit source variant): one 8-tap step of a long filter for four
 * output samples at once.
 *
 * Needs j, srcPos1..srcPos4, filterStart1..filterStart4, src0..src3,
 * filter0..filter3, out0, out1, out and zero in the caller's scope.
 *
 * For tap offset j it loads source bytes from srcPosN + j and coefficients
 * from filterStartN at offset dex = j << 1 (presumably a byte offset, i.e.
 * int16_t element j - confirm against the __lasx_xvldx definition). The
 * bytes are interleaved with zero to widen them, multiplied with the
 * coefficients, reduced, and added into the running accumulator `out`.
 * The callers in this file read the four sums from 32-bit elements
 * 0, 4, 2 and 6 of `out` (for outputs 0, 1, 2 and 3 respectively).
 */
288 #define SCALE_16 \
289 { \
290  int dex = j << 1; \
291  src0 = __lasx_xvldrepl_d((srcPos1 + j), 0); \
292  src1 = __lasx_xvldrepl_d((srcPos2 + j), 0); \
293  src2 = __lasx_xvldrepl_d((srcPos3 + j), 0); \
294  src3 = __lasx_xvldrepl_d((srcPos4 + j), 0); \
295  DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
296  filterStart3, dex, filterStart4, dex, filter0, \
297  filter1, filter2, filter3); \
298  src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
299  src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
300  filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
301  filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
302  src0 = __lasx_xvilvl_b(zero, src0); \
303  src1 = __lasx_xvilvl_b(zero, src1); \
304  out0 = __lasx_xvdp2_w_h(filter0, src0); \
305  out1 = __lasx_xvdp2_w_h(filter1, src1); \
306  src0 = __lasx_xvhaddw_d_w(out0, out0); \
307  src1 = __lasx_xvhaddw_d_w(out1, out1); \
308  out0 = __lasx_xvpackev_d(src1, src0); \
309  out1 = __lasx_xvpackod_d(src1, src0); \
310  out0 = __lasx_xvadd_w(out0, out1); \
311  out = __lasx_xvadd_w(out, out0); \
312 }
313 
/*
 * Horizontal scaling of 8-bit samples to 15-bit intermediates using LASX.
 *
 * Reference behaviour (the scalar fallback at the end of the function):
 *     sum    = sum over j < filterSize of src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = FFMIN(sum >> 7, (1 << 15) - 1)
 *
 * c          - not referenced in this function
 * dst        - output, dstW int16_t values
 * dstW       - number of output samples
 * src        - 8-bit input samples
 * filter     - coefficients, filterSize consecutive int16_t per output sample
 * filterPos  - index into src of the first tap of each output sample
 * filterSize - taps per output sample; 8 and 4 use the SCALE_8_* / SCALE_4_*
 *              macros, values above 8 use SCALE_16 plus a scalar tail, and
 *              every other value uses the plain C loop
 *
 * NOTE(review): the vector paths load whole 32/64/256-bit chunks and can
 * therefore read past the last tap of src/filter; presumably the caller
 * provides padded buffers - confirm.
 */
314 void ff_hscale_8_to_15_lasx(SwsInternal *c, int16_t *dst, int dstW,
315  const uint8_t *src, const int16_t *filter,
316  const int32_t *filterPos, int filterSize)
317 {
318  int i;
319  int max = (1 << 15) - 1;
320 
321  if (filterSize == 8) {
322  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
323  __m256i src8, src9, src10, src11, src12, src13, src14, src15;
324  __m256i filter0, filter1, filter2, filter3;
325  __m256i filter4, filter5, filter6, filter7;
326  __m256i vmax = __lasx_xvreplgr2vr_w(max);
327  __m256i shuf = {0x0000000400000000, 0x0000000500000001,
328  0x0000000600000002, 0x0000000700000003};
329  int len = dstW >> 4;
330  int res = dstW & 15;
    /* Main loop: 16 outputs per iteration; the macro stores and advances dst. */
331  while (len--) {
332  SCALE_8_16(7);
333  }
    /* Remainder: one step per set bit of res (8, 4, 2, then 1 output). */
334  if (res & 8) {
335  SCALE_8_8(7);
336  src0 = __lasx_xvpickev_h(src0, src0);
337  __lasx_xvstelm_d(src0, dst, 0, 0);
338  __lasx_xvstelm_d(src0, dst, 8, 2);
339  dst += 8;
340  }
341  if (res & 4) {
342  SCALE_8_4(7);
343  src0 = __lasx_xvpickev_h(src0, src0);
344  __lasx_xvstelm_d(src0, dst, 0, 0);
345  dst += 4;
346  }
347  if (res & 2) {
348  SCALE_8_2(7);
349  }
350  if (res & 1) {
351  int val = 0;
352  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
353  filter0 = __lasx_xvld(filter, 0);
354  src0 = __lasx_vext2xv_hu_bu(src0);
355  src0 = __lasx_xvdp2_w_h(filter0, src0);
356  src0 = __lasx_xvhaddw_d_w(src0, src0);
357  src0 = __lasx_xvhaddw_q_d(src0, src0);
358  val = __lasx_xvpickve2gr_w(src0, 0);
359  dst[0] = FFMIN(val >> 7, max);
360  }
361  } else if (filterSize == 4) {
362  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
363  __m256i src8, src9, src10, src11, src12, src13, src14, src15;
364  __m256i filter0, filter1, filter2, filter3;
365  __m256i vmax = __lasx_xvreplgr2vr_w(max);
366  __m256i shuf = {0x0000000400000000, 0x0000000500000001,
367  0x0000000600000002, 0x0000000700000003};
368  int len = dstW >> 4;
369  int res = dstW & 15;
370  while (len--) {
371  SCALE_4_16(7);
372  }
373  if (res & 8) {
374  SCALE_4_8(7);
375  src0 = __lasx_xvpickev_h(src1, src0);
376  src0 = __lasx_xvperm_w(src0, shuf);
377  __lasx_xvstelm_d(src0, dst, 0, 0);
378  __lasx_xvstelm_d(src0, dst, 8, 1);
379  dst += 8;
380  }
381  if (res & 4) {
382  SCALE_4_4(7);
383  src0 = __lasx_xvpickev_h(src0, src0);
384  __lasx_xvstelm_d(src0, dst, 0, 0);
385  dst += 4;
386  }
387  if (res & 2) {
388  SCALE_4_2(7);
389  }
    /* Last single output is computed in plain C. */
390  if (res & 1) {
391  int val = 0;
392  const uint8_t *srcPos = src + filterPos[0];
393 
394  for (int j = 0; j < filterSize; j++) {
395  val += ((int)srcPos[j]) * filter[j];
396  }
397  dst[0] = FFMIN(val >> 7, max);
398  }
399  } else if (filterSize > 8) {
    /* j < filterlen is equivalent to j + 8 <= filterSize: a full 8-tap step fits. */
400  int filterlen = filterSize - 7;
401  int len = dstW >> 2;
402  int res = dstW & 3;
403  __m256i zero = __lasx_xvldi(0);
404 
    /* Four outputs per iteration. */
405  while (len--) {
406  __m256i src0, src1, src2, src3;
407  __m256i filter0, filter1, filter2, filter3, out0, out1;
408  __m256i out = zero;
409  const uint8_t *srcPos1 = src + filterPos[0];
410  const uint8_t *srcPos2 = src + filterPos[1];
411  const uint8_t *srcPos3 = src + filterPos[2];
412  const uint8_t *srcPos4 = src + filterPos[3];
413  const int16_t *filterStart1 = filter;
414  const int16_t *filterStart2 = filterStart1 + filterSize;
415  const int16_t *filterStart3 = filterStart2 + filterSize;
416  const int16_t *filterStart4 = filterStart3 + filterSize;
417  int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
418 
419  for (j = 0; j < filterlen; j += 8) {
420  SCALE_16
421  }
    /* Elements 0, 4, 2, 6 of out carry the sums for outputs 0, 1, 2, 3. */
422  val1 = __lasx_xvpickve2gr_w(out, 0);
423  val2 = __lasx_xvpickve2gr_w(out, 4);
424  val3 = __lasx_xvpickve2gr_w(out, 2);
425  val4 = __lasx_xvpickve2gr_w(out, 6);
    /* Scalar tail for the taps the 8-wide loop did not cover. */
426  for (; j < filterSize; j++) {
427  val1 += ((int)srcPos1[j]) * filterStart1[j];
428  val2 += ((int)srcPos2[j]) * filterStart2[j];
429  val3 += ((int)srcPos3[j]) * filterStart3[j];
430  val4 += ((int)srcPos4[j]) * filterStart4[j];
431  }
432  dst[0] = FFMIN(val1 >> 7, max);
433  dst[1] = FFMIN(val2 >> 7, max);
434  dst[2] = FFMIN(val3 >> 7, max);
435  dst[3] = FFMIN(val4 >> 7, max);
436  dst += 4;
437  filterPos += 4;
438  filter = filterStart4 + filterSize;
439  }
    /* Up to three leftover outputs, one at a time. */
440  for(i = 0; i < res; i++) {
441  int j, val = 0;
442  const uint8_t *srcPos = src + filterPos[i];
443  __m256i src1, filter0, out0;
444 
445  for (j = 0; j < filterlen; j += 8) {
446  src1 = __lasx_xvldrepl_d((srcPos + j), 0);
447  filter0 = __lasx_xvld(filter + j, 0);
448  src1 = __lasx_xvilvl_b(zero, src1);
449  out0 = __lasx_xvdp2_w_h(filter0, src1);
450  out0 = __lasx_xvhaddw_d_w(out0, out0);
451  out0 = __lasx_xvhaddw_q_d(out0, out0);
452  val += __lasx_xvpickve2gr_w(out0, 0);
453  }
454  for (; j < filterSize; j++) {
455  val += ((int)srcPos[j]) * filter[j];
456  }
457  dst[i] = FFMIN(val >> 7, max);
458  filter += filterSize;
459  }
460  } else {
    /* Generic scalar path for every other filterSize. */
461  for (i = 0; i < dstW; i++) {
462  int val = 0;
463  const uint8_t *srcPos = src + filterPos[i];
464 
465  for (int j = 0; j < filterSize; j++) {
466  val += ((int)srcPos[j]) * filter[j];
467  }
468  dst[i] = FFMIN(val >> 7, max);
469  filter += filterSize;
470  }
471  }
472 }
473 
/*
 * Horizontal scaling of 8-bit samples to 19-bit intermediates using LASX.
 *
 * Reference behaviour (the scalar fallback at the end of the function):
 *     sum    = sum over j < filterSize of src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = FFMIN(sum >> 3, (1 << 19) - 1)
 *
 * c          - not referenced in this function
 * _dst       - output buffer; declared int16_t * but written as int32_t
 *              (dstW 32-bit values)
 * dstW       - number of output samples
 * src        - 8-bit input samples
 * filter     - coefficients, filterSize consecutive int16_t per output sample
 * filterPos  - index into src of the first tap of each output sample
 * filterSize - taps per output sample; 8 and 4 use the SCALE_8_* / SCALE_4_*
 *              macros, values above 8 use SCALE_16 plus a scalar tail, and
 *              every other value uses the plain C loop
 */
474 void ff_hscale_8_to_19_lasx(SwsInternal *c, int16_t *_dst, int dstW,
475  const uint8_t *src, const int16_t *filter,
476  const int32_t *filterPos, int filterSize)
477 {
478  int i;
479  int max = (1 << 19) - 1;
480  int32_t *dst = (int32_t *) _dst;
481 
482  if (filterSize == 8) {
483  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
484  __m256i filter0, filter1, filter2, filter3;
485  __m256i vmax = __lasx_xvreplgr2vr_w(max);
486  __m256i shuf = {0x0000000400000000, 0x0000000500000001,
487  0x0000000600000002, 0x0000000700000003};
488  int len = dstW >> 3;
489  int res = dstW & 7;
    /* Eight 32-bit outputs per iteration; the store happens here, not in the macro. */
490  while (len--) {
491  SCALE_8_8(3);
492  __lasx_xvst(src0, dst, 0);
493  dst += 8;
494  }
    /* Remainder: one step per set bit of res (4, 2, then 1 output). */
495  if (res & 4) {
496  SCALE_8_4(3);
497  __lasx_xvstelm_d(src0, dst, 0, 0);
498  __lasx_xvstelm_d(src0, dst, 8, 1);
499  dst += 4;
500  }
501  if (res & 2) {
502  SCALE_8_2(3);
503  }
504  if (res & 1) {
505  int val = 0;
506  __m256i src0, filter0, out0;
507 
508  src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
509  filter0 = __lasx_xvld(filter, 0);
510  src0 = __lasx_vext2xv_hu_bu(src0);
511  out0 = __lasx_xvdp2_w_h(filter0, src0);
512  out0 = __lasx_xvhaddw_d_w(out0, out0);
513  out0 = __lasx_xvhaddw_q_d(out0, out0);
514  val = __lasx_xvpickve2gr_w(out0, 0);
515  dst[0] = FFMIN(val >> 3, max);
516  }
517  } else if (filterSize == 4) {
518  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
519  __m256i filter0, filter1;
520  __m256i vmax = __lasx_xvreplgr2vr_w(max);
521  __m256i shuf = {0x0000000100000000, 0x0000000500000004,
522  0x0000000300000002, 0x0000000700000006};
523  int len = dstW >> 3;
524  int res = dstW & 7;
    /* SCALE_4_8 leaves its results unordered; shuf puts them in output order. */
525  while (len--) {
526  SCALE_4_8(3);
527  src0 = __lasx_xvperm_w(src0, shuf);
528  __lasx_xvst(src0, dst, 0);
529  dst += 8;
530  }
531  if (res & 4) {
532  SCALE_4_4(3);
533  __lasx_xvstelm_d(src0, dst, 0, 0);
534  __lasx_xvstelm_d(src0, dst, 8, 1);
535  dst += 4;
536  }
537  if (res & 2) {
538  SCALE_4_2(3);
539  }
    /* Last single output is computed in plain C. */
540  if (res & 1) {
541  int val = 0;
542  const uint8_t *srcPos = src + filterPos[0];
543 
544  for (int j = 0; j < filterSize; j++) {
545  val += ((int)srcPos[j]) * filter[j];
546  }
547  dst[0] = FFMIN(val >> 3, max);
548  }
549  } else if (filterSize > 8) {
550  int len = dstW >> 2;
551  int res = dstW & 3;
    /* j < filterlen is equivalent to j + 8 <= filterSize: a full 8-tap step fits. */
552  int filterlen = filterSize - 7;
553  __m256i zero = __lasx_xvldi(0);
554 
    /* Four outputs per iteration. */
555  while (len--) {
556  __m256i src0, src1, src2, src3;
557  __m256i filter0, filter1, filter2, filter3, out0, out1;
558  __m256i out = zero;
559  const uint8_t *srcPos1 = src + filterPos[0];
560  const uint8_t *srcPos2 = src + filterPos[1];
561  const uint8_t *srcPos3 = src + filterPos[2];
562  const uint8_t *srcPos4 = src + filterPos[3];
563  const int16_t *filterStart1 = filter;
564  const int16_t *filterStart2 = filterStart1 + filterSize;
565  const int16_t *filterStart3 = filterStart2 + filterSize;
566  const int16_t *filterStart4 = filterStart3 + filterSize;
567  int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
568 
569  for (j = 0; j < filterlen; j += 8) {
570  SCALE_16
571  }
    /* Elements 0, 4, 2, 6 of out carry the sums for outputs 0, 1, 2, 3. */
572  val1 = __lasx_xvpickve2gr_w(out, 0);
573  val2 = __lasx_xvpickve2gr_w(out, 4);
574  val3 = __lasx_xvpickve2gr_w(out, 2);
575  val4 = __lasx_xvpickve2gr_w(out, 6);
    /* Scalar tail for the taps the 8-wide loop did not cover. */
576  for (; j < filterSize; j++) {
577  val1 += ((int)srcPos1[j]) * filterStart1[j];
578  val2 += ((int)srcPos2[j]) * filterStart2[j];
579  val3 += ((int)srcPos3[j]) * filterStart3[j];
580  val4 += ((int)srcPos4[j]) * filterStart4[j];
581  }
582  dst[0] = FFMIN(val1 >> 3, max);
583  dst[1] = FFMIN(val2 >> 3, max);
584  dst[2] = FFMIN(val3 >> 3, max);
585  dst[3] = FFMIN(val4 >> 3, max);
586  dst += 4;
587  filterPos += 4;
588  filter = filterStart4 + filterSize;
589  }
    /* Up to three leftover outputs, one at a time. */
590  for (i = 0; i < res; i++) {
591  int j, val = 0;
592  const uint8_t *srcPos = src + filterPos[i];
593  __m256i src1, filter0, out0;
594 
595  for (j = 0; j < filterlen; j += 8) {
596  src1 = __lasx_xvldrepl_d((srcPos + j), 0);
597  filter0 = __lasx_xvld(filter + j, 0);
598  src1 = __lasx_xvilvl_b(zero, src1);
599  out0 = __lasx_xvdp2_w_h(filter0, src1);
600  out0 = __lasx_xvhaddw_d_w(out0, out0);
601  out0 = __lasx_xvhaddw_q_d(out0, out0);
602  val += __lasx_xvpickve2gr_w(out0, 0);
603  }
604  for (; j < filterSize; j++) {
605  val += ((int)srcPos[j]) * filter[j];
606  }
607  dst[i] = FFMIN(val >> 3, max);
608  filter += filterSize;
609  }
610  } else {
    /* Generic scalar path for every other filterSize. */
611  for (i = 0; i < dstW; i++) {
612  int val = 0;
613  const uint8_t *srcPos = src + filterPos[i];
614 
615  for (int j = 0; j < filterSize; j++) {
616  val += ((int)srcPos[j]) * filter[j];
617  }
618  dst[i] = FFMIN(val >> 3, max);
619  filter += filterSize;
620  }
621  }
622 }
623 
624 #undef SCALE_16
625 
/*
 * SCALE_8: 8-tap horizontal filter over 16-bit input, 4 outputs per
 * expansion.
 *
 * Declares its own vector temporaries and needs src, filter, filterPos, dst,
 * shift and v_max in the caller's scope. Four source windows are loaded from
 * src + filterPos[0..3] and multiplied with two coefficient vectors using
 * __lasx_xvdp2_w_hu_h (by its name, unsigned 16-bit samples times signed
 * 16-bit coefficients). The sums are shifted right by the per-element counts
 * in `shift`, clamped with v_max, and 32-bit elements 0, 4, 2 and 6 are
 * written to dst[0..3]. filterPos and dst are advanced by 4 and filter by 32
 * coefficients.
 */
626 #define SCALE_8 \
627 { \
628  __m256i src0, src1, src2, src3, filter0, filter1, out0, out1; \
629  DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0, \
630  src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
631  src3); \
632  filter0 = __lasx_xvld(filter, 0); \
633  filter1 = __lasx_xvld(filter, 32); \
634  src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
635  src2 = __lasx_xvpermi_q(src2, src3, 0x02); \
636  out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
637  out1 = __lasx_xvdp2_w_hu_h(src2, filter1); \
638  src0 = __lasx_xvhaddw_d_w(out0, out0); \
639  src1 = __lasx_xvhaddw_d_w(out1, out1); \
640  out0 = __lasx_xvpackev_d(src1, src0); \
641  out1 = __lasx_xvpackod_d(src1, src0); \
642  out0 = __lasx_xvadd_w(out0, out1); \
643  out0 = __lasx_xvsra_w(out0, shift); \
644  out0 = __lasx_xvmin_w(out0, v_max); \
645  dst[0] = __lasx_xvpickve2gr_w(out0, 0); \
646  dst[1] = __lasx_xvpickve2gr_w(out0, 4); \
647  dst[2] = __lasx_xvpickve2gr_w(out0, 2); \
648  dst[3] = __lasx_xvpickve2gr_w(out0, 6); \
649  filterPos += 4; \
650  filter += 32; \
651  dst += 4; \
652 }
653 
/*
 * SCALE_16 (16-bit source variant; redefined after the #undef above): one
 * 8-tap step of a long filter for four output samples at once.
 *
 * Needs j, srcPos1..srcPos4, filterStart1..filterStart4, src0..src3,
 * filter0..filter3, out0, out1 and out in the caller's scope.
 *
 * Both the samples and the coefficients are loaded with __lasx_xvldx at
 * offset dex = j << 1 (presumably a byte offset, i.e. 16-bit element j -
 * confirm against the __lasx_xvldx definition). The products are reduced and
 * added into the running accumulator `out`; the callers in this file read
 * the four sums from 32-bit elements 0, 4, 2 and 6 of `out`.
 */
654 #define SCALE_16 \
655 { \
656  int dex = j << 1; \
657  DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex, \
658  srcPos4, dex, src0, src1, src2, src3); \
659  DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
660  filterStart3, dex, filterStart4, dex, filter0, \
661  filter1, filter2, filter3); \
662  src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
663  src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
664  filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
665  filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
666  out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
667  out1 = __lasx_xvdp2_w_hu_h(src1, filter1); \
668  src0 = __lasx_xvhaddw_d_w(out0, out0); \
669  src1 = __lasx_xvhaddw_d_w(out1, out1); \
670  out0 = __lasx_xvpackev_d(src1, src0); \
671  out1 = __lasx_xvpackod_d(src1, src0); \
672  out0 = __lasx_xvadd_w(out0, out1); \
673  out = __lasx_xvadd_w(out, out0); \
674 }
675 
/*
 * Horizontal scaling of 16-bit samples to 15-bit intermediates using LASX.
 *
 * Reference behaviour (the scalar fallback at the end of the function):
 *     sum    = sum over j < filterSize of src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = FFMIN(sum >> sh, (1 << 15) - 1)
 *
 * The shift sh starts as (component depth - 1) of c->opts.src_format:
 *   - below 15 it becomes 13 for RGB-like or PAL8 sources and stays at
 *     depth - 1 otherwise;
 *   - at 15 or above it is forced to 15 when the pixel format carries the
 *     AV_PIX_FMT_FLAG_FLOAT flag.
 *
 * c          - scaler context; only c->opts.src_format is read
 * dst        - output, dstW int16_t values
 * dstW       - number of output samples
 * _src       - input buffer, read as uint16_t
 * filter     - coefficients, filterSize consecutive int16_t per output sample
 * filterPos  - index (in 16-bit samples) of the first tap of each output
 * filterSize - taps per output sample; 8 uses SCALE_8, 4 an inline vector
 *              sequence, values above 8 use SCALE_16 plus a scalar tail, and
 *              every other value uses the plain C loop
 *
 * NOTE(review): the vector paths load whole 64/256-bit chunks and can read
 * past the last tap of src/filter; presumably the caller provides padded
 * buffers - confirm.
 */
676 void ff_hscale_16_to_15_lasx(SwsInternal *c, int16_t *dst, int dstW,
677  const uint8_t *_src, const int16_t *filter,
678  const int32_t *filterPos, int filterSize)
679 {
680  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->opts.src_format);
681  int i;
682  const uint16_t *src = (const uint16_t *) _src;
683  int sh = desc->comp[0].depth - 1;
684  int max = (1 << 15) - 1;
685  int len = dstW >> 2;
686  int res = dstW & 3;
687  __m256i shift;
688  __m256i zero = __lasx_xvldi(0);
689 
690  if (sh < 15) {
691  sh = isAnyRGB(c->opts.src_format) || c->opts.src_format==AV_PIX_FMT_PAL8 ? 13 :
692  (desc->comp[0].depth - 1);
    /*
     * Bit test, not a logical AND: "flags && FLAG" is true for any non-zero
     * flags value and would force sh to 15 for non-float formats as well.
     * ff_hscale_16_to_19_lasx performs the same check with '&'.
     */
693  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
694  sh = 15;
695  }
696  shift = __lasx_xvreplgr2vr_w(sh);
697 
698  if (filterSize == 8) {
699  __m256i v_max = __lasx_xvreplgr2vr_w(max);
    /* Four outputs per SCALE_8 expansion; the macro advances dst/filter/filterPos. */
700  for (i = 0; i < len; i++) {
701  SCALE_8
702  }
    /* Up to three leftover outputs, one at a time. */
703  for (i = 0; i < res; i++) {
704  int val = 0;
705  __m256i src0, filter0, out0;
706 
707  src0 = __lasx_xvld(src + filterPos[i], 0);
708  filter0 = __lasx_xvld(filter, 0);
709  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
710  out0 = __lasx_xvhaddw_d_w(out0, out0);
711  out0 = __lasx_xvhaddw_q_d(out0, out0);
712  val = __lasx_xvpickve2gr_w(out0, 0);
713  dst[i] = FFMIN(val >> sh, max);
714  filter += 8;
715  }
716  } else if (filterSize == 4) {
717  __m256i v_max = __lasx_xvreplgr2vr_w(max);
    /* Four outputs per iteration: four 4-tap windows packed into one vector. */
718  for (i = 0; i < len; i++) {
719  __m256i src1, src2, src3, src4, src0, filter0, out0;
720 
721  src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
722  src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
723  src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
724  src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
725  filter0 = __lasx_xvld(filter, 0);
726  src1 = __lasx_xvextrins_d(src1, src2, 0x10);
727  src3 = __lasx_xvextrins_d(src3, src4, 0x10);
728  src0 = __lasx_xvpermi_q(src1, src3, 0x02);
729  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
730  out0 = __lasx_xvhaddw_d_w(out0, out0);
731  out0 = __lasx_xvsra_w(out0, shift);
732  out0 = __lasx_xvmin_w(out0, v_max);
733  dst[0] = __lasx_xvpickve2gr_w(out0, 0);
734  dst[1] = __lasx_xvpickve2gr_w(out0, 2);
735  dst[2] = __lasx_xvpickve2gr_w(out0, 4);
736  dst[3] = __lasx_xvpickve2gr_w(out0, 6);
737  dst += 4;
738  filterPos += 4;
739  filter += 16;
740  }
741  for (i = 0; i < res; i++) {
742  int val = 0;
743  const uint16_t *srcPos = src + filterPos[i];
744 
745  for (int j = 0; j < filterSize; j++) {
746  val += ((int)srcPos[j]) * filter[j];
747  }
748  dst[i] = FFMIN(val >> sh, max);
749  filter += 4;
750  }
751  } else if (filterSize > 8) {
    /* j < filterlen is equivalent to j + 8 <= filterSize: a full 8-tap step fits. */
752  int filterlen = filterSize - 7;
753 
754  for (i = 0; i < len; i++) {
755  __m256i src0, src1, src2, src3;
756  __m256i filter0, filter1, filter2, filter3, out0, out1;
757  __m256i out = zero;
758  const uint16_t *srcPos1 = src + filterPos[0];
759  const uint16_t *srcPos2 = src + filterPos[1];
760  const uint16_t *srcPos3 = src + filterPos[2];
761  const uint16_t *srcPos4 = src + filterPos[3];
762  const int16_t *filterStart1 = filter;
763  const int16_t *filterStart2 = filterStart1 + filterSize;
764  const int16_t *filterStart3 = filterStart2 + filterSize;
765  const int16_t *filterStart4 = filterStart3 + filterSize;
766  int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
767 
768  for (j = 0; j < filterlen; j += 8) {
769  SCALE_16
770  }
    /* Elements 0, 4, 2, 6 of out carry the sums for outputs 0, 1, 2, 3. */
771  val1 = __lasx_xvpickve2gr_w(out, 0);
772  val2 = __lasx_xvpickve2gr_w(out, 4);
773  val3 = __lasx_xvpickve2gr_w(out, 2);
774  val4 = __lasx_xvpickve2gr_w(out, 6);
    /* Scalar tail for the taps the 8-wide loop did not cover. */
775  for (; j < filterSize; j++) {
776  val1 += ((int)srcPos1[j]) * filterStart1[j];
777  val2 += ((int)srcPos2[j]) * filterStart2[j];
778  val3 += ((int)srcPos3[j]) * filterStart3[j];
779  val4 += ((int)srcPos4[j]) * filterStart4[j];
780  }
781  dst[0] = FFMIN(val1 >> sh, max);
782  dst[1] = FFMIN(val2 >> sh, max);
783  dst[2] = FFMIN(val3 >> sh, max);
784  dst[3] = FFMIN(val4 >> sh, max);
785  dst += 4;
786  filterPos += 4;
787  filter = filterStart4 + filterSize;
788  }
789  for (i = 0; i < res; i++) {
790  int j, val = 0;
791  const uint16_t *srcPos = src + filterPos[i];
792  __m256i src0, filter0, out0;
793 
794  for (j = 0; j < filterlen; j += 8) {
795  int dex = j << 1;
796  src0 = __lasx_xvldx(srcPos, dex);
797  filter0 = __lasx_xvldx(filter, dex);
798  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
799  out0 = __lasx_xvhaddw_d_w(out0, out0);
800  out0 = __lasx_xvhaddw_q_d(out0, out0);
801  val += __lasx_xvpickve2gr_w(out0, 0);
802  }
803  for (; j < filterSize; j++) {
804  val += ((int)srcPos[j]) * filter[j];
805  }
806  dst[i] = FFMIN(val >> sh, max);
807  filter += filterSize;
808  }
809  } else {
    /* Generic scalar path for every other filterSize. */
810  for (i = 0; i < dstW; i++) {
811  int val = 0;
812  const uint16_t *srcPos = src + filterPos[i];
813 
814  for (int j = 0; j < filterSize; j++) {
815  val += ((int)srcPos[j]) * filter[j];
816  }
817  dst[i] = FFMIN(val >> sh, max);
818  filter += filterSize;
819  }
820  }
821 }
822 
/*
 * Horizontal scaling of 16-bit samples to 19-bit intermediates using LASX.
 *
 * Reference behaviour (the scalar fallback at the end of the function):
 *     sum    = sum over j < filterSize of src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = FFMIN(sum >> sh, (1 << 19) - 1)
 *
 * The shift sh starts as (component depth - 5) of c->opts.src_format; it is
 * 9 for RGB-like or PAL8 sources with a depth below 16, and 11 when the
 * pixel format carries AV_PIX_FMT_FLAG_FLOAT.
 *
 * c          - scaler context; only c->opts.src_format is read
 * _dst       - output buffer; declared int16_t * but written as int32_t
 *              (dstW 32-bit values)
 * dstW       - number of output samples
 * _src       - input buffer, read as uint16_t
 * filter     - coefficients, filterSize consecutive int16_t per output sample
 * filterPos  - index (in 16-bit samples) of the first tap of each output
 * filterSize - taps per output sample; 8 uses SCALE_8, 4 an inline vector
 *              sequence, values above 8 use SCALE_16 plus a scalar tail, and
 *              every other value uses the plain C loop
 */
823 void ff_hscale_16_to_19_lasx(SwsInternal *c, int16_t *_dst, int dstW,
824  const uint8_t *_src, const int16_t *filter,
825  const int32_t *filterPos, int filterSize)
826 {
827  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->opts.src_format);
828  int i;
829  int32_t *dst = (int32_t *) _dst;
830  const uint16_t *src = (const uint16_t *) _src;
831  int sh = desc->comp[0].depth - 5;
832  int max = (1 << 19) - 1;
833  int len = dstW >> 2;
834  int res = dstW & 3;
835  __m256i shift;
836  __m256i zero = __lasx_xvldi(0);
837 
838  if ((isAnyRGB(c->opts.src_format) || c->opts.src_format == AV_PIX_FMT_PAL8)
839  && desc->comp[0].depth<16) {
840  sh = 9;
841  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
842  sh = 11;
843  }
    /* Vector copy of sh for the per-element shifts in the SIMD paths. */
844  shift = __lasx_xvreplgr2vr_w(sh);
845 
846  if (filterSize == 8) {
847  __m256i v_max = __lasx_xvreplgr2vr_w(max);
    /* Four outputs per SCALE_8 expansion; the macro advances dst/filter/filterPos. */
848  for (i = 0; i < len; i++) {
849  SCALE_8
850  }
    /* Up to three leftover outputs, one at a time. */
851  for (i = 0; i < res; i++) {
852  int val = 0;
853  __m256i src0, filter0, out0;
854 
855  src0 = __lasx_xvld(src + filterPos[i], 0);
856  filter0 = __lasx_xvld(filter, 0);
857  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
858  out0 = __lasx_xvhaddw_d_w(out0, out0);
859  out0 = __lasx_xvhaddw_q_d(out0, out0);
860  val = __lasx_xvpickve2gr_w(out0, 0);
861  dst[i] = FFMIN(val >> sh, max);
862  filter += 8;
863  }
864  } else if (filterSize == 4) {
865  __m256i v_max = __lasx_xvreplgr2vr_w(max);
    /* Four outputs per iteration: four 4-tap windows packed into one vector. */
866  for (i = 0; i < len; i++) {
867  __m256i src1, src2, src3, src4, src0, filter0, out0;
868 
869  src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
870  src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
871  src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
872  src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
873  filter0 = __lasx_xvld(filter, 0);
874  src1 = __lasx_xvextrins_d(src1, src2, 0x10);
875  src3 = __lasx_xvextrins_d(src3, src4, 0x10);
876  src0 = __lasx_xvpermi_q(src1, src3, 0x02);
877  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
878  out0 = __lasx_xvhaddw_d_w(out0, out0);
879  out0 = __lasx_xvsra_w(out0, shift);
880  out0 = __lasx_xvmin_w(out0, v_max);
881  dst[0] = __lasx_xvpickve2gr_w(out0, 0);
882  dst[1] = __lasx_xvpickve2gr_w(out0, 2);
883  dst[2] = __lasx_xvpickve2gr_w(out0, 4);
884  dst[3] = __lasx_xvpickve2gr_w(out0, 6);
885  dst += 4;
886  filterPos += 4;
887  filter += 16;
888  }
889  for (i = 0; i < res; i++) {
890  int val = 0;
891  const uint16_t *srcPos = src + filterPos[i];
892 
893  for (int j = 0; j < filterSize; j++) {
894  val += ((int)srcPos[j]) * filter[j];
895  }
896  dst[i] = FFMIN(val >> sh, max);
897  filter += 4;
898  }
899  } else if (filterSize > 8) {
    /* j < filterlen is equivalent to j + 8 <= filterSize: a full 8-tap step fits. */
900  int filterlen = filterSize - 7;
901 
902  for (i = 0; i < len; i ++) {
903  __m256i src0, src1, src2, src3;
904  __m256i filter0, filter1, filter2, filter3, out0, out1;
905  __m256i out = zero;
906  const uint16_t *srcPos1 = src + filterPos[0];
907  const uint16_t *srcPos2 = src + filterPos[1];
908  const uint16_t *srcPos3 = src + filterPos[2];
909  const uint16_t *srcPos4 = src + filterPos[3];
910  const int16_t *filterStart1 = filter;
911  const int16_t *filterStart2 = filterStart1 + filterSize;
912  const int16_t *filterStart3 = filterStart2 + filterSize;
913  const int16_t *filterStart4 = filterStart3 + filterSize;
914  int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
915 
916  for (j = 0; j < filterlen; j += 8) {
917  SCALE_16
918  }
    /* Elements 0, 4, 2, 6 of out carry the sums for outputs 0, 1, 2, 3. */
919  val1 = __lasx_xvpickve2gr_w(out, 0);
920  val2 = __lasx_xvpickve2gr_w(out, 4);
921  val3 = __lasx_xvpickve2gr_w(out, 2);
922  val4 = __lasx_xvpickve2gr_w(out, 6);
    /* Scalar tail for the taps the 8-wide loop did not cover. */
923  for (; j < filterSize; j++) {
924  val1 += ((int)srcPos1[j]) * filterStart1[j];
925  val2 += ((int)srcPos2[j]) * filterStart2[j];
926  val3 += ((int)srcPos3[j]) * filterStart3[j];
927  val4 += ((int)srcPos4[j]) * filterStart4[j];
928  }
929  dst[0] = FFMIN(val1 >> sh, max);
930  dst[1] = FFMIN(val2 >> sh, max);
931  dst[2] = FFMIN(val3 >> sh, max);
932  dst[3] = FFMIN(val4 >> sh, max);
933  dst += 4;
934  filterPos += 4;
935  filter = filterStart4 + filterSize;
936  }
937  for (i = 0; i < res; i++) {
938  int j, val = 0;
939  const uint16_t *srcPos = src + filterPos[i];
940  __m256i src0, filter0, out0;
941 
942  for (j = 0; j < filterlen; j += 8) {
943  int dex = j << 1;
944  src0 = __lasx_xvldx(srcPos, dex);
945  filter0 = __lasx_xvldx(filter, dex);
946  out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
947  out0 = __lasx_xvhaddw_d_w(out0, out0);
948  out0 = __lasx_xvhaddw_q_d(out0, out0);
949  val += __lasx_xvpickve2gr_w(out0, 0);
950  }
951  for (; j < filterSize; j++) {
952  val += ((int)srcPos[j]) * filter[j];
953  }
954  dst[i] = FFMIN(val >> sh, max);
955  filter += filterSize;
956  }
957  } else {
    /* Generic scalar path for every other filterSize. */
958  for (i = 0; i < dstW; i++) {
959  int val = 0;
960  const uint16_t *srcPos = src + filterPos[i];
961 
962  for (int j = 0; j < filterSize; j++) {
963  val += ((int)srcPos[j]) * filter[j];
964  }
965  dst[i] = FFMIN(val >> sh, max);
966  filter += filterSize;
967  }
968  }
969 }
970 
971 #undef SCALE_8
972 #undef SCALE_16
_dst
uint8_t * _dst
Definition: dsp.h:52
out
FILE * out
Definition: movenc.c:55
filter1
static void filter1(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:360
av_pix_fmt_desc_get
const AVPixFmtDescriptor * av_pix_fmt_desc_get(enum AVPixelFormat pix_fmt)
Definition: pixdesc.c:3170
ff_hscale_8_to_19_lasx
void ff_hscale_8_to_19_lasx(SwsInternal *c, int16_t *_dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_lasx.c:474
src1
const pixel * src1
Definition: h264pred_template.c:421
AV_PIX_FMT_FLAG_FLOAT
#define AV_PIX_FMT_FLAG_FLOAT
The pixel format contains IEEE-754 floating point values.
Definition: pixdesc.h:158
SCALE_8_4
#define SCALE_8_4(_sh)
Definition: swscale_lasx.c:125
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
SCALE_16
#define SCALE_16
Definition: swscale_lasx.c:654
max
#define max(a, b)
Definition: cuda_runtime.h:33
_src
uint8_t ptrdiff_t const uint8_t * _src
Definition: dsp.h:52
swscale_loongarch.h
val
static double val(void *priv, double ch)
Definition: aeval.c:77
SCALE_4_2
#define SCALE_4_2(_sh)
Definition: swscale_lasx.c:270
ff_hscale_8_to_15_lasx
void ff_hscale_8_to_15_lasx(SwsInternal *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_lasx.c:314
intreadwrite.h
SCALE_4_4
#define SCALE_4_4(_sh)
Definition: swscale_lasx.c:248
SCALE_8_2
#define SCALE_8_2(_sh)
Definition: swscale_lasx.c:151
ff_hscale_16_to_15_lasx
void ff_hscale_16_to_15_lasx(SwsInternal *c, int16_t *dst, int dstW, const uint8_t *_src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_lasx.c:676
SCALE_8_8
#define SCALE_8_8(_sh)
Definition: swscale_lasx.c:91
SCALE_8_16
#define SCALE_8_16(_sh)
Definition: swscale_lasx.c:26
SCALE_4_16
#define SCALE_4_16(_sh)
Definition: swscale_lasx.c:170
SCALE_8
#define SCALE_8
Definition: swscale_lasx.c:626
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
shift
static int shift(int a, int b)
Definition: bonk.c:261
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
isAnyRGB
static av_always_inline int isAnyRGB(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:856
SCALE_4_8
#define SCALE_4_8(_sh)
Definition: swscale_lasx.c:218
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
src2
const pixel * src2
Definition: h264pred_template.c:422
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
len
int len
Definition: vorbis_enc_data.h:426
AV_PIX_FMT_PAL8
@ AV_PIX_FMT_PAL8
8 bits with AV_PIX_FMT_RGB32 palette
Definition: pixfmt.h:84
SwsInternal
Definition: swscale_internal.h:317
ff_hscale_16_to_19_lasx
void ff_hscale_16_to_19_lasx(SwsInternal *c, int16_t *_dst, int dstW, const uint8_t *_src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Definition: swscale_lasx.c:823
src0
const pixel *const src0
Definition: h264pred_template.c:420
filter0
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:352
desc
const char * desc
Definition: libsvtav1.c:79
loongson_intrinsics.h
AVPixFmtDescriptor
Descriptor that unambiguously describes how the bits of a pixel are stored in the up to 4 data planes...
Definition: pixdesc.h:69
int32_t
int32_t
Definition: audioconvert.c:56
src
#define src
Definition: vp8dsp.c:248