/* FFmpeg — vp9_mc_lsx.c (documentation-page header converted to a comment) */
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen <chenhao@loongson.cn>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavcodec/vp9dsp.h"
24 #include "vp9dsp_loongarch.h"
25 
/* Byte-selection patterns for __lsx_vshuf_b, used to gather the sliding
 * 8-tap input windows for the horizontal filters.
 * Row 0 (offset 0):  pair indices 0..8 for 8-wide filtering within one
 * source vector.
 * Rows 1-2 (offsets 16/32): 4-wide cases; indices >= 16 presumably select
 * bytes from the second vshuf_b source operand — confirm against the LSX
 * __lsx_vshuf_b specification. */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34 
35 
/* Horizontal 8-tap filter for four 4-wide rows.
 * _src0.._src3: four input rows; _mask0.._mask3: vshuf_b selectors for the
 * four coefficient pairs; _filter0.._filter3: tap pairs replicated across
 * halfword lanes. Each mask gathers a shifted window from a source pair,
 * __lsx_vdp2_h_b / __lsx_vdp2add_h_b accumulate byte dot products into
 * 16-bit lanes, and the two partial sums are combined with a saturating
 * add into _out0 (rows 0+1) and _out1 (rows 2+3). */
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, \
                                   _mask0, _mask1, _mask2, _mask3, \
                                   _filter0, _filter1, _filter2, _filter3, \
                                   _out0, _out1) \
{ \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    __m128i _reg0, _reg1, _reg2, _reg3; \
    \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \
              _tmp0, _tmp1); \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \
              _tmp2, _tmp3); \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \
              _filter1, _reg0, _reg1); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \
              _tmp4, _tmp5); \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \
              _tmp6, _tmp7); \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \
              _filter3, _reg2, _reg3); \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \
}
60 
/* Horizontal 8-tap filter for four 8-wide rows.
 * Unlike the 4-wide variant, each mask gathers its window from a single
 * source vector (vshuf_b is given the same vector twice). Coefficient
 * pairs 0+1 accumulate into _reg0.._reg3, pairs 2+3 into _reg4.._reg7,
 * and the two halves are merged with saturating adds into one 16-bit
 * result vector per row (_out0.._out3). */
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3, \
                                   _mask0, _mask1, _mask2, _mask3, \
                                   _filter0, _filter1, _filter2, _filter3, \
                                   _out0, _out1, _out2, _out3) \
{ \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \
    \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, _src2,\
              _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, _src2,\
              _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, _src2,\
              _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
              _reg1, _reg2, _reg3); \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, _src2,\
              _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
              _reg5, _reg6, _reg7); \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
              _reg7, _out0, _out1, _out2, _out3); \
}
90 
/* Apply the four 8-tap coefficient pairs to four pre-interleaved input
 * vectors and return the 16-bit filtered sum (statement expression).
 * Pairs 0+1 and pairs 2+3 are accumulated separately, then combined with
 * a saturating add to avoid intermediate overflow. */
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, \
                            _filter0, _filter1, _filter2, _filter3) \
( { \
    __m128i _vec0, _vec1; \
    \
    _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \
    _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \
    _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \
    _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \
    _vec0 = __lsx_vsadd_h(_vec0, _vec1); \
    \
    _vec0; \
} )
104 
/* Horizontal 8-tap filtering of one row pair, returning rounded and
 * saturated 16-bit results (statement expression). The masks gather the
 * four shifted tap windows from the _src1:_src0 pair; the dot-product sum
 * is rounded-shifted right by 7 and then saturated to signed 8-bit range
 * (vsat_h 7) for the later pack. */
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \
                        _filt_h0, _filt_h1, _filt_h2, _filt_h3) \
( { \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
    __m128i _out; \
    \
    DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,\
              _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);\
    _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \
                               _filt_h2, _filt_h3); \
    _out = __lsx_vsrari_h(_out, 7); \
    _out = __lsx_vsat_h(_out, 7); \
    \
    _out; \
} )
120 
/* Load four vectors from _src at successive _stride offsets into
 * _src0.._src3, advancing _src by 3 * _stride as a side effect.
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe inside unbraced if/else bodies (the bare-{} form plus the
 * caller's trailing ';' would otherwise break an else branch). */
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
do { \
    _src0 = __lsx_vld(_src, 0); \
    _src += _stride; \
    _src1 = __lsx_vld(_src, 0); \
    _src += _stride; \
    _src2 = __lsx_vld(_src, 0); \
    _src += _stride; \
    _src3 = __lsx_vld(_src, 0); \
} while (0)
131 
/* 8-tap horizontal filter, 4x4 block.
 * src is backed up by 3 so the 8-tap window is centred on each output
 * pixel. Input bytes are XORed with 128 to move them into signed range
 * for the signed dot-product intrinsics; after the saturating
 * round-shift by 7 the bias is XORed back. One 32-bit word per output
 * row is stored with vstelm_w. */
static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out, out0, out1;

    /* offset 16: the 4-width row of mc_filt_mask_arr */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;
    /* replicate the four 16-bit coefficient pairs across all lanes */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* round-shift by 7 and narrow to bytes with saturation */
    out = __lsx_vssrarni_b_h(out1, out0, 7);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstelm_w(out, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 3);
}
163 
/* 8-tap horizontal filter, 4x8 block: two 4-row passes of the same
 * scheme as common_hz_8t_4x4_lsx, packed into two byte vectors and
 * stored one 32-bit word per row. */
static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* back up by 3 to centre the 8-tap window */
    uint8_t *_src = (uint8_t*)src - 3;

    /* offset 16: the 4-width row of mc_filt_mask_arr */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* rows 0-3 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1);
    /* rows 4-7 */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out2, out3);
    /* round-shift by 7, narrow to bytes, remove the signed bias */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 3);
}
216 
217 static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
218  uint8_t *dst, int32_t dst_stride,
219  const int8_t *filter, int32_t height)
220 {
221  if (height == 4) {
222  common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
223  } else if (height == 8) {
224  common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
225  }
226 }
227 
/* 8-tap horizontal filter, 8x4 block. Uses the 8-width mask row
 * (offset 0 of mc_filt_mask_arr) and the 8-wide filter macro; results
 * are stored one 64-bit doubleword per row. */
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* centre the 8-tap window on the output pixel */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* bias to signed range for the signed dot products */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
}
259 
/* 8-tap horizontal filter, 8-wide blocks with height a multiple of 4.
 * Same per-iteration scheme as common_hz_8t_8x4_lsx, looped height/4
 * times. height is assumed to be a positive multiple of 4. */
static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* centre the 8-tap window on the output pixel */
    uint8_t* _src = (uint8_t*)src - 3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* load 4 rows, filter, round/narrow, store one dword per row */
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
}
301 
302 static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
303  uint8_t *dst, int32_t dst_stride,
304  const int8_t *filter, int32_t height)
305 {
306  if (height == 4) {
307  common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
308  } else {
309  common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride,
310  filter, height);
311  }
312 }
313 
/* 8-tap horizontal filter, 16-wide blocks: two rows per iteration, each
 * row handled as two overlapping 8-wide loads (offsets 0 and 8) so the
 * 8-wide filter macro covers all 16 output pixels. height is assumed
 * even. */
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;
    int32_t stride = src_stride << 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    /* centre the 8-tap window on the output pixel */
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        const uint8_t* _src = src + src_stride;   /* second row of the pair */
        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
        DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;
        src += stride;
    }
}
349 
/* 8-tap horizontal filter, 32-wide blocks: one row per half-iteration
 * (two rows per loop pass). Loads at offsets 0/16/24 plus a vshuf_b
 * splice (src1) reconstruct the middle 16 bytes so four 8-wide filter
 * passes cover the full 32 pixels. height is assumed even. */
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 1;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* selector splicing the high half of src0 with the low half of src2 */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* first row */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* second row */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;

        dst += dst_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        dst += dst_stride;
    }
}
399 
/* 8-tap horizontal filter, 64-wide blocks: one row per iteration, done
 * as two 32-wide passes (byte offsets 0..31 and 32..63) using the same
 * load/splice scheme as common_hz_8t_32w_lsx. */
static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt = height;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    /* selector splicing the high half of src0 with the low half of src2 */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (; loop_cnt--;) {
        /* left 32 pixels */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* right 32 pixels */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
446 
/* 8-tap vertical filter, 4-wide blocks, 4 rows per iteration.
 * Seven context rows are preloaded (src starts src_stride3 above the
 * first output row); byte interleaves build column-wise tap vectors and
 * three of them (reg0..reg2) are carried across iterations so each pass
 * only loads four new rows. height is assumed a positive multiple of 4. */
static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* preload rows -3..3 relative to the first output row */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* interleave adjacent rows, then pair the interleaves into dwords so
     * each reg holds two 4-wide column groups */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
              tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* bias to signed range for the signed dot products */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
                                   filter2, filter3);
        /* round-shift by 7, narrow, remove the signed bias */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* slide the tap context window down four rows */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
509 
/* 8-tap vertical filter, 8-wide blocks, 4 rows per iteration.
 * Like common_vt_8t_4w_lsx but each interleave covers a full 8-wide row
 * pair (low halves only), so six carried registers (reg0..reg5) hold the
 * tap context. height is assumed a positive multiple of 4. */
static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* start src_stride3 above the first output row (7 context rows) */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    /* bias to signed range, then interleave adjacent rows column-wise */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* rows 0/1 use the carried context, rows 2/3 the freshly built one */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
                                   filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
                                   filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
                                   filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* slide the tap context window down four rows */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
581 
/* 8-tap vertical filter, 16-wide blocks, 4 rows per iteration.
 * Both low (vilvl_b) and high (vilvh_b) interleaves of each row pair are
 * kept, giving twelve carried context registers (reg0..reg11): the low
 * group filters bytes 0-7, the high group bytes 8-15 of each output row.
 * height is assumed a positive multiple of 4. */
static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* start src_stride3 above the first output row (7 context rows) */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* fresh interleaves: low halves -> src0..src3, high -> src4,5,7,8 */
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src4, src5, src7, src8);
        /* output rows 0 and 1 */
        tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
        /* output rows 2 and 3 */
        tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
                                   filter2, filter3);
        tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
                                   filter2, filter3);
        tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
                                   filter2, filter3);
        tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
                                   filter2, filter3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;

        /* slide the tap context window down four rows */
        reg0 = reg2;
        reg1 = src0;
        reg2 = src2;
        reg3 = reg5;
        reg4 = src1;
        reg5 = src3;
        reg6 = reg8;
        reg7 = src4;
        reg8 = src7;
        reg9 = reg11;
        reg10 = src5;
        reg11 = src8;
        src6 = src10;
    }
}
672 
/* 8-tap vertical filter for widths that are multiples of 16: runs the
 * common_vt_8t_16w_lsx scheme once per 16-column stripe (width >> 4
 * stripes), advancing src and dst by 16 bytes between stripes. Stores
 * use vstx with precomputed dst strides instead of pointer bumping.
 * height is assumed a positive multiple of 4. */
static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t cnt = width >> 4;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    /* start src_stride3 above the first output row (7 context rows) */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {        /* one pass per 16-column stripe */
        uint32_t loop_cnt = height >> 2;

        src_tmp = _src;
        dst_tmp = dst;

        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;

        /* bias to signed range, build low/high interleaved tap context */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* output rows 0 and 1 */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            /* output rows 2 and 3 */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_tmp, dst_stride2);
            __lsx_vstx(tmp1, dst_tmp, dst_stride3);
            dst_tmp += dst_stride4;

            /* slide the tap context window down four rows */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* next 16-column stripe */
        _src += 16;
        dst += 16;
    }
}
781 
782 static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
783  uint8_t *dst, int32_t dst_stride,
784  const int8_t *filter, int32_t height)
785 {
786  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
787 }
788 
789 static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
790  uint8_t *dst, int32_t dst_stride,
791  const int8_t *filter, int32_t height)
792 {
793  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride,
794  filter, height, 64);
795 }
796 
/* 2-D sub-pixel filter (8-tap horizontal then 8-tap vertical) for a
 * 4-pixel-wide column, producing 4 output rows per loop iteration.
 *
 * src/src_stride:        source pixels; the function reads starting 3 rows
 *                        above and 3 columns left of 'src' (8-tap support).
 * dst/dst_stride:        destination pixels (overwritten).
 * filter_horiz/filter_vert: 8 filter taps each, stored as 4 int16 pairs.
 * height:                number of output rows; must be a multiple of 4.
 */
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* Byte shuffle pattern selecting the high half of one vector and the
     * low half of the next (used to realign intermediate rows). */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 rows and 3 columns for the 8-tap filter support region. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    /* 4-width shuffle masks start at offset 16 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
              filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load the first 7 source rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Flip the sign bit (xor 128) so the unsigned pixels can be used by
     * the signed dot-product intrinsics; undone after filtering. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal 8-tap pass over the 7 prologue rows (two rows packed into
     * each HORIZ_8TAP_FILT call for the 4-wide case). */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Interleave consecutive intermediate rows for the vertical pass. */
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        /* Load 4 new source rows and bias them to the signed domain. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on the new rows, then vertical 8-tap pass. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/shift by 7, saturate-pack to bytes, undo the 128 bias. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        /* Store 4 rows of 4 bytes each. */
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Rotate the vertical-filter history registers for the next 4 rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
884 
/* 2-D sub-pixel filter (8-tap horizontal then 8-tap vertical) for an
 * 8-pixel-wide column, producing 4 output rows per loop iteration.
 *
 * src/src_stride:        source pixels; reads start 3 rows above and
 *                        3 columns left of 'src' (8-tap support).
 * dst/dst_stride:        destination pixels (overwritten).
 * filter_horiz/filter_vert: 8 filter taps each, stored as 4 int16 pairs.
 * height:                number of output rows; must be a multiple of 4.
 */
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 rows and 3 columns for the 8-tap filter support region. */
    uint8_t* _src = (uint8_t*)src - src_stride3 - 3;

    /* 8-width shuffle masks start at offset 0 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load the first 7 source rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Flip sign bit (xor 128): unsigned pixels -> signed domain for the
     * signed dot-product intrinsics; undone after filtering. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal 8-tap pass over each of the 7 prologue rows. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Interleave consecutive intermediate rows for the vertical pass. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        /* Load 4 new rows, bias to signed, run the horizontal pass,
         * then vertical 8-tap on the rolling row history. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/shift by 7, saturate-pack to bytes, undo the 128 bias. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        /* Store 4 rows of 8 bytes each. */
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Rotate the vertical-filter history registers. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
990 
/* 2-D 8-tap filter for a 16-pixel-wide block: processed as two adjacent
 * 8-wide columns. */
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_8w_lsx(src + (col << 3), src_stride,
                                 dst + (col << 3), dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1006 
/* 2-D 8-tap filter for a 32-pixel-wide block: processed as four adjacent
 * 8-wide columns. */
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_8w_lsx(src + (col << 3), src_stride,
                                 dst + (col << 3), dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1022 
/* 2-D 8-tap filter for a 64-pixel-wide block: processed as eight adjacent
 * 8-wide columns. */
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_8w_lsx(src + (col << 3), src_stride,
                                 dst + (col << 3), dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1038 
1039 static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
1040  uint8_t *dst, int32_t dst_stride,
1041  int32_t height)
1042 {
1043  int32_t cnt = height >> 2;
1044  __m128i src0, src1, src2, src3;
1045 
1046  for (;cnt--;) {
1047  src0 = __lsx_vldrepl_d(src, 0);
1048  src += src_stride;
1049  src1 = __lsx_vldrepl_d(src, 0);
1050  src += src_stride;
1051  src2 = __lsx_vldrepl_d(src, 0);
1052  src += src_stride;
1053  src3 = __lsx_vldrepl_d(src, 0);
1054  src += src_stride;
1055  __lsx_vstelm_d(src0, dst, 0, 0);
1056  dst += dst_stride;
1057  __lsx_vstelm_d(src1, dst, 0, 0);
1058  dst += dst_stride;
1059  __lsx_vstelm_d(src2, dst, 0, 0);
1060  dst += dst_stride;
1061  __lsx_vstelm_d(src3, dst, 0, 0);
1062  dst += dst_stride;
1063  }
1064 }
1065 
1066 static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
1067  uint8_t *dst, int32_t dst_stride,
1068  int32_t height)
1069 {
1070  int32_t cnt = height >> 2;
1071  __m128i src0, src1, src2, src3;
1072  int32_t src_stride2 = src_stride << 1;
1073  int32_t src_stride3 = src_stride + src_stride2;
1074  int32_t src_stride4 = src_stride2 << 1;
1075  int32_t dst_stride2 = dst_stride << 1;
1076  int32_t dst_stride3 = dst_stride2 + dst_stride;
1077  int32_t dst_stride4 = dst_stride2 << 1;
1078  uint8_t *_src = (uint8_t*)src;
1079 
1080  for (;cnt--;) {
1081  src0 = __lsx_vld(_src, 0);
1082  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
1083  src3 = __lsx_vldx(_src, src_stride3);
1084  _src += src_stride4;
1085  __lsx_vst(src0, dst, 0);
1086  __lsx_vstx(src1, dst, dst_stride);
1087  __lsx_vstx(src2, dst, dst_stride2);
1088  __lsx_vstx(src3, dst, dst_stride3);
1089  dst += dst_stride4;
1090  }
1091 }
1092 
1093 static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
1094  uint8_t *dst, int32_t dst_stride,
1095  int32_t height)
1096 {
1097  int32_t cnt = height >> 2;
1098  uint8_t *src_tmp1 = (uint8_t*)src;
1099  uint8_t *dst_tmp1 = dst;
1100  uint8_t *src_tmp2 = src_tmp1 + 16;
1101  uint8_t *dst_tmp2 = dst_tmp1 + 16;
1102  int32_t src_stride2 = src_stride << 1;
1103  int32_t src_stride3 = src_stride + src_stride2;
1104  int32_t src_stride4 = src_stride2 << 1;
1105  int32_t dst_stride2 = dst_stride << 1;
1106  int32_t dst_stride3 = dst_stride2 + dst_stride;
1107  int32_t dst_stride4 = dst_stride2 << 1;
1108  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1109 
1110  for (;cnt--;) {
1111  src0 = __lsx_vld(src_tmp1, 0);
1112  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
1113  src1, src2);
1114  src3 = __lsx_vldx(src_tmp1, src_stride3);
1115  src_tmp1 += src_stride4;
1116 
1117  src4 = __lsx_vld(src_tmp2, 0);
1118  DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
1119  src5, src6);
1120  src7 = __lsx_vldx(src_tmp2, src_stride3);
1121  src_tmp2 += src_stride4;
1122 
1123  __lsx_vst(src0, dst_tmp1, 0);
1124  __lsx_vstx(src1, dst_tmp1, dst_stride);
1125  __lsx_vstx(src2, dst_tmp1, dst_stride2);
1126  __lsx_vstx(src3, dst_tmp1, dst_stride3);
1127  dst_tmp1 += dst_stride4;
1128  __lsx_vst(src4, dst_tmp2, 0);
1129  __lsx_vstx(src5, dst_tmp2, dst_stride);
1130  __lsx_vstx(src6, dst_tmp2, dst_stride2);
1131  __lsx_vstx(src7, dst_tmp2, dst_stride3);
1132  dst_tmp2 += dst_stride4;
1133  }
1134 }
1135 
1136 static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
1137  uint8_t *dst, int32_t dst_stride,
1138  int32_t height)
1139 {
1140  int32_t cnt = height >> 2;
1141  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
1142  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
1143 
1144  for (;cnt--;) {
1145  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1146  src0, src1, src2, src3);
1147  src += src_stride;
1148  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1149  src4, src5, src6, src7);
1150  src += src_stride;
1151  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1152  src8, src9, src10, src11);
1153  src += src_stride;
1154  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
1155  src12, src13, src14, src15);
1156  src += src_stride;
1157  __lsx_vst(src0, dst, 0);
1158  __lsx_vst(src1, dst, 16);
1159  __lsx_vst(src2, dst, 32);
1160  __lsx_vst(src3, dst, 48);
1161  dst += dst_stride;
1162  __lsx_vst(src4, dst, 0);
1163  __lsx_vst(src5, dst, 16);
1164  __lsx_vst(src6, dst, 32);
1165  __lsx_vst(src7, dst, 48);
1166  dst += dst_stride;
1167  __lsx_vst(src8, dst, 0);
1168  __lsx_vst(src9, dst, 16);
1169  __lsx_vst(src10, dst, 32);
1170  __lsx_vst(src11, dst, 48);
1171  dst += dst_stride;
1172  __lsx_vst(src12, dst, 0);
1173  __lsx_vst(src13, dst, 16);
1174  __lsx_vst(src14, dst, 32);
1175  __lsx_vst(src15, dst, 48);
1176  dst += dst_stride;
1177  }
1178 }
1179 
/* Horizontal 8-tap sub-pixel filter for a 4x4 block, with the result
 * rounded-averaged into the existing destination (VP9 "avg" MC).
 *
 * src/src_stride: source pixels; reads start 3 columns left of 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1;
    __m128i dst0, dst1, dst2, dst3;

    /* 4-width shuffle masks start at offset 16 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;                             /* 8-tap horizontal support */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    /* Bias pixels by 128 (sign-bit flip) for the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    /* Load the 4 existing destination rows (4 bytes each) and pack them
     * into one vector in row order. */
    dst0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst0 = __lsx_vilvl_w(dst1, dst0);
    dst1 = __lsx_vilvl_w(dst3, dst2);
    dst0 = __lsx_vilvl_d(dst1, dst0);
    /* Round/shift by 7, saturate-pack, undo the bias, then average. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    dst0 = __lsx_vavgr_bu(tmp0, dst0);
    /* Store the 4 averaged rows. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
}
1224 
/* Horizontal 8-tap sub-pixel filter for a 4x8 block, with the result
 * rounded-averaged into the existing destination (VP9 "avg" MC).
 *
 * src/src_stride: source pixels; reads start 3 columns left of 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter)
{
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1;

    /* 4-width shuffle masks start at offset 16 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    src -= 3;                             /* 8-tap horizontal support */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* First 4 source rows, biased by 128 for the signed dot products. */
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    /* Pack destination rows 0-3 (4 bytes each) into dst0... */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst0 = __lsx_vilvl_d(tmp1, tmp0);

    /* ...and destination rows 4-7 into dst1. */
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst1 = __lsx_vilvl_d(tmp1, tmp0);
    /* Horizontal filter rows 0-3, then load and filter rows 4-7. */
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp0, tmp1);
    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                               filter0, filter1, filter2, filter3, tmp2, tmp3);
    /* Round/shift by 7, saturate-pack, undo the bias, average with dst. */
    DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
              tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
    /* Store the 8 averaged rows, 4 bytes per row. */
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 3);
}
1296 
/* Horizontal 8-tap filter + average for 4-wide blocks: dispatch on block
 * height (only 4 and 8 are handled; other heights are a no-op). */
static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst,
                                          dst_stride, filter);
        break;
    case 8:
        common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst,
                                          dst_stride, filter);
        break;
    default:
        break;
    }
}
1309 
/* Horizontal 8-tap sub-pixel filter for an 8-pixel-wide block, with the
 * result rounded-averaged into the existing destination; 4 rows per loop
 * iteration (height must be a multiple of 4).
 *
 * src/src_stride: source pixels; reads start 3 columns left of 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    int32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride2 + src_stride;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns for the 8-tap horizontal support region. */
    uint8_t *_src = (uint8_t*)src - 3;

    /* 8-width shuffle masks start at offset 0 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Load 4 source rows and bias them by 128 (sign-bit flip) for
         * the signed dot-product intrinsics. */
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3,filter0, filter1, filter2, filter3, tmp0, tmp1, tmp2, tmp3);
        /* Load the 4 existing destination rows (8 bytes each). */
        dst0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
        /* Round/shift by 7, saturate-pack, undo the bias, average. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
        /* Store the 4 averaged rows, 8 bytes per row. */
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}
1364 
/* Horizontal 8-tap sub-pixel filter for a 16-pixel-wide block, with the
 * result rounded-averaged into the existing destination; 2 rows per loop
 * iteration (height must be even). Each row is split into two 8-byte
 * halves loaded at offsets 0 and 8.
 *
 * src/src_stride: source pixels; reads start 3 columns left of 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height >> 1;
    int32_t dst_stride2 = dst_stride << 1;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    /* 8-width shuffle masks start at offset 0 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;                             /* 8-tap horizontal support */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* Two rows, each as a pair of vectors at offsets 0 and 8. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
        src += src_stride;
        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
        src += src_stride;
        /* Existing destination rows used for the final average. */
        dst0 = __lsx_vld(dst_tmp, 0);
        dst1 = __lsx_vldx(dst_tmp, dst_stride);
        dst_tmp += dst_stride2;
        /* Bias by 128 for the signed dot products. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        /* Gather the 4 tap-pair neighborhoods per vector via the masks. */
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
                  mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
                  mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
                  mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
                  mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        /* 8-tap convolution: dot products with tap pairs 0/2, accumulate
         * pairs 1/3, then saturating add of the partial sums. */
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
                  filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, tmp11,
                  filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
                  tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, tmp10,
                  tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        /* Round/shift by 7, saturate-pack, undo the bias, average. */
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
        DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride2;
    }
}
1422 
1423 static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
1424  int32_t src_stride,
1425  uint8_t *dst, int32_t dst_stride,
1426  const int8_t *filter,
1427  int32_t height)
1428 {
1429  uint32_t loop_cnt = height;
1430  uint8_t *dst_tmp = dst;
1431  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
1432  __m128i mask0, mask1, mask2, mask3, dst0, dst1;
1433  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1434  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1435  __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
1436 
1437  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
1438  src -= 3;
1439  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1440  mask3 = __lsx_vaddi_bu(mask0, 6);
1441  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
1442  filter0, filter1, filter2, filter3);
1443 
1444  for (;loop_cnt--;) {
1445  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
1446  src3 = __lsx_vld(src, 24);
1447  src1 = __lsx_vshuf_b(src2, src0, shuff);
1448  src += src_stride;
1449  DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
1450  dst_tmp += dst_stride;
1451  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1452  src0, src1, src2, src3);
1453  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2,
1454  src2, mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
1455  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2,
1456  src2, mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
1457  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2,
1458  src2, mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
1459  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2,
1460  src2, mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
1461  DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0,
1462  tmp3, filter0, tmp0, tmp1, tmp2, tmp3);
1463  DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
1464  tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
1465  DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1,
1466  tmp2, tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
1467  DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
1468  tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
1469  DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
1470  tmp0, tmp1, tmp2, tmp3);
1471  DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
1472  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
1473  DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
1474  __lsx_vst(dst0, dst, 0);
1475  __lsx_vst(dst1, dst, 16);
1476  dst += dst_stride;
1477  }
1478 }
1479 
/* Horizontal 8-tap sub-pixel filter for a 64-pixel-wide block, with the
 * result rounded-averaged into the existing destination; one row per loop
 * iteration, processed as two 32-wide halves (offsets 0..31 and 32..63).
 *
 * src/src_stride: source pixels; reads start 3 columns left of 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    int32_t loop_cnt = height;
    __m128i src0, src1, src2, src3;
    __m128i filter0, filter1, filter2, filter3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3, dst0, dst1;
    /* Selects the high half of one vector and the low half of the next. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    /* 8-width shuffle masks start at offset 0 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    src -= 3;                             /* 8-tap horizontal support */
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    for (;loop_cnt--;) {
        /* First 32-wide half of the row: filter, average, store. */
        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
        /* Bias by 128 for the signed dot products; undone after packing. */
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);

        /* Second 32-wide half (offsets 32..63): same pipeline. */
        DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
            mask3, filter0, filter1, filter2, filter3, out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
}
1532 
/* Vertical 8-tap sub-pixel filter for a 4-pixel-wide column, with the
 * result rounded-averaged into the existing destination; 4 rows per loop
 * iteration (height must be a multiple of 4).
 *
 * src/src_stride: source pixels; reads start 3 rows above 'src'.
 * dst/dst_stride: destination pixels; read (for averaging) and written.
 * filter:         8 filter taps stored as 4 int16 pairs.
 */
static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 rows for the 8-tap vertical support region. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    /* Prologue: load the first 7 source rows and build the interleaved
     * row-pair registers reg0..reg2 used by the vertical filter. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    /* Bias by 128 (sign-bit flip) for the signed dot products. */
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        /* Load 4 new source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Pack the 4 existing destination rows (4 bytes each) into src0. */
        src0 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
        src0 = __lsx_vilvl_d(src1, src0);
        /* Interleave the new rows with their predecessors and bias them. */
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        /* Vertical 8-tap filter over the rolling row history. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0,
                                   filter1, filter2, filter3);
        /* Round/shift by 7, saturate-pack, undo the bias, average. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src0);
        /* Store the 4 averaged rows, 4 bytes per row. */
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
        /* Rotate the vertical-filter history registers. */
        reg0 = reg2;
        reg1 = reg3;
        reg2 = reg4;
        src6 = src10;
    }
}
1608 
/* Vertical 8-tap filter over an 8-pixel-wide column, with the filtered
 * result rounding-averaged into the existing destination (avg MC).
 * Four output rows are produced per loop iteration (height is expected
 * to be a multiple of 4).  filter points to 8 int8 taps. */
static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i filter0, filter1, filter2, filter3;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Step back 3 rows so the 8-tap window is centred on the output row. */
    uint8_t* _src = (uint8_t*)src - src_stride3;

    /* Split the 8 taps into four replicated 16-bit coefficient pairs. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);

    /* Prologue: load the first 7 source rows of the sliding window. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    /* Flip the sign bit (xor 128) so signed byte multiplies can be used;
     * it is flipped back after filtering. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    /* Interleave adjacent rows into the pairwise layout the dot-product
     * filter macro expects. */
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2,
              src1, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        /* Load the next four source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Load four 8-byte destination rows for the averaging step. */
        src0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        /* 8-tap vertical filtering for the four output rows. */
        out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0,
                                   filter1, filter2, filter3);
        out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0,
                                   filter1, filter2, filter3);
        out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0,
                                   filter1, filter2, filter3);
        out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0,
                                   filter1, filter2, filter3);
        /* Round-shift by 7 with saturation to 8 bits, undo the sign bias,
         * then rounding-average with the destination rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the interleaved filter window down by four rows. */
        reg0 = reg2;
        reg1 = tmp0;
        reg2 = tmp2;
        reg3 = reg5;
        reg4 = tmp1;
        reg5 = tmp3;
        src6 = src10;
    }
}
1692 
/* Vertical 8-tap filter plus destination averaging for blocks whose
 * width is a multiple of 16.  The block is processed as width/16
 * independent 16-pixel columns; within a column, four output rows are
 * produced per iteration.  The low and high 8 lanes of each 16-byte row
 * are filtered separately (reg0..reg5 hold the low-half window,
 * reg6..reg11 the high half). */
static void common_vt_8t_and_aver_dst_16w_mult_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter,
                                                   int32_t height,
                                                   int32_t width)
{
    uint8_t *src_tmp;
    uint32_t cnt = width >> 4;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filter0, filter1, filter2, filter3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    /* Step back 3 rows so the 8-tap window is centred on the output row. */
    uint8_t *_src = (uint8_t*)src - src_stride3;

    /* Split the 8 taps into four replicated 16-bit coefficient pairs. */
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
              filter0, filter1, filter2, filter3);
    for (;cnt--;) {                     /* one pass per 16-pixel column */
        uint32_t loop_cnt = height >> 2;
        uint8_t *dst_reg = dst;

        /* Prologue: load and sign-bias the first 7 rows of this column. */
        src_tmp = _src;
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src1, src2);
        src3 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src5, src6);
        src_tmp += src_stride3;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
        src6 = __lsx_vxori_b(src6, 128);
        /* Low-half (ilvl) and high-half (ilvh) row interleavings. */
        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg0, reg1, reg2, reg3);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
                  reg6, reg7, reg8, reg9);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

        for (;loop_cnt--;) {
            /* Next four source rows. */
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                      src8, src9);
            src10 = __lsx_vldx(src_tmp, src_stride3);
            src_tmp += src_stride4;
            DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                      128, src7, src8, src9, src10);
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src0, src1, src2, src3);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                      src10, src9, src4, src5, src7, src8);
            /* Rows 0-1: filter low and high halves, merge, average, store. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vld(dst_reg, 0);
            tmp3 = __lsx_vldx(dst_reg, dst_stride);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vst(tmp0, dst_reg, 0);
            __lsx_vstx(tmp1, dst_reg, dst_stride);
            /* Rows 2-3: same pipeline with the window advanced two rows. */
            tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0,
                                       filter1, filter2, filter3);
            tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0,
                                       filter1, filter2, filter3);
            tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0,
                                       filter1, filter2, filter3);
            tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0,
                                       filter1, filter2, filter3);
            DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                      tmp0, tmp1);
            DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
            tmp2 = __lsx_vldx(dst_reg, dst_stride2);
            tmp3 = __lsx_vldx(dst_reg, dst_stride3);
            DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
            __lsx_vstx(tmp0, dst_reg, dst_stride2);
            __lsx_vstx(tmp1, dst_reg, dst_stride3);
            dst_reg += dst_stride4;

            /* Slide both half-windows down by four rows. */
            reg0 = reg2;
            reg1 = src0;
            reg2 = src2;
            reg3 = reg5;
            reg4 = src1;
            reg5 = src3;
            reg6 = reg8;
            reg7 = src4;
            reg8 = src7;
            reg9 = reg11;
            reg10 = src5;
            reg11 = src8;
            src6 = src10;
        }
        /* Advance to the next 16-pixel column. */
        _src += 16;
        dst += 16;
    }
}
1807 
/* Vertical 8-tap avg MC for a 16-pixel-wide block. */
static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 16);
}
1817 
/* Vertical 8-tap avg MC for a 32-pixel-wide block. */
static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 32);
}
1827 
/* Vertical 8-tap avg MC for a 64-pixel-wide block. */
static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
                                           filter, height, 64);
}
1837 
/* 2-D (horizontal then vertical) 8-tap filter on a 4-pixel-wide block,
 * averaging the result into dst.  Horizontal filtering runs first and
 * its intermediate rows feed the vertical pass; four output rows are
 * produced per loop iteration. */
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i out0, out1;
    /* Byte indices used to shift filtered halves between registers. */
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns and 3 rows to centre both 8-tap windows. */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 4-width shuffle masks live at offset 16 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 16);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load and sign-bias the first 7 source rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;

    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass for the 7 prologue rows (two rows per call). */
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pack intermediate rows pairwise for the vertical dot products. */
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        /* Next four source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        /* Gather four 4-byte destination rows into src2 for averaging. */
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src4 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src5 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
        src2 = __lsx_vilvl_d(src3, src2);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* Horizontal pass on the new rows, then vertical filtering. */
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/saturate, undo the sign bias, average with dst, store 4x4. */
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src2);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;

        /* Slide the vertical window down by four rows. */
        tmp5 = src1;
        tmp0 = tmp2;
        tmp1 = tmp4;
        tmp2 = src0;
    }
}
1940 
/* 2-D (horizontal then vertical) 8-tap filter on an 8-pixel-wide block,
 * averaging the result into dst.  Each source row is horizontally
 * filtered on its own, the intermediates are packed pairwise, and the
 * vertical pass produces four output rows per loop iteration. */
static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
                                                  int32_t height)
{
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m128i out0, out1;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    /* Back up 3 columns and 3 rows to centre both 8-tap windows. */
    uint8_t* _src = (uint8_t*)src - 3 - src_stride3;

    /* 8-width shuffle masks live at offset 0 of mc_filt_mask_arr. */
    mask0 = __lsx_vld(mc_filt_mask_arr, 0);
    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    /* Prologue: load and sign-bias the first 7 source rows. */
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);

    /* Horizontal pass, one row per call, for the 7 prologue rows. */
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    /* Pack intermediate rows pairwise for the vertical dot products. */
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        /* Next four source rows. */
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        /* For each new row: horizontal filter, pack with the previous
         * intermediate, then vertical 8-tap. */
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                                filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        /* Round/saturate, undo the sign bias, average with dst rows. */
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        src5 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src7 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src8 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src9 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;

        /* Slide the vertical window down by four rows. */
        src6 = src10;
        tmp0 = tmp2;
        tmp1 = tmp3;
        tmp2 = src1;
        tmp4 = tmp6;
        tmp5 = src0;
        tmp6 = src2;
    }
}
2059 
/* 2-D 8-tap avg MC for a 16-wide block: filter it as two independent
 * 8-pixel columns. */
static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2079 
/* 2-D 8-tap avg MC for a 32-wide block: filter it as four independent
 * 8-pixel columns. */
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2099 
/* 2-D 8-tap avg MC for a 64-wide block: filter it as eight independent
 * 8-pixel columns. */
static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 8; col++) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src + (col << 3), src_stride,
                                              dst + (col << 3), dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
2119 
/* Rounding-average an 8-pixel-wide block into dst (dst = avgr(src, dst)).
 * Four rows are processed per iteration (height is expected to be a
 * multiple of 4). */
static void avg_width8_lsx(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, dst0, dst1;
    __m128i tmp0, tmp1, tmp2, tmp3;

    for (;cnt--;) {
        /* Pack four 8-byte source rows into two vectors. */
        tmp0 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp1 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp2 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        tmp3 = __lsx_vldrepl_d(src, 0);
        src += src_stride;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, src0, src1);
        /* Pack the matching four destination rows the same way. */
        tmp0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        tmp3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
        /* Average and scatter the two vectors back as four rows. */
        DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
}
2159 
/* Rounding-average a 16-pixel-wide block into dst, four rows per
 * iteration (height is expected to be a multiple of 4). */
static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    __m128i src0, src1, src2, src3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t* _src = (uint8_t*)src;

    for (;cnt--;) {
        /* Four 16-byte source rows. */
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;

        /* Corresponding four destination rows. */
        dst0 = __lsx_vld(dst, 0);
        DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2,
                  dst1, dst2);
        dst3 = __lsx_vldx(dst, dst_stride3);
        /* Average and store back in place. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        __lsx_vstx(dst2, dst, dst_stride2);
        __lsx_vstx(dst3, dst, dst_stride3);
        dst += dst_stride4;
    }
}
2194 
/* Rounding-average a 32-pixel-wide block into dst.  Each row is handled
 * as two 16-byte halves via separate pointers (src_tmp1/src_tmp2,
 * dst_tmp1/dst_tmp2); four rows per iteration. */
static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp1, *dst_tmp2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;

    dst_tmp1 = dst;
    dst_tmp2 = dst + 16;
    for (;cnt--;) {
        /* Left halves of four source rows (even-numbered registers). */
        src0 = __lsx_vld(src_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
                  src2, src4);
        src6 = __lsx_vldx(src_tmp1, src_stride3);
        src_tmp1 += src_stride4;

        /* Right halves of the same rows (odd-numbered registers). */
        src1 = __lsx_vld(src_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
                  src3, src5);
        src7 = __lsx_vldx(src_tmp2, src_stride3);
        src_tmp2 += src_stride4;

        /* Matching destination halves. */
        dst0 = __lsx_vld(dst_tmp1, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2,
                  dst2, dst4);
        dst6 = __lsx_vldx(dst_tmp1, dst_stride3);
        dst1 = __lsx_vld(dst_tmp2, 0);
        DUP2_ARG2(__lsx_vldx, dst_tmp2, dst_stride, dst_tmp2, dst_stride2,
                  dst3, dst5);
        dst7 = __lsx_vldx(dst_tmp2, dst_stride3);

        /* Average all eight vectors and store them back. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        __lsx_vst(dst0, dst_tmp1, 0);
        __lsx_vstx(dst2, dst_tmp1, dst_stride);
        __lsx_vstx(dst4, dst_tmp1, dst_stride2);
        __lsx_vstx(dst6, dst_tmp1, dst_stride3);
        dst_tmp1 += dst_stride4;
        __lsx_vst(dst1, dst_tmp2, 0);
        __lsx_vstx(dst3, dst_tmp2, dst_stride);
        __lsx_vstx(dst5, dst_tmp2, dst_stride2);
        __lsx_vstx(dst7, dst_tmp2, dst_stride3);
        dst_tmp2 += dst_stride4;
    }
}
2252 
/* Rounding-average a 64-pixel-wide block into dst.  Each row is four
 * 16-byte vectors; four rows (16 vectors) per iteration. */
static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    for (;cnt--;) {
        /* Four source rows, four 16-byte vectors each. */
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src0, src1, src2, src3);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src4, src5, src6, src7);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src8, src9, src10, src11);
        src += src_stride;
        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
                  src12, src13, src14, src15);
        src += src_stride;
        /* Matching destination rows. */
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst0, dst1, dst2, dst3);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst4, dst5, dst6, dst7);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst8, dst9, dst10, dst11);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
                  dst12, dst13, dst14, dst15);
        dst_tmp += dst_stride;
        /* Average all 16 vectors and write them back. */
        DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
                  src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
                  src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
        DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10,
                  dst10, src11, dst11, dst8, dst9, dst10, dst11);
        DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14,
                  dst14, src15, dst15, dst12, dst13, dst14, dst15);
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        __lsx_vst(dst2, dst, 32);
        __lsx_vst(dst3, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst4, dst, 0);
        __lsx_vst(dst5, dst, 16);
        __lsx_vst(dst6, dst, 32);
        __lsx_vst(dst7, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst8, dst, 0);
        __lsx_vst(dst9, dst, 16);
        __lsx_vst(dst10, dst, 32);
        __lsx_vst(dst11, dst, 48);
        dst += dst_stride;
        __lsx_vst(dst12, dst, 0);
        __lsx_vst(dst13, dst, 16);
        __lsx_vst(dst14, dst, 32);
        __lsx_vst(dst15, dst, 48);
        dst += dst_stride;
    }
}
2319 
/* VP9 8-tap sub-pixel interpolation filters: one set of 8 taps for each
 * of the 15 fractional positions (indexed by mx-1 / my-1), for the three
 * filter families.  Every row of taps sums to 128 (7-bit precision). */
static const int8_t vp9_subpel_filters_lsx[3][15][8] = {
    [FILTER_8TAP_REGULAR] = {
        {0, 1, -5, 126, 8, -3, 1, 0},
        {-1, 3, -10, 122, 18, -6, 2, 0},
        {-1, 4, -13, 118, 27, -9, 3, -1},
        {-1, 4, -16, 112, 37, -11, 4, -1},
        {-1, 5, -18, 105, 48, -14, 4, -1},
        {-1, 5, -19, 97, 58, -16, 5, -1},
        {-1, 6, -19, 88, 68, -18, 5, -1},
        {-1, 6, -19, 78, 78, -19, 6, -1},
        {-1, 5, -18, 68, 88, -19, 6, -1},
        {-1, 5, -16, 58, 97, -19, 5, -1},
        {-1, 4, -14, 48, 105, -18, 5, -1},
        {-1, 4, -11, 37, 112, -16, 4, -1},
        {-1, 3, -9, 27, 118, -13, 4, -1},
        {0, 2, -6, 18, 122, -10, 3, -1},
        {0, 1, -3, 8, 126, -5, 1, 0},
    }, [FILTER_8TAP_SHARP] = {
        {-1, 3, -7, 127, 8, -3, 1, 0},
        {-2, 5, -13, 125, 17, -6, 3, -1},
        {-3, 7, -17, 121, 27, -10, 5, -2},
        {-4, 9, -20, 115, 37, -13, 6, -2},
        {-4, 10, -23, 108, 48, -16, 8, -3},
        {-4, 10, -24, 100, 59, -19, 9, -3},
        {-4, 11, -24, 90, 70, -21, 10, -4},
        {-4, 11, -23, 80, 80, -23, 11, -4},
        {-4, 10, -21, 70, 90, -24, 11, -4},
        {-3, 9, -19, 59, 100, -24, 10, -4},
        {-3, 8, -16, 48, 108, -23, 10, -4},
        {-2, 6, -13, 37, 115, -20, 9, -4},
        {-2, 5, -10, 27, 121, -17, 7, -3},
        {-1, 3, -6, 17, 125, -13, 5, -2},
        {0, 1, -3, 8, 127, -7, 3, -1},
    }, [FILTER_8TAP_SMOOTH] = {
        {-3, -1, 32, 64, 38, 1, -3, 0},
        {-2, -2, 29, 63, 41, 2, -3, 0},
        {-2, -2, 26, 63, 43, 4, -4, 0},
        {-2, -3, 24, 62, 46, 5, -4, 0},
        {-2, -3, 21, 60, 49, 7, -4, 0},
        {-1, -4, 18, 59, 51, 9, -4, 0},
        {-1, -4, 16, 57, 53, 12, -4, -1},
        {-1, -4, 14, 55, 55, 14, -4, -1},
        {-1, -4, 12, 53, 57, 16, -4, -1},
        {0, -4, 9, 51, 59, 18, -4, -1},
        {0, -4, 7, 49, 60, 21, -3, -2},
        {0, -4, 5, 46, 62, 24, -3, -2},
        {0, -4, 4, 43, 63, 26, -2, -2},
        {0, -3, 2, 41, 63, 29, -2, -2},
        {0, -3, 1, 38, 64, 32, -1, -3},
    }
};
2371 
/* Instantiate the six public 8-tap MC entry points for one block SIZE:
 * put and avg flavours of horizontal (h), vertical (v) and 2-D (hv)
 * filtering, for the filter family `type'.  mx and my (1..15) select
 * the sub-pixel phase row in vp9_subpel_filters_lsx[type_idx]. */
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx) \
void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1]; \
 \
    common_hz_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h); \
} \
 \
void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1]; \
 \
    common_vt_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h); \
} \
 \
void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                         const uint8_t *src, \
                                         ptrdiff_t srcstride, \
                                         int h, int mx, int my) \
{ \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1]; \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1]; \
 \
    common_hv_8ht_8vt_##SIZE##w_lsx(src, srcstride, dst, dststride, hfilter, \
                                    vfilter, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1]; \
 \
    common_hz_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, \
                                            dststride, filter, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                        const uint8_t *src, \
                                        ptrdiff_t srcstride, \
                                        int h, int mx, int my) \
{ \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1]; \
 \
    common_vt_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, dststride, \
                                            filter, h); \
} \
 \
void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride, \
                                         const uint8_t *src, \
                                         ptrdiff_t srcstride, \
                                         int h, int mx, int my) \
{ \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1]; \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1]; \
 \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, \
                                                 dststride, hfilter, \
                                                 vfilter, h); \
}
2439 
/* Instantiate whole-pel copy and avg entry points for one block SIZE.
 * mx/my are part of the common MC prototype but unused here. */
#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE) \
void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride, \
                         const uint8_t *src, ptrdiff_t srcstride, \
                         int h, int mx, int my) \
{ \
 \
    copy_width##SIZE##_lsx(src, srcstride, dst, dststride, h); \
} \
void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride, \
                        const uint8_t *src, ptrdiff_t srcstride, \
                        int h, int mx, int my) \
{ \
 \
    avg_width##SIZE##_lsx(src, srcstride, dst, dststride, h); \
}
2455 
2461 
2467 
2473 
2478 
2479 #undef VP9_8TAP_LOONGARCH_LSX_FUNC
2480 #undef VP9_COPY_LOONGARCH_LSX_FUNC
common_vt_8t_8w_lsx
static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:510
common_hz_8t_16w_lsx
static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:314
common_vt_8t_and_aver_dst_4w_lsx
static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1533
common_hz_8t_and_aver_dst_32w_lsx
static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1423
common_vt_8t_and_aver_dst_64w_lsx
static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1828
out
FILE * out
Definition: movenc.c:54
filter1
static void filter1(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:360
src1
const pixel * src1
Definition: h264pred_template.c:421
common_hv_8ht_8vt_and_aver_dst_32w_lsx
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:2080
common_vt_8t_and_aver_dst_8w_lsx
static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1609
LSX_LD_4
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3)
Definition: vp9_mc_lsx.c:121
common_vt_8t_16w_mult_lsx
static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_lsx.c:673
common_hz_8t_64w_lsx
static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:400
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
VP9_COPY_LOONGARCH_LSX_FUNC
#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)
Definition: vp9_mc_lsx.c:2440
common_hv_8ht_8vt_and_aver_dst_4w_lsx
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:1838
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
common_hz_8t_and_aver_dst_4x8_lsx
static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_lsx.c:1225
common_hz_8t_and_aver_dst_16w_lsx
static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1365
VP9_8TAP_LOONGARCH_LSX_FUNC
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)
Definition: vp9_mc_lsx.c:2372
common_hz_8t_8x8mult_lsx
static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:260
common_hz_8t_8x4_lsx
static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_lsx.c:228
FILTER_8TAP_SHARP
@ FILTER_8TAP_SHARP
Definition: vp9.h:67
common_vt_8t_4w_lsx
static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:447
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
width
#define width
copy_width16_lsx
static void copy_width16_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:1066
avg_width8_lsx
static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:2120
common_vt_8t_16w_lsx
static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:582
HORIZ_8TAP_4WID_4VECS_FILT
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, _filter1, _filter2, _filter3, _out0, _out1)
Definition: vp9_mc_lsx.c:36
common_hv_8ht_8vt_16w_lsx
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:991
common_hz_8t_32w_lsx
static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:350
common_hv_8ht_8vt_and_aver_dst_8w_lsx
static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:1941
common_hv_8ht_8vt_8w_lsx
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:885
FILT_8TAP_DPADD_S_H
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, _filter2, _filter3)
Definition: vp9_mc_lsx.c:91
common_hz_8t_8w_lsx
static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:302
vp9dsp.h
FILTER_8TAP_REGULAR
@ FILTER_8TAP_REGULAR
Definition: vp9.h:66
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
common_vt_8t_and_aver_dst_16w_lsx
static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1808
common_hv_8ht_8vt_and_aver_dst_64w_lsx
static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:2100
vp9_subpel_filters_lsx
static const int8_t vp9_subpel_filters_lsx[3][15][8]
Definition: vp9_mc_lsx.c:2320
height
#define height
common_vt_8t_and_aver_dst_32w_lsx
static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1818
avg_width16_lsx
static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:2160
common_hz_8t_4w_lsx
static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:217
src2
const pixel * src2
Definition: h264pred_template.c:422
copy_width32_lsx
static void copy_width32_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:1093
vp9dsp_loongarch.h
FILTER_8TAP_SMOOTH
@ FILTER_8TAP_SMOOTH
Definition: vp9.h:65
stride
#define stride
Definition: h264pred_template.c:537
common_vt_8t_32w_lsx
static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:782
common_hv_8ht_8vt_and_aver_dst_16w_lsx
static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:2060
avg_width64_lsx
static void avg_width64_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:2253
copy_width8_lsx
static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:1039
avg_width32_lsx
static void avg_width32_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:2195
common_hz_8t_4x8_lsx
static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_lsx.c:164
HORIZ_8TAP_8WID_4VECS_FILT
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)
Definition: vp9_mc_lsx.c:61
copy_width64_lsx
static void copy_width64_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_lsx.c:1136
mc_filt_mask_arr
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp9_mc_lsx.c:26
common_hv_8ht_8vt_4w_lsx
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:797
common_hz_8t_and_aver_dst_64w_lsx
static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1480
common_hv_8ht_8vt_32w_lsx
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:1007
common_vt_8t_and_aver_dst_16w_mult_lsx
static void common_vt_8t_and_aver_dst_16w_mult_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_lsx.c:1693
common_vt_8t_64w_lsx
static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:789
src0
const pixel *const src0
Definition: h264pred_template.c:420
filter0
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:352
common_hz_8t_and_aver_dst_4w_lsx
static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1297
loongson_intrinsics.h
HORIZ_8TAP_FILT
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, _filt_h0, _filt_h1, _filt_h2, _filt_h3)
Definition: vp9_mc_lsx.c:105
common_hz_8t_4x4_lsx
static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_lsx.c:132
smooth
static float smooth(DeshakeOpenCLContext *deshake_ctx, float *gauss_kernel, int length, float max_val, AVFifo *values)
Definition: vf_deshake_opencl.c:888
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
common_hz_8t_and_aver_dst_4x4_lsx
static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_lsx.c:1180
common_hv_8ht_8vt_64w_lsx
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_lsx.c:1023
common_hz_8t_and_aver_dst_8w_lsx
static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_lsx.c:1310
DUP4_ARG3
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:83