hevc_mc_uni_lsx.c
1 /*
2  * Copyright (c) 2022 Loongson Technology Corporation Limited
3  * Contributed by Lu Wang <wanglu@loongson.cn>
4  * Hao Chen <chenhao@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "hevcdsp_lsx.h"
25 
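/* Shuffle-control vectors for __lsx_vshuf_b: each pair of indices selects two
 * adjacent source bytes so the vdp2_h_bu_b dot products can accumulate two
 * filter taps at a time.  The first 16 bytes serve the 8-pixel-wide paths;
 * the remaining rows (indices >= 16 pick from the second source operand)
 * serve the 4-pixel-wide paths that merge two input registers. */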
26 static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
27  /* 8 width cases */
28  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29  /* 4 width cases */
30  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31  /* 4 width cases */
32  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
33 };
34 
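/* Horizontal 8-tap (qpel) filter for 64-pixel-wide rows.  Each iteration of
 * the height loop loads one row as eight overlapping 16-byte vectors, gathers
 * the tap windows with mask0..mask3, accumulates the four tap pairs with byte
 * dot products, then rounds (shift by 6), saturates to u8 and stores 64 bytes. */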
35 static av_always_inline
36 void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
37  uint8_t *dst, int32_t dst_stride,
38  const int8_t *filter, int32_t height)
39 {
40  int32_t loop_cnt;
41  __m128i mask0, mask1, mask2, mask3, out1, out2;
42  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
43  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
44  __m128i filt0, filt1, filt2, filt3;
45  __m128i res0, res1, res2, res3;
46 
47  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
48  src -= 3;
49 
50  /* rearranging filter */
51  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
52  filt0, filt1, filt2, filt3);
53 
54  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
55  mask3 = __lsx_vaddi_bu(mask0, 6);
56 
57  for (loop_cnt = height; loop_cnt--;) {
58  DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
59  src0, src1, src2, src3);
60  DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
61  src4, src5, src6, src7);
62  src += src_stride;
63 
64  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
65  vec0, vec1);
66  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
67  vec2, vec3);
68  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
69  vec3, filt0, res0, res1, res2, res3);
70  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
71  vec0, vec1);
72  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
73  vec2, vec3);
74  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
75  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
76  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
77  vec4, vec5);
78  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
79  vec6, vec7);
80  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
81  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
82  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
83  vec4, vec5);
84  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
85  vec6, vec7);
86  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
87  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
88 
89  DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
90  out1, out2);
91  __lsx_vst(out1, dst, 0);
92  __lsx_vst(out2, dst, 16);
93 
94  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
95  vec0, vec1);
96  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
97  vec2, vec3);
98  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
99  vec3, filt0, res0, res1, res2, res3);
100  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
101  vec0, vec1);
102  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
103  vec2, vec3);
104  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
105  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
106  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
107  vec4, vec5);
108  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
109  vec6, vec7);
110  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
111  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
112  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
113  vec4, vec5);
114  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
115  vec6, vec7);
116  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
117  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
118 
119  DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
120  out1, out2);
121  __lsx_vst(out1, dst, 32);
122  __lsx_vst(out2, dst, 48);
123  dst += dst_stride;
124  }
125 }
126 
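/* Vertical 8-tap (qpel) filter for 8-pixel-wide columns.  Seven rows are
 * preloaded and interleaved into byte pairs; the loop then produces four
 * output rows per iteration, sliding the interleaved-row window forward. */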
127 static av_always_inline
128 void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
129  uint8_t *dst, int32_t dst_stride,
130  const int8_t *filter, int32_t height)
131 {
132  uint32_t loop_cnt;
133  int32_t src_stride_2x = (src_stride << 1);
134  int32_t dst_stride_2x = (dst_stride << 1);
135  int32_t src_stride_4x = (src_stride << 2);
136  int32_t dst_stride_4x = (dst_stride << 2);
137  int32_t src_stride_3x = src_stride_2x + src_stride;
138  int32_t dst_stride_3x = dst_stride_2x + dst_stride;
139 
140  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
141  __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
142  __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
143  __m128i tmp0, tmp1;
144  __m128i out0_r, out1_r, out2_r, out3_r;
145 
146  src -= src_stride_3x;
147  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
148  filt0, filt1, filt2, filt3);
149 
150  src0 = __lsx_vld(src, 0);
151  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
152  src3 = __lsx_vldx(src, src_stride_3x);
153  src += src_stride_4x;
154  src4 = __lsx_vld(src, 0);
155  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
156  src += src_stride_3x;
157  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
158  src10_r, src32_r, src54_r, src21_r);
159  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
160 
161  for (loop_cnt = (height >> 2); loop_cnt--;) {
162  src7 = __lsx_vld(src, 0);
163  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
164  src10 = __lsx_vldx(src, src_stride_3x);
165  src += src_stride_4x;
166 
167  DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
168  src9, src76_r, src87_r, src98_r, src109_r);
169  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
170  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
171  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
172  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
173  filt1, out0_r, out1_r, out2_r, out3_r);
174  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
175  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
176  filt2, out0_r, out1_r, out2_r, out3_r);
177  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
178  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
179  filt3, out0_r, out1_r, out2_r, out3_r);
180 
181  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
182  tmp0, tmp1);
183  __lsx_vstelm_d(tmp0, dst, 0, 0);
184  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
185  __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
186  __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
187  dst += dst_stride_4x;
188 
189  src10_r = src54_r;
190  src32_r = src76_r;
191  src54_r = src98_r;
192  src21_r = src65_r;
193  src43_r = src87_r;
194  src65_r = src109_r;
195  src6 = src10;
196  }
197 }
198 
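/* Vertical 8-tap (qpel) filter for widths that are a multiple of 16.  The
 * outer loop walks 16-column slices; within a slice, low and high byte
 * interleaves of seven preloaded rows feed the dot products, producing four
 * output rows per inner iteration. */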
199 static av_always_inline
200 void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
201  int32_t dst_stride, const int8_t *filter,
202  int32_t height, int32_t width)
203 {
204  const uint8_t *src_tmp;
205  uint8_t *dst_tmp;
206  uint32_t loop_cnt, cnt;
207  const int32_t src_stride_2x = (src_stride << 1);
208  const int32_t dst_stride_2x = (dst_stride << 1);
209  const int32_t src_stride_4x = (src_stride << 2);
210  const int32_t dst_stride_4x = (dst_stride << 2);
211  const int32_t src_stride_3x = src_stride_2x + src_stride;
212  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
213 
214  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
215  __m128i filt0, filt1, filt2, filt3;
216  __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
217  __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
218  __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
219  __m128i tmp0, tmp1, tmp2, tmp3;
220  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
221 
222  src -= src_stride_3x;
223  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
224  filt1, filt2, filt3);
225 
226  for (cnt = (width >> 4); cnt--;) {
227  src_tmp = src;
228  dst_tmp = dst;
229 
230  src0 = __lsx_vld(src_tmp, 0);
231  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
232  src1, src2);
233  src3 = __lsx_vldx(src_tmp, src_stride_3x);
234  src_tmp += src_stride_4x;
235  src4 = __lsx_vld(src_tmp, 0);
236  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
237  src5, src6);
238  src_tmp += src_stride_3x;
239  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
240  src10_r, src32_r, src54_r, src21_r);
241  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
242  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
243  src10_l, src32_l, src54_l, src21_l);
244  DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
245 
246  for (loop_cnt = (height >> 2); loop_cnt--;) {
247  src7 = __lsx_vld(src_tmp, 0);
248  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
249  src8, src9);
250  src10 = __lsx_vldx(src_tmp, src_stride_3x);
251  src_tmp += src_stride_4x;
252  DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
253  src9, src76_r, src87_r, src98_r, src109_r);
254  DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
255  src9, src76_l, src87_l, src98_l, src109_l);
256  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
257  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
258  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
259  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
260  filt1, out0_r, out1_r, out2_r, out3_r);
261  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
262  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
263  filt2, out0_r, out1_r, out2_r, out3_r);
264  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
265  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
266  filt3, out0_r, out1_r, out2_r, out3_r);
267  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
268  filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
269  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
270  src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
271  filt1, out0_l, out1_l, out2_l, out3_l);
272  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
273  src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
274  filt2, out0_l, out1_l, out2_l, out3_l);
275  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
276  src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
277  filt3, out0_l, out1_l, out2_l, out3_l);
278  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
279  6, out2_l, out2_r, 6, out3_l, out3_r, 6,
280  tmp0, tmp1, tmp2, tmp3);
281  __lsx_vst(tmp0, dst_tmp, 0);
282  __lsx_vstx(tmp1, dst_tmp, dst_stride);
283  __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
284  __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
285  dst_tmp += dst_stride_4x;
286 
287  src10_r = src54_r;
288  src32_r = src76_r;
289  src54_r = src98_r;
290  src21_r = src65_r;
291  src43_r = src87_r;
292  src65_r = src109_r;
293  src10_l = src54_l;
294  src32_l = src76_l;
295  src54_l = src98_l;
296  src21_l = src65_l;
297  src43_l = src87_l;
298  src65_l = src109_l;
299  src6 = src10;
300  }
301 
302  src += 16;
303  dst += 16;
304  }
305 }
306 
307 static void common_vt_8t_24w_lsx(const uint8_t *src, int32_t src_stride,
308  uint8_t *dst, int32_t dst_stride,
309  const int8_t *filter, int32_t height)
310 {
311  common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
312  common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
313  height);
314 }
315 
316 static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
317  uint8_t *dst, int32_t dst_stride,
318  const int8_t *filter, int32_t height)
319 {
320  common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
321 }
322 
323 static void common_vt_8t_48w_lsx(const uint8_t *src, int32_t src_stride,
324  uint8_t *dst, int32_t dst_stride,
325  const int8_t *filter, int32_t height)
326 {
327  common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
328 }
329 
330 static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
331  uint8_t *dst, int32_t dst_stride,
332  const int8_t *filter, int32_t height)
333 {
334  common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
335 }
336 
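/* Combined horizontal + vertical 8-tap (qpel hv) filter.  The block is
 * processed in 8-column slices: seven rows are filtered horizontally into
 * 16-bit intermediates, then the inner loop filters one more row at a time
 * and applies the vertical taps on 32-bit accumulators, emitting two output
 * rows per iteration (round, clip to [0,255], pack, store). */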
337 static av_always_inline
338 void hevc_hv_8t_8x2_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
339  int32_t dst_stride, const int8_t *filter_x,
340  const int8_t *filter_y, int32_t height, int32_t width)
341 {
342  uint32_t loop_cnt, cnt;
343  const uint8_t *src_tmp;
344  uint8_t *dst_tmp;
345  const int32_t src_stride_2x = (src_stride << 1);
346  const int32_t dst_stride_2x = (dst_stride << 1);
347  const int32_t src_stride_4x = (src_stride << 2);
348  const int32_t src_stride_3x = src_stride_2x + src_stride;
349 
350  __m128i out;
351  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
352  __m128i filt0, filt1, filt2, filt3;
353  __m128i filt_h0, filt_h1, filt_h2, filt_h3;
354  __m128i mask1, mask2, mask3;
355  __m128i filter_vec;
356  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
357  __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
358  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
359  __m128i dst0_r, dst0_l, dst1_r, dst1_l;
360  __m128i dst10_r, dst32_r, dst54_r, dst76_r;
361  __m128i dst10_l, dst32_l, dst54_l, dst76_l;
362  __m128i dst21_r, dst43_r, dst65_r, dst87_r;
363  __m128i dst21_l, dst43_l, dst65_l, dst87_l;
364  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
365 
366  src -= (src_stride_3x + 3);
367  DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
368  filter_x, 6, filt0, filt1, filt2, filt3);
369 
370  filter_vec = __lsx_vld(filter_y, 0);
371  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
372  DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
373  filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
374 
375  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
376  mask3 = __lsx_vaddi_bu(mask0, 6);
377 
378  for (cnt = width >> 3; cnt--;) {
379  src_tmp = src;
380  dst_tmp = dst;
381 
382  src0 = __lsx_vld(src_tmp, 0);
383  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
384  src1, src2);
385  src3 = __lsx_vldx(src_tmp, src_stride_3x);
386  src_tmp += src_stride_4x;
387  src4 = __lsx_vld(src_tmp, 0);
388  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
389  src5, src6);
390  src_tmp += src_stride_3x;
391 
392  /* row 0 row 1 row 2 row 3 */
393  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
394  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
395  DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
396  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
397  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
398  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
399  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
400  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
401  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
402  vec12, filt0, dst0, dst1, dst2, dst3);
403  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
404  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
405  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
406  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
407  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
408  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
409 
410  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
411  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
412  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
413  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
414  DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
415  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
416  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
417  dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
418  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
419  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
420  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
421  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
422  dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
423  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
424  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
425  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
426  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
427  DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
428  DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
429 
430  for (loop_cnt = height >> 1; loop_cnt--;) {
431  src7 = __lsx_vld(src_tmp, 0);
432  src8 = __lsx_vldx(src_tmp, src_stride);
433  src_tmp += src_stride_2x;
434 
435  DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
436  src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
437  dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
438  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
439  filt2, dst7, dst7);
440  dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
441  dst76_r = __lsx_vilvl_h(dst7, dst6);
442  dst76_l = __lsx_vilvh_h(dst7, dst6);
443  DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
444  dst0_r, dst0_l);
445  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
446  dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
447  dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
448  DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
449  dst76_l, filt_h3, dst0_r, dst0_l);
450  DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
451 
452  DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
453  src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
454  dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
455  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
456  filt2, dst8, dst8);
457  dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
458 
459  dst87_r = __lsx_vilvl_h(dst8, dst7);
460  dst87_l = __lsx_vilvh_h(dst8, dst7);
461  DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
462  dst1_r, dst1_l);
463  DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
464  dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
465  dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
466  DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
467  dst87_l, filt_h3, dst1_r, dst1_l);
468  DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
469  DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
470  6, dst0_r, dst0_l, dst1_r, dst1_l);
471  DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
472  dst0_l, dst0_r, dst1_l, dst1_r);
473  DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
474  dst0, dst1);
475  out = __lsx_vpickev_b(dst1, dst0);
476  __lsx_vstelm_d(out, dst_tmp, 0, 0);
477  __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
478  dst_tmp += dst_stride_2x;
479 
480  dst10_r = dst32_r;
481  dst32_r = dst54_r;
482  dst54_r = dst76_r;
483  dst10_l = dst32_l;
484  dst32_l = dst54_l;
485  dst54_l = dst76_l;
486  dst21_r = dst43_r;
487  dst43_r = dst65_r;
488  dst65_r = dst87_r;
489  dst21_l = dst43_l;
490  dst43_l = dst65_l;
491  dst65_l = dst87_l;
492  dst6 = dst8;
493  }
494  src += 8;
495  dst += 8;
496  }
497 }
498 
499 static void hevc_hv_8t_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
500  int32_t dst_stride, const int8_t *filter_x,
501  const int8_t *filter_y, int32_t height)
502 {
503  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
504  filter_x, filter_y, height, 8);
505 }
506 
507 static void hevc_hv_8t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
508  int32_t dst_stride, const int8_t *filter_x,
509  const int8_t *filter_y, int32_t height)
510 {
511  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
512  filter_x, filter_y, height, 16);
513 }
514 
515 static void hevc_hv_8t_24w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
516  int32_t dst_stride, const int8_t *filter_x,
517  const int8_t *filter_y, int32_t height)
518 {
519  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
520  filter_x, filter_y, height, 24);
521 }
522 
523 static void hevc_hv_8t_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
524  int32_t dst_stride, const int8_t *filter_x,
525  const int8_t *filter_y, int32_t height)
526 {
527  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
528  filter_x, filter_y, height, 32);
529 }
530 
531 static void hevc_hv_8t_48w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
532  int32_t dst_stride, const int8_t *filter_x,
533  const int8_t *filter_y, int32_t height)
534 {
535  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
536  filter_x, filter_y, height, 48);
537 }
538 
539 static void hevc_hv_8t_64w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
540  int32_t dst_stride, const int8_t *filter_x,
541  const int8_t *filter_y, int32_t height)
542 {
543  hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
544  filter_x, filter_y, height, 64);
545 }
546 
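/* Vertical 4-tap (epel) filter for 24-pixel-wide blocks, handled as a
 * 16-column part plus an 8-column part.  Note the height argument is not
 * consulted: the loop always runs 8 iterations of 4 rows, i.e. 32 rows. */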
547 static av_always_inline
548 void common_vt_4t_24w_lsx(const uint8_t *src, int32_t src_stride,
549  uint8_t *dst, int32_t dst_stride,
550  const int8_t *filter, int32_t height)
551 {
552  uint32_t loop_cnt;
553  int32_t src_stride_2x = (src_stride << 1);
554  int32_t src_stride_3x = src_stride_2x + src_stride;
555  const uint8_t *_src;
556 
557  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
558  __m128i src11, filt0, filt1;
559  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
560  __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
561  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
562  __m128i out1, out2, out3, out4;
563 
564  src -= src_stride;
565  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
566  _src = src + 16;
567 
568  /* 16 width */
569  src0 = __lsx_vld(src, 0);
570  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
571  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
572  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
573 
574  /* 8 width */
575  src6 = __lsx_vld(_src, 0);
576  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
577  src += src_stride_3x;
578  _src += src_stride_3x;
579  DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
580 
581  for (loop_cnt = 8; loop_cnt--;) {
582  /* 16 width */
583  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
584  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
585  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
586  DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
587 
588  /* 8 width */
589  src += src_stride_2x;
590  _src += src_stride_2x;
591  DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
592 
593  /* 16 width */
594  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
595  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
596  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
597  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
598  out0_r, out0_l, out1_r, out1_l);
599 
600  /* 8 width */
601  DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
602  out2_r, out3_r);
603  DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
604  src109_r, filt1, out2_r, out3_r);
605 
606  /* 16 + 8 width */
607  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
608  out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
609  __lsx_vst(out1, dst, 0);
610  __lsx_vstelm_d(out2, dst, 16, 0);
611  dst += dst_stride;
612  __lsx_vst(out4, dst, 0);
613  __lsx_vstelm_d(out3, dst, 16, 0);
614  dst += dst_stride;
615 
616  /* 16 width */
617  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
618  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
619  DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
620  DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
621 
622  /* 8 width */
623  src += src_stride_2x;
624  _src += src_stride_2x;
625  DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
626 
627  /* 16 width */
628  DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
629  filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
630  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
631  filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
632  out0_r, out0_l, out1_r, out1_l);
633 
634  /* 8 width */
635  DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
636  out2_r, out3_r);
637  DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
638  src87_r, filt1, out2_r, out3_r);
639 
640  /* 16 + 8 width */
641  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
642  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
643 
644  __lsx_vst(out1, dst, 0);
645  __lsx_vstelm_d(out2, dst, 16, 0);
646  dst += dst_stride;
647  __lsx_vst(out3, dst, 0);
648  __lsx_vstelm_d(out4, dst, 16, 0);
649  dst += dst_stride;
650  }
651 }
652 
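/* Vertical 4-tap (epel) filter for 32-pixel-wide blocks, processed as two
 * independent 16-column halves, two output rows per iteration. */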
653 static av_always_inline
654 void common_vt_4t_32w_lsx(const uint8_t *src, int32_t src_stride,
655  uint8_t *dst, int32_t dst_stride,
656  const int8_t *filter, int32_t height)
657 {
658  uint32_t loop_cnt;
659  int32_t src_stride_2x = (src_stride << 1);
660  int32_t dst_stride_2x = (dst_stride << 1);
661  int32_t src_stride_3x = src_stride_2x + src_stride;
662  const uint8_t *_src;
663 
664  __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
665  __m128i src10_r, src32_r, src76_r, src98_r;
666  __m128i src21_r, src43_r, src87_r, src109_r;
667  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
668  __m128i src10_l, src32_l, src76_l, src98_l;
669  __m128i src21_l, src43_l, src87_l, src109_l;
670  __m128i filt0, filt1;
671  __m128i out1, out2;
672 
673  src -= src_stride;
674  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
675  _src = src + 16;
676 
677  /* 16 width */
678  src0 = __lsx_vld(src, 0);
679  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
680 
681  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
682  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
683 
684  /* next 16 width */
685  src6 = __lsx_vld(_src, 0);
686  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
687  src += src_stride_3x;
688  _src += src_stride_3x;
689 
690  DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
691  DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
692 
693  for (loop_cnt = (height >> 1); loop_cnt--;) {
694  /* 16 width */
695  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
696  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
697  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
698  DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
699 
700  /* 16 width */
701  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
702  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
703  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
704  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
705  out0_r, out0_l, out1_r, out1_l);
706 
707  DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
708  out1, out2);
709  __lsx_vst(out1, dst, 0);
710  __lsx_vstx(out2, dst, dst_stride);
711 
712  src10_r = src32_r;
713  src21_r = src43_r;
714  src10_l = src32_l;
715  src21_l = src43_l;
716  src2 = src4;
717 
718  /* next 16 width */
719  src += src_stride_2x;
720  _src += src_stride_2x;
721  DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
722  DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
723 
724  /* next 16 width */
725  DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
726  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
727  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
728  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
729  out2_r, out2_l, out3_r, out3_l);
730 
731  /* next 16 width */
732  DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
733  out1, out2);
734  __lsx_vst(out1, dst, 16);
735  __lsx_vst(out2, dst + dst_stride, 16);
736 
737  dst += dst_stride_2x;
738 
739  src76_r = src98_r;
740  src87_r = src109_r;
741  src76_l = src98_l;
742  src87_l = src109_l;
743  src8 = src10;
744  }
745 }
746 
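/* Combined horizontal + vertical 4-tap (epel hv) filter for a single 8x2
 * block: five input rows are filtered horizontally, the vertical taps are
 * applied on 32-bit accumulators, and both output rows are stored at once. */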
747 static av_always_inline
748 void hevc_hv_4t_8x2_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
749  int32_t dst_stride, const int8_t *filter_x,
750  const int8_t *filter_y)
751 {
752  const int32_t src_stride_2x = (src_stride << 1);
753  const int32_t src_stride_4x = (src_stride << 2);
754  const int32_t src_stride_3x = src_stride_2x + src_stride;
755  __m128i out;
756  __m128i src0, src1, src2, src3, src4;
757  __m128i filt0, filt1;
758  __m128i filt_h0, filt_h1, filter_vec;
759  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
760  __m128i mask1;
761  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
762  __m128i dst0, dst1, dst2, dst3, dst4;
763  __m128i dst0_r, dst0_l, dst1_r, dst1_l;
764  __m128i dst10_r, dst32_r, dst21_r, dst43_r;
765  __m128i dst10_l, dst32_l, dst21_l, dst43_l;
766  __m128i out0_r, out1_r;
767 
768  src -= (src_stride + 1);
769  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
770 
771  filter_vec = __lsx_vld(filter_y, 0);
772  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
773  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
774 
775  mask1 = __lsx_vaddi_bu(mask0, 2);
776  src0 = __lsx_vld(src, 0);
777  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
778  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
779 
780  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
781  mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
782  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
783  mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
784  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
785 
786  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
787  filt0, dst0, dst1, dst2, dst3);
788  dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
789  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
790  vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
791  dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
792  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
793  dst10_r, dst21_r, dst32_r, dst43_r);
794  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
795  dst10_l, dst21_l, dst32_l, dst43_l);
796  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
797  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
798  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
799  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
800  dst0_r, dst0_l, dst1_r, dst1_l);
801  DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
802  out0_r, out1_r);
803  out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
804  __lsx_vstelm_d(out, dst, 0, 0);
805  __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
806 }
807 
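/* Combined 4-tap hv filter for blocks of width 8 * width8mult and height 4:
 * each 8-column slice is handled in a single pass that produces all four
 * output rows. */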
808 static av_always_inline
809 void hevc_hv_4t_8multx4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
810  int32_t dst_stride, const int8_t *filter_x,
811  const int8_t *filter_y, int32_t width8mult)
812 {
813  uint32_t cnt;
814  const int32_t src_stride_2x = (src_stride << 1);
815  const int32_t dst_stride_2x = (dst_stride << 1);
816  const int32_t src_stride_4x = (src_stride << 2);
817  const int32_t src_stride_3x = src_stride_2x + src_stride;
818  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
819 
820  __m128i out0, out1;
821  __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
822  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
823  __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
824  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
825  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
826  __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
827  __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
828 
829  src -= (src_stride + 1);
830  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
831 
832  filter_vec = __lsx_vld(filter_y, 0);
833  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
834  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
835 
836  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
837  mask1 = __lsx_vaddi_bu(mask0, 2);
838 
839  for (cnt = width8mult; cnt--;) {
840  src0 = __lsx_vld(src, 0);
841  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
842  src3 = __lsx_vldx(src, src_stride_3x);
843  src += src_stride_4x;
844  src4 = __lsx_vld(src, 0);
845  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
846  src += (8 - src_stride_4x);
847  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
848  vec0, vec1);
849  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
850  vec2, vec3);
851  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
852  vec4, vec5);
853 
854  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
855  dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
856  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
857  dst0, dst1);
858  dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
859 
860  DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
861  DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
862 
863  DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
864  vec0, vec1);
865  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
866  vec2, vec3);
867  DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
868  vec4, vec5);
869  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
870  vec6, vec7);
871 
872  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
873  vec6, filt0, dst3, dst4, dst5, dst6);
874  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
875  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
876 
877  DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
878  dst5, dst32_r, dst43_r, dst54_r, dst65_r);
879  DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
880  dst5, dst32_l, dst43_l, dst54_l, dst65_l);
881 
882  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
883  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
884  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
885  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
886  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
887  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
888  dst0_r, dst0_l, dst1_r, dst1_l);
889  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
890  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
891  dst2_r, dst2_l, dst3_r, dst3_l);
892 
893  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
894  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
895  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
896  __lsx_vstelm_d(out0, dst, 0, 0);
897  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
898  __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
899  __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
900  dst += 8;
901  }
902 }
903 
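/* Combined 4-tap hv filter for a single 8x6 block, fully unrolled: nine input
 * rows are filtered horizontally, then the vertical pass produces all six
 * output rows. */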
904 static av_always_inline
905 void hevc_hv_4t_8x6_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
906  int32_t dst_stride, const int8_t *filter_x,
907  const int8_t *filter_y)
908 {
909  const int32_t src_stride_2x = (src_stride << 1);
910  const int32_t dst_stride_2x = (dst_stride << 1);
911  const int32_t src_stride_4x = (src_stride << 2);
912  const int32_t dst_stride_4x = (dst_stride << 2);
913  const int32_t src_stride_3x = src_stride_2x + src_stride;
914  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
915  __m128i out0, out1, out2;
916  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
917  __m128i filt0, filt1;
918  __m128i filt_h0, filt_h1, filter_vec;
919  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
920  __m128i mask1;
921  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
922  __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
923  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
924  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
925  __m128i dst4_r, dst4_l, dst5_r, dst5_l;
926  __m128i dst10_r, dst32_r, dst10_l, dst32_l;
927  __m128i dst21_r, dst43_r, dst21_l, dst43_l;
928  __m128i dst54_r, dst54_l, dst65_r, dst65_l;
929  __m128i dst76_r, dst76_l, dst87_r, dst87_l;
930  __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
931 
932  src -= (src_stride + 1);
933  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
934 
935  filter_vec = __lsx_vld(filter_y, 0);
936  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
937  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
938 
939  mask1 = __lsx_vaddi_bu(mask0, 2);
940 
941  src0 = __lsx_vld(src, 0);
942  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
943  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
944  src += src_stride_4x;
945  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
946  src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
947 
948  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
949  mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
950  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
951  mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
952  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
953  mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
954  DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
955  mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
956  DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
957 
958  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
959  filt0, dst0, dst1, dst2, dst3);
960  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
961  filt0, dst4, dst5, dst6, dst7);
962  dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
963  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
964  vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
965  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
966  vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
967  dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
968 
969  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
970  dst10_r, dst21_r, dst32_r, dst43_r);
971  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
972  dst10_l, dst21_l, dst32_l, dst43_l);
973  DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
974  dst54_r, dst65_r, dst76_r, dst87_r);
975  DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
976  dst54_l, dst65_l, dst76_l, dst87_l);
977 
978  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
979  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
980  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
981  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
982  DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
983  filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
984  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
985  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
986  dst0_r, dst0_l, dst1_r, dst1_l);
987  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
988  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
989  dst2_r, dst2_l, dst3_r, dst3_l);
990  DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
991  filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
992  dst4_r, dst4_l, dst5_r, dst5_l);
993 
994  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
995  dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
996  DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
997  out4_r, out5_r);
998  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
999  out0, out1);
1000  out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
1001 
1002  __lsx_vstelm_d(out0, dst, 0, 0);
1003  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1004  __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1005  __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1006  dst += dst_stride_4x;
1007  __lsx_vstelm_d(out2, dst, 0, 0);
1008  __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1009 }
1010 
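/* Generic combined 4-tap hv filter for width 8 * width8mult and any height
 * that is a multiple of 4: the outer loop walks 8-column slices, the inner
 * loop emits four output rows per iteration while sliding the three-row
 * horizontal-filter window. */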
1011 static av_always_inline
1012 void hevc_hv_4t_8multx4mult_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1013  int32_t dst_stride, const int8_t *filter_x,
1014  const int8_t *filter_y, int32_t height,
1015  int32_t width8mult)
1016 {
1017  uint32_t loop_cnt, cnt;
1018  const uint8_t *src_tmp;
1019  uint8_t *dst_tmp;
1020  const int32_t src_stride_2x = (src_stride << 1);
1021  const int32_t dst_stride_2x = (dst_stride << 1);
1022  const int32_t src_stride_4x = (src_stride << 2);
1023  const int32_t dst_stride_4x = (dst_stride << 2);
1024  const int32_t src_stride_3x = src_stride_2x + src_stride;
1025  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1026 
1027  __m128i out0, out1;
1028  __m128i src0, src1, src2, src3, src4, src5, src6;
1029  __m128i filt0, filt1;
1030  __m128i filt_h0, filt_h1, filter_vec;
1031  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1032  __m128i mask1;
1033  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1034  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1035  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1036  __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1037  __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1038  __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
1039  __m128i out0_r, out1_r, out2_r, out3_r;
1040 
1041  src -= (src_stride + 1);
1042  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1043 
1044  filter_vec = __lsx_vld(filter_y, 0);
1045  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1046  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1047  mask1 = __lsx_vaddi_bu(mask0, 2);
1048 
1049  for (cnt = width8mult; cnt--;) {
1050  src_tmp = src;
1051  dst_tmp = dst;
1052 
1053  src0 = __lsx_vld(src_tmp, 0);
1054  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1055  src1, src2);
1056  src_tmp += src_stride_3x;
1057 
1058  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
1059  vec0, vec1);
1060  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
1061  vec2, vec3);
1062  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
1063  vec4, vec5);
1064 
1065  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1066  dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1067  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1068  dst0, dst1);
1069  dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1070 
1071  DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1072  DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1073 
1074  for (loop_cnt = (height >> 2); loop_cnt--;) {
1075  src3 = __lsx_vld(src_tmp, 0);
1076  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1077  src4, src5);
1078  src6 = __lsx_vldx(src_tmp, src_stride_3x);
1079  src_tmp += src_stride_4x;
1080 
1081  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1082  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1083  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1084  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1085 
1086  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1087  vec6, filt0, dst3, dst4, dst5, dst6);
1088  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
1089  filt1, dst5, vec5, filt1, dst6, vec7, filt1,
1090  dst3, dst4, dst5, dst6);
1091 
1092  DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
1093  dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
1094  DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
1095  dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
1096 
1097  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1098  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1099  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1100  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1101  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1102  dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
1103  dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
1104  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
1105  dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
1106  dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
1107 
1108  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
1109  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
1110  out2_r, out3_r);
1111  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
1112  6, out0, out1);
1113  __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1114  __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1115  __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1116  __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1117  dst_tmp += dst_stride_4x;
1118 
1119  dst10_r = dst54_r;
1120  dst10_l = dst54_l;
1121  dst21_r = dst65_r;
1122  dst21_l = dst65_l;
1123  dst2 = dst6;
1124  }
1125  src += 8;
1126  dst += 8;
1127  }
1128 }
1129 
1130 static
1131 void hevc_hv_4t_8w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1132  int32_t dst_stride, const int8_t *filter_x,
1133  const int8_t *filter_y, int32_t height)
1134 {
1135  if (2 == height) {
1136  hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
1137  } else if (4 == height) {
1138  hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
1139  filter_x, filter_y, 1);
1140  } else if (6 == height) {
1141  hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
1142  } else if (0 == (height & 0x03)) {
1143  hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
1144  filter_x, filter_y, height, 1);
1145  }
1146 }
1147 
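/* Combined 4-tap hv filter for 12-pixel-wide blocks: the left 8 columns use
 * the regular 8-wide path, while the remaining 4 columns use a narrower path
 * based on mask2/mask3 that packs two rows per register.  Both loops are
 * sized for 16 output rows; the height argument is not consulted. */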
1148 static av_always_inline
1149 void hevc_hv_4t_12w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1150  int32_t dst_stride, const int8_t *filter_x,
1151  const int8_t *filter_y, int32_t height)
1152 {
1153  uint32_t loop_cnt;
1154  const uint8_t *src_tmp;
1155  uint8_t *dst_tmp;
1156  const int32_t src_stride_2x = (src_stride << 1);
1157  const int32_t dst_stride_2x = (dst_stride << 1);
1158  const int32_t src_stride_4x = (src_stride << 2);
1159  const int32_t dst_stride_4x = (dst_stride << 2);
1160  const int32_t src_stride_3x = src_stride_2x + src_stride;
1161  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1162  __m128i out0, out1;
1163  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1164  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1165  __m128i mask0, mask1, mask2, mask3;
1166  __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
1167  __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
1168  __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
1169  __m128i dst76_r, dst98_r, dst87_r, dst109_r;
1170  __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1171  __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1172  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1173  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1174 
1175  src -= (src_stride + 1);
1176  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1177 
1178  filter_vec = __lsx_vld(filter_y, 0);
1179  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1180  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1181 
1182  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1183  mask1 = __lsx_vaddi_bu(mask0, 2);
1184 
1185  src_tmp = src;
1186  dst_tmp = dst;
1187 
1188  src0 = __lsx_vld(src_tmp, 0);
1189  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1190  src1, src2);
1191  src_tmp += src_stride_3x;
1192 
1193  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1194  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1195  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1196 
1197  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1198  dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1199  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1200  dsth0, dsth1);
1201  dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
1202 
1203  DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
1204  DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
1205 
1206  for (loop_cnt = 4; loop_cnt--;) {
1207  src3 = __lsx_vld(src_tmp, 0);
1208  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1209  src4, src5);
1210  src6 = __lsx_vldx(src_tmp, src_stride_3x);
1211  src_tmp += src_stride_4x;
1212 
1213  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1214  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1215  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1216  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1217 
1218  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1219  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
1220  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
1221  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
1222  dsth3, dsth4, dsth5, dsth6);
1223 
1224  DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1225  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
1226  DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1227  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
1228 
1229  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1230  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1231  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1232  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1233  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1234  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1235  dst0_r, dst0_l, dst1_r, dst1_l);
1236  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1237  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1238  dst2_r, dst2_l, dst3_r, dst3_l);
1239 
1240  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
1241  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
1242  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1243 
1244  __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1245  __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1246  __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1247  __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1248  dst_tmp += dst_stride_4x;
1249 
1250  dst10_r = dst54_r;
1251  dst10_l = dst54_l;
1252  dst21_r = dst65_r;
1253  dst21_l = dst65_l;
1254  dsth2 = dsth6;
1255  }
1256 
1257  src += 8;
1258  dst += 8;
1259 
1260  mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
1261  mask3 = __lsx_vaddi_bu(mask2, 2);
1262 
1263  src0 = __lsx_vld(src, 0);
1264  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1265  src += src_stride_3x;
1266  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
1267  DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
1268 
1269  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
1270  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
1271  dst10, dst21);
1272 
1273  dst10_r = __lsx_vilvl_h(dst21, dst10);
1274  dst21_r = __lsx_vilvh_h(dst21, dst10);
1275  dst22 = __lsx_vreplvei_d(dst21, 1);
1276 
1277  for (loop_cnt = 2; loop_cnt--;) {
1278  src3 = __lsx_vld(src, 0);
1279  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
1280  src6 = __lsx_vldx(src, src_stride_3x);
1281  src += src_stride_4x;
1282  src7 = __lsx_vld(src, 0);
1283  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1284  src10 = __lsx_vldx(src, src_stride_3x);
1285  src += src_stride_4x;
1286  DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
1287  src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
1288  DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
1289  src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
1290 
1291  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1292  vec6, filt0, dst73, dst84, dst95, dst106);
1293  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
1294  filt1, dst95, vec5, filt1, dst106, vec7, filt1,
1295  dst73, dst84, dst95, dst106);
1296 
1297  dst32_r = __lsx_vilvl_h(dst73, dst22);
1298  DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
1299  DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
1300  dst65_r = __lsx_vilvl_h(dst106, dst95);
1301  dst109_r = __lsx_vilvh_h(dst106, dst95);
1302  dst22 = __lsx_vreplvei_d(dst73, 1);
1303  dst76_r = __lsx_vilvl_h(dst22, dst106);
1304 
1305  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1306  filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
1307  DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
1308  filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
1309  DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
1310  filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
1311  dst0, dst1, dst2, dst3);
1312  DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
1313  filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
1314  dst4, dst5, dst6, dst7);
1315 
1316  DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
1317  6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
1318  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1319 
1320  __lsx_vstelm_w(out0, dst, 0, 0);
1321  __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1322  __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1323  __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1324  dst += dst_stride_4x;
1325  __lsx_vstelm_w(out1, dst, 0, 0);
1326  __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1327  __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1328  __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1329  dst += dst_stride_4x;
1330 
1331  dst10_r = dst98_r;
1332  dst21_r = dst109_r;
1333  dst22 = __lsx_vreplvei_d(dst106, 1);
1334  }
1335 }
1336 
1337 static void hevc_hv_4t_16w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1338  int32_t dst_stride, const int8_t *filter_x,
1339  const int8_t *filter_y, int32_t height)
1340 {
1341  if (4 == height) {
1342  hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x,
1343  filter_y, 2);
1344  } else {
1345  hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
1346  filter_x, filter_y, height, 2);
1347  }
1348 }
1349 
1350 static void hevc_hv_4t_24w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1351  int32_t dst_stride, const int8_t *filter_x,
1352  const int8_t *filter_y, int32_t height)
1353 {
1354  hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
1355  filter_x, filter_y, height, 3);
1356 }
1357 
1358 static void hevc_hv_4t_32w_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
1359  int32_t dst_stride, const int8_t *filter_x,
1360  const int8_t *filter_y, int32_t height)
1361 {
1362  hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
1363  filter_x, filter_y, height, 4);
1364 }
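/* Descriptive note: the wrappers above reuse the shared 8-column hv helpers;
 * the last argument appears to be the number of 8-pixel column groups
 * (16w -> 2, 24w -> 3, 32w -> 4), with the dedicated 8multx4 path taken for
 * the height == 4 case of the 16-wide block. */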
1365 
1366 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
1367 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
1368  ptrdiff_t dst_stride, \
1369  const uint8_t *src, \
1370  ptrdiff_t src_stride, \
1371  int height, \
1372  intptr_t mx, \
1373  intptr_t my, \
1374  int width) \
1375 { \
1376  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
1377  \
1378  common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1379  filter, height); \
1380 }
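/* Illustrative expansion (hand-written sketch, not generated output): the
 * first invocation below, UNI_MC(qpel, h, 64, 8, hz, mx), produces
 *
 * void ff_hevc_put_hevc_uni_qpel_h64_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
 *                                          const uint8_t *src,
 *                                          ptrdiff_t src_stride, int height,
 *                                          intptr_t mx, intptr_t my, int width)
 * {
 *     const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *
 *     common_hz_8t_64w_lsx(src, src_stride, dst, dst_stride, filter, height);
 * }
 */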
1381 
1382 UNI_MC(qpel, h, 64, 8, hz, mx);
1383 
1384 UNI_MC(qpel, v, 24, 8, vt, my);
1385 UNI_MC(qpel, v, 32, 8, vt, my);
1386 UNI_MC(qpel, v, 48, 8, vt, my);
1387 UNI_MC(qpel, v, 64, 8, vt, my);
1388 
1389 UNI_MC(epel, v, 24, 4, vt, my);
1390 UNI_MC(epel, v, 32, 4, vt, my);
1391 
1392 #undef UNI_MC
1393 
1394 #define UNI_MC_HV(PEL, WIDTH, TAP) \
1395 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
1396  ptrdiff_t dst_stride, \
1397  const uint8_t *src, \
1398  ptrdiff_t src_stride, \
1399  int height, \
1400  intptr_t mx, \
1401  intptr_t my, \
1402  int width) \
1403 { \
1404  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
1405  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
1406  \
1407  hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1408  filter_x, filter_y, height); \
1409 }
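/* Illustrative expansion (hand-written sketch, not generated output): for
 * example, UNI_MC_HV(epel, 8, 4) below produces
 *
 * void ff_hevc_put_hevc_uni_epel_hv8_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
 *                                          const uint8_t *src,
 *                                          ptrdiff_t src_stride, int height,
 *                                          intptr_t mx, intptr_t my, int width)
 * {
 *     const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
 *     const int8_t *filter_y = ff_hevc_epel_filters[my - 1];
 *
 *     hevc_hv_4t_8w_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y,
 *                       height);
 * }
 */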
1410 
1411 UNI_MC_HV(qpel, 8, 8);
1412 UNI_MC_HV(qpel, 16, 8);
1413 UNI_MC_HV(qpel, 24, 8);
1414 UNI_MC_HV(qpel, 32, 8);
1415 UNI_MC_HV(qpel, 48, 8);
1416 UNI_MC_HV(qpel, 64, 8);
1417 
1418 UNI_MC_HV(epel, 8, 4);
1419 UNI_MC_HV(epel, 12, 4);
1420 UNI_MC_HV(epel, 16, 4);
1421 UNI_MC_HV(epel, 24, 4);
1422 UNI_MC_HV(epel, 32, 4);
1423 
1424 #undef UNI_MC_HV