FFmpeg
vp9_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * Byte shuffle control patterns, stored as three rows of 16 indices.  Each
 * row is a sequence of overlapping (n, n + 1) index pairs.  The horizontal
 * filters in this file load the row at offset 0 for 8-pixel-wide work and
 * the row at offset 16 for 4-pixel-wide work, then add 2, 4 and 6 to the
 * loaded pattern to obtain the patterns for the remaining filter taps.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* offset 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* offset 16: 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* offset 32: 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28,
};
33 
/*
 * Two-tap bilinear coefficient pairs.  From one entry to the next the first
 * coefficient drops by 8 and the second rises by 8, so every pair sums to
 * 128.  Presumably one entry per non-zero 1/16 sub-pixel position; confirm
 * against the users of this table.
 */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    { 120,   8 }, { 112,  16 }, { 104,  24 },
    {  96,  32 }, {  88,  40 }, {  80,  48 },
    {  72,  56 }, {  64,  64 }, {  56,  72 },
    {  48,  80 }, {  40,  88 }, {  32,  96 },
    {  24, 104 }, {  16, 112 }, {   8, 120 },
};
51 
/*
 * Eight-tap filter kernel built from four byte dot products.
 *
 * vec0..vec3 are multiplied with filt0..filt3 respectively.  The products of
 * (vec0, vec1) are accumulated in one halfword vector, those of (vec2, vec3)
 * in another, and the two partial sums are combined with __msa_adds_s_h.
 * The macro is a statement expression that evaluates to the v8i16 result.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                     \
                            filt0, filt1, filt2, filt3)                 \
( {                                                                     \
    v8i16 sum01_m, sum23_m;                                             \
                                                                        \
    sum01_m = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);              \
    sum23_m = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);              \
    sum01_m = __msa_dpadd_s_h(sum01_m, (v16i8) vec1, (v16i8) filt1);    \
    sum23_m = __msa_dpadd_s_h(sum23_m, (v16i8) vec3, (v16i8) filt3);    \
                                                                        \
    __msa_adds_s_h(sum01_m, sum23_m);                                   \
} )
65 
/*
 * Horizontal 8-tap filter of the bytes selected from src0/src1.
 *
 * The four masks pick the input bytes for each pair of taps via VSHF_B4_SB,
 * the selections are filtered with FILT_8TAP_DPADD_S_H, and the halfword
 * result is passed through __msa_srari_h(.., 7) and __msa_sat_s_h(.., 7).
 * The macro is a statement expression that evaluates to the v8i16 result.
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,             \
                        filt_h0, filt_h1, filt_h2, filt_h3)                 \
( {                                                                         \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                   \
    v8i16 filt_res_m;                                                       \
                                                                            \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                      \
               shf0_m, shf1_m, shf2_m, shf3_m);                             \
    filt_res_m = FILT_8TAP_DPADD_S_H(shf0_m, shf1_m, shf2_m, shf3_m,        \
                                     filt_h0, filt_h1, filt_h2, filt_h3);   \
    filt_res_m = __msa_sat_s_h(__msa_srari_h(filt_res_m, 7), 7);            \
                                                                            \
    filt_res_m;                                                             \
} )
82 
/*
 * Horizontal 8-tap filter for four source vectors, two vectors per shuffle
 * (src0 with src1, src2 with src3).
 *
 * Each mask selects the bytes for one pair of taps.  Taps 0 and 1 are
 * accumulated in one pair of halfword vectors, taps 2 and 3 in another, and
 * the two are combined by ADDS_SH2_SH into out0 and out1.  The results are
 * left unrounded; rounding and saturation are up to the caller.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 sel0a_m, sel0b_m, sel1a_m, sel1b_m;                               \
    v16i8 sel2a_m, sel2b_m, sel3a_m, sel3b_m;                               \
    v8i16 lo_a_m, lo_b_m, hi_a_m, hi_b_m;                                   \
                                                                            \
    /* gather the input bytes for all four tap pairs */                     \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, sel0a_m, sel0b_m);     \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, sel1a_m, sel1b_m);     \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, sel2a_m, sel2b_m);     \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, sel3a_m, sel3b_m);     \
                                                                            \
    /* taps 0/1 go to lo_*, taps 2/3 go to hi_* */                          \
    DOTP_SB2_SH(sel0a_m, sel0b_m, filt0, filt0, lo_a_m, lo_b_m);            \
    DOTP_SB2_SH(sel2a_m, sel2b_m, filt2, filt2, hi_a_m, hi_b_m);            \
    DPADD_SB2_SH(sel1a_m, sel1b_m, filt1, filt1, lo_a_m, lo_b_m);           \
    DPADD_SB2_SH(sel3a_m, sel3b_m, filt3, filt3, hi_a_m, hi_b_m);           \
                                                                            \
    ADDS_SH2_SH(lo_a_m, hi_a_m, lo_b_m, hi_b_m, out0, out1);                \
}
101 
/*
 * Horizontal 8-tap filter for four source vectors, each shuffled with
 * itself, producing one halfword result vector per source vector.
 *
 * Each mask selects the bytes for one pair of taps.  Taps 0 and 1 are
 * accumulated in lo*_m, taps 2 and 3 in hi*_m, and ADDS_SH4_SH combines
 * them into out0..out3.  The results are left unrounded; rounding and
 * saturation are up to the caller.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 shf0_m, shf1_m, shf2_m, shf3_m;                                   \
    v8i16 lo0_m, lo1_m, lo2_m, lo3_m, hi0_m, hi1_m, hi2_m, hi3_m;           \
                                                                            \
    /* tap pair 0 starts the lo accumulators */                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, shf0_m, shf1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, shf2_m, shf3_m);       \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m,                             \
                filt0, filt0, filt0, filt0,                                 \
                lo0_m, lo1_m, lo2_m, lo3_m);                                \
                                                                            \
    /* tap pair 1 is added to the lo accumulators */                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, shf0_m, shf1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, shf2_m, shf3_m);       \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m,                            \
                 filt1, filt1, filt1, filt1,                                \
                 lo0_m, lo1_m, lo2_m, lo3_m);                               \
                                                                            \
    /* tap pair 2 starts the hi accumulators */                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, shf0_m, shf1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, shf2_m, shf3_m);       \
    DOTP_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m,                             \
                filt2, filt2, filt2, filt2,                                 \
                hi0_m, hi1_m, hi2_m, hi3_m);                                \
                                                                            \
    /* tap pair 3 is added to the hi accumulators */                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, shf0_m, shf1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, shf2_m, shf3_m);       \
    DPADD_SB4_SH(shf0_m, shf1_m, shf2_m, shf3_m,                            \
                 filt3, filt3, filt3, filt3,                                \
                 hi0_m, hi1_m, hi2_m, hi3_m);                               \
                                                                            \
    ADDS_SH4_SH(lo0_m, hi0_m, lo1_m, hi1_m, lo2_m, hi2_m, lo3_m, hi3_m,     \
                out0, out1, out2, out3);                                    \
}
129 
/*
 * Pack in1/in0 with PCKEV_XORI128_UB, average the packed bytes with dst
 * using __msa_aver_u_b, and store the 16-byte result at pdst.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                         \
    v16u8 avg_m;                                                          \
                                                                          \
    avg_m = __msa_aver_u_b(PCKEV_XORI128_UB(in1, in0), (v16u8) dst);      \
    ST_UB(avg_m, (pdst));                                                 \
}
138 
/*
 * Pack in0/in1 with __msa_pckev_b, average the packed bytes with dst using
 * __msa_aver_u_b, and store the 16-byte result at pdst.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                              \
{                                                                         \
    v16u8 avg_m;                                                          \
                                                                          \
    avg_m = __msa_aver_u_b((v16u8) __msa_pckev_b((v16i8) in0,             \
                                                 (v16i8) in1),            \
                           (v16u8) dst);                                  \
    ST_UB(avg_m, (pdst));                                                 \
}
147 
/*
 * Pack (in1, in0) and (in3, in2) with PCKEV_B2_UB, average the two packed
 * vectors with dst0 and dst1 respectively, and hand the result to ST8x4_UB
 * together with pdst and stride.
 */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,                \
                           pdst, stride)                                  \
{                                                                         \
    uint8_t *dst_ptr_m = (uint8_t *) (pdst);                              \
    v16u8 pck0_m, pck1_m;                                                 \
                                                                          \
    PCKEV_B2_UB(in1, in0, in3, in2, pck0_m, pck1_m);                      \
    AVER_UB2_UB(pck0_m, dst0, pck1_m, dst1, pck0_m, pck1_m);              \
    ST8x4_UB(pck0_m, pck1_m, dst_ptr_m, stride);                          \
}
158 
159 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16u8 mask0, mask1, mask2, mask3, out;
164  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 3;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176  mask3 = mask0 + 6;
177 
178  LD_SB4(src, src_stride, src0, src1, src2, src3);
179  XORI_B4_128_SB(src0, src1, src2, src3);
180  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
181  mask3, filt0, filt1, filt2, filt3, out0, out1);
182  SRARI_H2_SH(out0, out1, 7);
183  SAT_SH2_SH(out0, out1, 7);
184  out = PCKEV_XORI128_UB(out0, out1);
185  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
186 }
187 
188 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
189  uint8_t *dst, int32_t dst_stride,
190  const int8_t *filter)
191 {
192  v16i8 filt0, filt1, filt2, filt3;
193  v16i8 src0, src1, src2, src3;
194  v16u8 mask0, mask1, mask2, mask3, out;
195  v8i16 filt, out0, out1, out2, out3;
196 
197  mask0 = LD_UB(&mc_filt_mask_arr[16]);
198  src -= 3;
199 
200  /* rearranging filter */
201  filt = LD_SH(filter);
202  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
203 
204  mask1 = mask0 + 2;
205  mask2 = mask0 + 4;
206  mask3 = mask0 + 6;
207 
208  LD_SB4(src, src_stride, src0, src1, src2, src3);
209  XORI_B4_128_SB(src0, src1, src2, src3);
210  src += (4 * src_stride);
211  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
212  mask3, filt0, filt1, filt2, filt3, out0, out1);
213  LD_SB4(src, src_stride, src0, src1, src2, src3);
214  XORI_B4_128_SB(src0, src1, src2, src3);
215  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
216  mask3, filt0, filt1, filt2, filt3, out2, out3);
217  SRARI_H4_SH(out0, out1, out2, out3, 7);
218  SAT_SH4_SH(out0, out1, out2, out3, 7);
219  out = PCKEV_XORI128_UB(out0, out1);
220  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
221  dst += (4 * dst_stride);
222  out = PCKEV_XORI128_UB(out2, out3);
223  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
224 }
225 
/*
 * Horizontal 8-tap filter for 4-pixel-wide blocks: dispatches on height to
 * the 4x4 or 4x8 implementation.  Any other height is left untouched.
 */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* no other heights are handled here */
        break;
    }
}
236 
237 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
238  uint8_t *dst, int32_t dst_stride,
239  const int8_t *filter)
240 {
241  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
242  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
243  v8i16 filt, out0, out1, out2, out3;
244 
245  mask0 = LD_UB(&mc_filt_mask_arr[0]);
246  src -= 3;
247 
248  /* rearranging filter */
249  filt = LD_SH(filter);
250  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
251 
252  mask1 = mask0 + 2;
253  mask2 = mask0 + 4;
254  mask3 = mask0 + 6;
255 
256  LD_SB4(src, src_stride, src0, src1, src2, src3);
257  XORI_B4_128_SB(src0, src1, src2, src3);
258  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
259  mask3, filt0, filt1, filt2, filt3, out0, out1,
260  out2, out3);
261  SRARI_H4_SH(out0, out1, out2, out3, 7);
262  SAT_SH4_SH(out0, out1, out2, out3, 7);
263  tmp0 = PCKEV_XORI128_UB(out0, out1);
264  tmp1 = PCKEV_XORI128_UB(out2, out3);
265  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
266 }
267 
268 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
269  uint8_t *dst, int32_t dst_stride,
270  const int8_t *filter, int32_t height)
271 {
272  uint32_t loop_cnt;
273  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
274  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
275  v8i16 filt, out0, out1, out2, out3;
276 
277  mask0 = LD_UB(&mc_filt_mask_arr[0]);
278  src -= 3;
279 
280  /* rearranging filter */
281  filt = LD_SH(filter);
282  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
283 
284  mask1 = mask0 + 2;
285  mask2 = mask0 + 4;
286  mask3 = mask0 + 6;
287 
288  for (loop_cnt = (height >> 2); loop_cnt--;) {
289  LD_SB4(src, src_stride, src0, src1, src2, src3);
290  XORI_B4_128_SB(src0, src1, src2, src3);
291  src += (4 * src_stride);
292  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
293  mask3, filt0, filt1, filt2, filt3, out0,
294  out1, out2, out3);
295  SRARI_H4_SH(out0, out1, out2, out3, 7);
296  SAT_SH4_SH(out0, out1, out2, out3, 7);
297  tmp0 = PCKEV_XORI128_UB(out0, out1);
298  tmp1 = PCKEV_XORI128_UB(out2, out3);
299  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
300  dst += (4 * dst_stride);
301  }
302 }
303 
/*
 * Horizontal 8-tap filter for 8-pixel-wide blocks: height 4 uses the
 * dedicated 8x4 routine, every other height goes to the looping routine.
 */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 4) {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
}
315 
316 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
317  uint8_t *dst, int32_t dst_stride,
318  const int8_t *filter, int32_t height)
319 {
320  uint32_t loop_cnt;
321  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
322  v16u8 mask0, mask1, mask2, mask3, out;
323  v8i16 filt, out0, out1, out2, out3;
324 
325  mask0 = LD_UB(&mc_filt_mask_arr[0]);
326  src -= 3;
327 
328  /* rearranging filter */
329  filt = LD_SH(filter);
330  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
331 
332  mask1 = mask0 + 2;
333  mask2 = mask0 + 4;
334  mask3 = mask0 + 6;
335 
336  for (loop_cnt = (height >> 1); loop_cnt--;) {
337  LD_SB2(src, src_stride, src0, src2);
338  LD_SB2(src + 8, src_stride, src1, src3);
339  XORI_B4_128_SB(src0, src1, src2, src3);
340  src += (2 * src_stride);
341  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
342  mask3, filt0, filt1, filt2, filt3, out0,
343  out1, out2, out3);
344  SRARI_H4_SH(out0, out1, out2, out3, 7);
345  SAT_SH4_SH(out0, out1, out2, out3, 7);
346  out = PCKEV_XORI128_UB(out0, out1);
347  ST_UB(out, dst);
348  dst += dst_stride;
349  out = PCKEV_XORI128_UB(out2, out3);
350  ST_UB(out, dst);
351  dst += dst_stride;
352  }
353 }
354 
355 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
356  uint8_t *dst, int32_t dst_stride,
357  const int8_t *filter, int32_t height)
358 {
359  uint32_t loop_cnt;
360  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
361  v16u8 mask0, mask1, mask2, mask3, out;
362  v8i16 filt, out0, out1, out2, out3;
363 
364  mask0 = LD_UB(&mc_filt_mask_arr[0]);
365  src -= 3;
366 
367  /* rearranging filter */
368  filt = LD_SH(filter);
369  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
370 
371  mask1 = mask0 + 2;
372  mask2 = mask0 + 4;
373  mask3 = mask0 + 6;
374 
375  for (loop_cnt = (height >> 1); loop_cnt--;) {
376  src0 = LD_SB(src);
377  src2 = LD_SB(src + 16);
378  src3 = LD_SB(src + 24);
379  src1 = __msa_sldi_b(src2, src0, 8);
380  src += src_stride;
381  XORI_B4_128_SB(src0, src1, src2, src3);
382  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
383  mask3, filt0, filt1, filt2, filt3, out0,
384  out1, out2, out3);
385  SRARI_H4_SH(out0, out1, out2, out3, 7);
386  SAT_SH4_SH(out0, out1, out2, out3, 7);
387 
388  src0 = LD_SB(src);
389  src2 = LD_SB(src + 16);
390  src3 = LD_SB(src + 24);
391  src1 = __msa_sldi_b(src2, src0, 8);
392  src += src_stride;
393 
394  out = PCKEV_XORI128_UB(out0, out1);
395  ST_UB(out, dst);
396  out = PCKEV_XORI128_UB(out2, out3);
397  ST_UB(out, dst + 16);
398  dst += dst_stride;
399 
400  XORI_B4_128_SB(src0, src1, src2, src3);
401  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
402  mask3, filt0, filt1, filt2, filt3, out0,
403  out1, out2, out3);
404  SRARI_H4_SH(out0, out1, out2, out3, 7);
405  SAT_SH4_SH(out0, out1, out2, out3, 7);
406  out = PCKEV_XORI128_UB(out0, out1);
407  ST_UB(out, dst);
408  out = PCKEV_XORI128_UB(out2, out3);
409  ST_UB(out, dst + 16);
410  dst += dst_stride;
411  }
412 }
413 
414 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
415  uint8_t *dst, int32_t dst_stride,
416  const int8_t *filter, int32_t height)
417 {
418  int32_t loop_cnt;
419  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
420  v16u8 mask0, mask1, mask2, mask3, out;
421  v8i16 filt, out0, out1, out2, out3;
422 
423  mask0 = LD_UB(&mc_filt_mask_arr[0]);
424  src -= 3;
425 
426  /* rearranging filter */
427  filt = LD_SH(filter);
428  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
429 
430  mask1 = mask0 + 2;
431  mask2 = mask0 + 4;
432  mask3 = mask0 + 6;
433 
434  for (loop_cnt = height; loop_cnt--;) {
435  src0 = LD_SB(src);
436  src2 = LD_SB(src + 16);
437  src3 = LD_SB(src + 24);
438  src1 = __msa_sldi_b(src2, src0, 8);
439 
440  XORI_B4_128_SB(src0, src1, src2, src3);
441  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
442  mask2, mask3, filt0, filt1, filt2, filt3,
443  out0, out1, out2, out3);
444  SRARI_H4_SH(out0, out1, out2, out3, 7);
445  SAT_SH4_SH(out0, out1, out2, out3, 7);
446  out = PCKEV_XORI128_UB(out0, out1);
447  ST_UB(out, dst);
448  out = PCKEV_XORI128_UB(out2, out3);
449  ST_UB(out, dst + 16);
450 
451  src0 = LD_SB(src + 32);
452  src2 = LD_SB(src + 48);
453  src3 = LD_SB(src + 56);
454  src1 = __msa_sldi_b(src2, src0, 8);
455  src += src_stride;
456 
457  XORI_B4_128_SB(src0, src1, src2, src3);
458  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
459  mask2, mask3, filt0, filt1, filt2, filt3,
460  out0, out1, out2, out3);
461  SRARI_H4_SH(out0, out1, out2, out3, 7);
462  SAT_SH4_SH(out0, out1, out2, out3, 7);
463  out = PCKEV_XORI128_UB(out0, out1);
464  ST_UB(out, dst + 32);
465  out = PCKEV_XORI128_UB(out2, out3);
466  ST_UB(out, dst + 48);
467  dst += dst_stride;
468  }
469 }
470 
471 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
472  uint8_t *dst, int32_t dst_stride,
473  const int8_t *filter, int32_t height)
474 {
475  uint32_t loop_cnt;
476  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
477  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
478  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
479  v16i8 src10998, filt0, filt1, filt2, filt3;
480  v16u8 out;
481  v8i16 filt, out10, out32;
482 
483  src -= (3 * src_stride);
484 
485  filt = LD_SH(filter);
486  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
487 
488  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
489  src += (7 * src_stride);
490 
491  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
492  src54_r, src21_r);
493  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
494  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
495  src4332, src6554);
496  XORI_B3_128_SB(src2110, src4332, src6554);
497 
498  for (loop_cnt = (height >> 2); loop_cnt--;) {
499  LD_SB4(src, src_stride, src7, src8, src9, src10);
500  src += (4 * src_stride);
501 
502  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
503  src87_r, src98_r, src109_r);
504  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
505  XORI_B2_128_SB(src8776, src10998);
506  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
507  filt1, filt2, filt3);
508  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
509  filt1, filt2, filt3);
510  SRARI_H2_SH(out10, out32, 7);
511  SAT_SH2_SH(out10, out32, 7);
512  out = PCKEV_XORI128_UB(out10, out32);
513  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
514  dst += (4 * dst_stride);
515 
516  src2110 = src6554;
517  src4332 = src8776;
518  src6554 = src10998;
519  src6 = src10;
520  }
521 }
522 
523 static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
524  uint8_t *dst, int32_t dst_stride,
525  const int8_t *filter, int32_t height)
526 {
527  uint32_t loop_cnt;
528  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
529  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
530  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
531  v16u8 tmp0, tmp1;
532  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
533 
534  src -= (3 * src_stride);
535 
536  filt = LD_SH(filter);
537  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
538 
539  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
540  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
541  src += (7 * src_stride);
542  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
543  src54_r, src21_r);
544  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
545 
546  for (loop_cnt = (height >> 2); loop_cnt--;) {
547  LD_SB4(src, src_stride, src7, src8, src9, src10);
548  XORI_B4_128_SB(src7, src8, src9, src10);
549  src += (4 * src_stride);
550 
551  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
552  src87_r, src98_r, src109_r);
553  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
554  filt1, filt2, filt3);
555  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
556  filt1, filt2, filt3);
557  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
558  filt1, filt2, filt3);
559  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
560  filt1, filt2, filt3);
561  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
562  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
563  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
564  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
565  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
566  dst += (4 * dst_stride);
567 
568  src10_r = src54_r;
569  src32_r = src76_r;
570  src54_r = src98_r;
571  src21_r = src65_r;
572  src43_r = src87_r;
573  src65_r = src109_r;
574  src6 = src10;
575  }
576 }
577 
578 static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
579  uint8_t *dst, int32_t dst_stride,
580  const int8_t *filter, int32_t height)
581 {
582  uint32_t loop_cnt;
583  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
584  v16i8 filt0, filt1, filt2, filt3;
585  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
586  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
587  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
588  v16u8 tmp0, tmp1, tmp2, tmp3;
589  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
590 
591  src -= (3 * src_stride);
592 
593  filt = LD_SH(filter);
594  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
595 
596  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
597  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
598  src += (7 * src_stride);
599  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
600  src54_r, src21_r);
601  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
602  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
603  src54_l, src21_l);
604  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
605 
606  for (loop_cnt = (height >> 2); loop_cnt--;) {
607  LD_SB4(src, src_stride, src7, src8, src9, src10);
608  XORI_B4_128_SB(src7, src8, src9, src10);
609  src += (4 * src_stride);
610 
611  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
612  src87_r, src98_r, src109_r);
613  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
614  src87_l, src98_l, src109_l);
615  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
616  filt1, filt2, filt3);
617  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
618  filt1, filt2, filt3);
619  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
620  filt1, filt2, filt3);
621  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
622  filt1, filt2, filt3);
623  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
624  filt1, filt2, filt3);
625  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
626  filt1, filt2, filt3);
627  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
628  filt1, filt2, filt3);
629  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
630  filt1, filt2, filt3);
631  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
632  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
633  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
634  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
635  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
636  out3_r, tmp0, tmp1, tmp2, tmp3);
637  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
638  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
639  dst += (4 * dst_stride);
640 
641  src10_r = src54_r;
642  src32_r = src76_r;
643  src54_r = src98_r;
644  src21_r = src65_r;
645  src43_r = src87_r;
646  src65_r = src109_r;
647  src10_l = src54_l;
648  src32_l = src76_l;
649  src54_l = src98_l;
650  src21_l = src65_l;
651  src43_l = src87_l;
652  src65_l = src109_l;
653  src6 = src10;
654  }
655 }
656 
/*
 * Vertical 8-tap filter for blocks whose width is a multiple of 16.
 *
 * The block is processed as width >> 4 vertical strips of 16 columns, left
 * to right.  Each strip is filtered over the full height (in groups of four
 * rows) by common_vt_8t_16w_msa, which performs the same three-row
 * look-back, filter load and row-history handling for a single strip.
 *
 * src/src_stride: source rows.
 * dst/dst_stride: destination rows.
 * filter:         coefficients for the vertical filter.
 * height:         number of rows; height >> 2 groups of four are processed.
 * width:          block width in pixels; width >> 4 strips are processed.
 */
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint32_t strips_left = width >> 4;
    const uint8_t *strip_src = src;
    uint8_t *strip_dst = dst;

    while (strips_left--) {
        common_vt_8t_16w_msa(strip_src, src_stride, strip_dst, dst_stride,
                             filter, height);

        /* advance to the next 16-column strip */
        strip_src += 16;
        strip_dst += 16;
    }
}
745 
/*
 * Vertical 8-tap filter for 32-pixel-wide blocks; forwards to the
 * multiple-of-16 routine with a fixed width of 32.
 */
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 32;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, block_width);
}
753 
/*
 * Vertical 8-tap filter for 64-pixel-wide blocks; forwards to the
 * multiple-of-16 routine with a fixed width of 64.
 */
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 64;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, block_width);
}
761 
762 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
763  uint8_t *dst, int32_t dst_stride,
764  const int8_t *filter_horiz,
765  const int8_t *filter_vert,
766  int32_t height)
767 {
768  uint32_t loop_cnt;
769  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
770  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
771  v16u8 mask0, mask1, mask2, mask3, out;
772  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
773  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
774  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
775 
776  mask0 = LD_UB(&mc_filt_mask_arr[16]);
777  src -= (3 + 3 * src_stride);
778 
779  /* rearranging filter */
780  filt = LD_SH(filter_horiz);
781  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
782 
783  mask1 = mask0 + 2;
784  mask2 = mask0 + 4;
785  mask3 = mask0 + 6;
786 
787  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
788  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
789  src += (7 * src_stride);
790 
791  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
792  filt_hz1, filt_hz2, filt_hz3);
793  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
794  filt_hz1, filt_hz2, filt_hz3);
795  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
796  filt_hz1, filt_hz2, filt_hz3);
797  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
798  filt_hz1, filt_hz2, filt_hz3);
799  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
800 
801  filt = LD_SH(filter_vert);
802  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
803 
804  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
805  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
806 
807  for (loop_cnt = (height >> 2); loop_cnt--;) {
808  LD_SB4(src, src_stride, src7, src8, src9, src10);
809  XORI_B4_128_SB(src7, src8, src9, src10);
810  src += (4 * src_stride);
811 
812  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
813  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
814  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
815  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
816  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
817  filt_vt2, filt_vt3);
818 
819  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
820  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
821  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
822  out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
823  tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
824  filt_vt2, filt_vt3);
825  SRARI_H2_SH(tmp0, tmp1, 7);
826  SAT_SH2_SH(tmp0, tmp1, 7);
827  out = PCKEV_XORI128_UB(tmp0, tmp1);
828  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
829  dst += (4 * dst_stride);
830 
831  hz_out5 = hz_out9;
832  out0 = out2;
833  out1 = out3;
834  out2 = out4;
835  }
836 }
837 
/*
 * Runs an 8-tap horizontal filter and then an 8-tap vertical filter over one
 * 8-byte-wide column and stores the result to dst.
 *
 * src, src_stride   source rows; src is moved back by 3 bytes and 3 rows
 *                   before the first load.
 * dst, dst_stride   destination rows; 4 rows are stored per loop iteration.
 * filter_horiz      horizontal coefficients, loaded as one vector.
 * filter_vert       vertical coefficients, loaded as one vector.
 * height            number of rows; the loop runs (height >> 2) times, so
 *                   only whole groups of 4 rows are produced.
 *
 * NOTE(review): HORIZ_8TAP_FILT, SRARI_*, SAT_*, PCKEV_XORI128_UB and
 * ST8x4_UB are defined outside this listing; the inline remarks about them
 * are read from their names and arguments -- confirm against the macros.
 */
838 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
839  uint8_t *dst, int32_t dst_stride,
840  const int8_t *filter_horiz,
841  const int8_t *filter_vert,
842  int32_t height)
843 {
844  uint32_t loop_cnt;
845  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
846  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
847  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
848  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
849  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
850  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
851  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
852 
    /* mask0 is the "8 width cases" row of mc_filt_mask_arr */
853  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 bytes and 3 rows before loading */
854  src -= (3 + 3 * src_stride);
855 
856  /* rearranging filter */
857  filt = LD_SH(filter_horiz);
858  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
859 
    /* remaining masks: every index of mask0 advanced by 2, 4 and 6 */
860  mask1 = mask0 + 2;
861  mask2 = mask0 + 4;
862  mask3 = mask0 + 6;
863 
    /* the first 7 rows are loaded and horizontally filtered before the loop */
864  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
865  src += (7 * src_stride);
866 
867  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
868  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
869  filt_hz1, filt_hz2, filt_hz3);
870  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
871  filt_hz1, filt_hz2, filt_hz3);
872  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
873  filt_hz1, filt_hz2, filt_hz3);
874  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
875  filt_hz1, filt_hz2, filt_hz3);
876  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
877  filt_hz1, filt_hz2, filt_hz3);
878  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
879  filt_hz1, filt_hz2, filt_hz3);
880  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
881  filt_hz1, filt_hz2, filt_hz3);
882 
883  filt = LD_SH(filter_vert);
884  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
885 
    /*
     * Pair up consecutive horizontal results: out0/out1/out2 hold rows
     * (0,1) (2,3) (4,5); out4/out5/out6 hold rows (1,2) (3,4) (5,6).
     */
886  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
887  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
888  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
889 
    /* each iteration reads 4 new source rows and stores 4 output rows */
890  for (loop_cnt = (height >> 2); loop_cnt--;) {
891  LD_SB4(src, src_stride, src7, src8, src9, src10);
892  src += (4 * src_stride);
893 
894  XORI_B4_128_SB(src7, src8, src9, src10);
895 
896  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
897  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
898  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
899  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
900  filt_vt2, filt_vt3);
901 
902  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
903  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
904  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
905  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
906  filt_vt2, filt_vt3);
907 
908  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
909  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
910  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
911  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
912  filt_vt1, filt_vt2, filt_vt3);
913 
914  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
915  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
916  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
917  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
918  filt_vt2, filt_vt3);
    /* presumably: rounding shift by 7, saturate, pack to bytes, store 8x4 */
919  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
920  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
921  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
922  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
923  ST8x4_UB(vec0, vec1, dst, dst_stride);
924  dst += (4 * dst_stride);
925 
    /* carry the last horizontal row and the row pairs into the next pass */
926  hz_out6 = hz_out10;
927  out0 = out2;
928  out1 = out3;
929  out2 = out8;
930  out4 = out6;
931  out5 = out7;
932  out6 = out9;
933  }
934 }
935 
/*
 * 16-byte-wide variant: calls common_hv_8ht_8vt_8w_msa() twice with the same
 * strides, filters and height, advancing src and dst by 8 bytes after each
 * call.
 */
936 static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
937  uint8_t *dst, int32_t dst_stride,
938  const int8_t *filter_horiz,
939  const int8_t *filter_vert,
940  int32_t height)
941 {
942  int32_t multiple8_cnt;
943 
    /* counts down 2 -> 0: one call per 8-byte column */
944  for (multiple8_cnt = 2; multiple8_cnt--;) {
945  common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
946  filter_vert, height);
947 
948  src += 8;
949  dst += 8;
950  }
951 }
952 
/*
 * 32-byte-wide variant: calls common_hv_8ht_8vt_8w_msa() four times with the
 * same strides, filters and height, advancing src and dst by 8 bytes after
 * each call.
 */
953 static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
954  uint8_t *dst, int32_t dst_stride,
955  const int8_t *filter_horiz,
956  const int8_t *filter_vert,
957  int32_t height)
958 {
959  int32_t multiple8_cnt;
960 
    /* counts down 4 -> 0: one call per 8-byte column */
961  for (multiple8_cnt = 4; multiple8_cnt--;) {
962  common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
963  filter_vert, height);
964 
965  src += 8;
966  dst += 8;
967  }
968 }
969 
/*
 * 64-byte-wide variant: calls common_hv_8ht_8vt_8w_msa() eight times with the
 * same strides, filters and height, advancing src and dst by 8 bytes after
 * each call.
 */
970 static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
971  uint8_t *dst, int32_t dst_stride,
972  const int8_t *filter_horiz,
973  const int8_t *filter_vert,
974  int32_t height)
975 {
976  int32_t multiple8_cnt;
977 
    /* counts down 8 -> 0: one call per 8-byte column */
978  for (multiple8_cnt = 8; multiple8_cnt--;) {
979  common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
980  filter_vert, height);
981 
982  src += 8;
983  dst += 8;
984  }
985 }
986 
988  int32_t src_stride,
989  uint8_t *dst, int32_t dst_stride,
990  const int8_t *filter)
991 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 986 to 988). Judging by
     * the argument list of the call made for height 4 in the 4-wide
     * dispatcher, it is presumably common_hz_8t_and_aver_dst_4x4_msa --
     * confirm.
     *
     * Horizontally filters 4 source rows with the coefficients in `filter`,
     * averages the packed result with the 4 words already stored in dst
     * (__msa_aver_u_b) and writes the 4x4 block back to dst.
     */
992  uint32_t tp0, tp1, tp2, tp3;
993  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
994  v16u8 dst0, res;
995  v16u8 mask0, mask1, mask2, mask3;
996  v8i16 filt, res0, res1;
997 
    /* offset 16 selects the first "4 width cases" row of mc_filt_mask_arr */
998  mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* start reading 3 bytes to the left of src */
999  src -= 3;
1000 
1001  /* rearranging filter */
1002  filt = LD_SH(filter);
1003  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1004 
1005  mask1 = mask0 + 2;
1006  mask2 = mask0 + 4;
1007  mask3 = mask0 + 6;
1008 
1009  LD_SB4(src, src_stride, src0, src1, src2, src3);
1010  XORI_B4_128_SB(src0, src1, src2, src3);
1011  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1012  mask3, filt0, filt1, filt2, filt3, res0, res1);
    /* gather one 32-bit word from each of the 4 dst rows into dst0 */
1013  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1014  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    /* presumably: rounding shift by 7, saturate, pack to bytes */
1015  SRARI_H2_SH(res0, res1, 7);
1016  SAT_SH2_SH(res0, res1, 7);
1017  res = PCKEV_XORI128_UB(res0, res1);
1018  res = (v16u8) __msa_aver_u_b(res, dst0);
1019  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
1020 }
1021 
1023  int32_t src_stride,
1024  uint8_t *dst, int32_t dst_stride,
1025  const int8_t *filter)
1026 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1021 to 1023). Judging
     * by the argument list of the call made for height 8 in the 4-wide
     * dispatcher, it is presumably common_hz_8t_and_aver_dst_4x8_msa --
     * confirm.
     *
     * Horizontally filters 8 source rows (two groups of 4) with the
     * coefficients in `filter`, averages the packed result with the 8 words
     * already stored in dst and writes the 4x8 block back to dst.
     */
1027  uint32_t tp0, tp1, tp2, tp3;
1028  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1029  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
1030  v16u8 dst0, dst1;
1031  v8i16 filt, vec0, vec1, vec2, vec3;
1032 
    /* offset 16 selects the first "4 width cases" row of mc_filt_mask_arr */
1033  mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* start reading 3 bytes to the left of src */
1034  src -= 3;
1035 
1036  /* rearranging filter */
1037  filt = LD_SH(filter);
1038  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1039 
1040  mask1 = mask0 + 2;
1041  mask2 = mask0 + 4;
1042  mask3 = mask0 + 6;
1043 
    /* rows 0..3 of the source */
1044  LD_SB4(src, src_stride, src0, src1, src2, src3);
1045  XORI_B4_128_SB(src0, src1, src2, src3);
1046  src += (4 * src_stride);
    /* dst rows 0..3 go into dst0, dst rows 4..7 into dst1 */
1047  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1048  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1049  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
1050  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1051  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1052  mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    /* rows 4..7 of the source reuse the src0..src3 registers */
1053  LD_SB4(src, src_stride, src0, src1, src2, src3);
1054  XORI_B4_128_SB(src0, src1, src2, src3);
1055  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1056  mask3, filt0, filt1, filt2, filt3, vec2, vec3);
    /* presumably: rounding shift by 7, saturate, pack to bytes */
1057  SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
1058  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
1059  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1060  res0, res1, res2, res3);
1061  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
1062  XORI_B2_128_UB(res0, res2);
    /* average with the existing dst contents, then store all 8 rows */
1063  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
1064  ST4x8_UB(res0, res2, dst, dst_stride);
1065 }
1066 
1068  int32_t src_stride,
1069  uint8_t *dst, int32_t dst_stride,
1070  const int8_t *filter,
1071  int32_t height)
1072 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1066 to 1068).
     *
     * Dispatches on height: 4 goes to the 4x4 routine, 8 to the 4x8 routine.
     * Any other height falls through both tests and nothing is written.
     */
1073  if (4 == height) {
1074  common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
1075  filter);
1076  } else if (8 == height) {
1077  common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
1078  filter);
1079  }
1080 }
1081 
1083  int32_t src_stride,
1084  uint8_t *dst, int32_t dst_stride,
1085  const int8_t *filter,
1086  int32_t height)
1087 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1081 to 1083).
     *
     * Horizontally filters an 8-byte-wide column with the coefficients in
     * `filter` and combines the result with the existing dst contents via
     * CONVERT_UB_AVG_ST8x4_UB (defined outside this listing; presumably
     * pack + average + store). The loop runs (height >> 2) times and handles
     * 4 rows per iteration.
     */
1088  int32_t loop_cnt;
1089  int64_t tp0, tp1, tp2, tp3;
1090  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1091  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1092  v8i16 filt, out0, out1, out2, out3;
1093 
    /* mask0 is the "8 width cases" row of mc_filt_mask_arr */
1094  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start reading 3 bytes to the left of src */
1095  src -= 3;
1096 
1097  /* rearranging filter */
1098  filt = LD_SH(filter);
1099  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1100 
1101  mask1 = mask0 + 2;
1102  mask2 = mask0 + 4;
1103  mask3 = mask0 + 6;
1104 
1105  for (loop_cnt = (height >> 2); loop_cnt--;) {
1106  LD_SB4(src, src_stride, src0, src1, src2, src3);
1107  XORI_B4_128_SB(src0, src1, src2, src3);
1108  src += (4 * src_stride);
1109  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1110  mask3, filt0, filt1, filt2, filt3, out0,
1111  out1, out2, out3);
    /* two 64-bit dst rows per vector: rows 0,1 in dst0 and rows 2,3 in dst1 */
1112  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1113  INSERT_D2_UB(tp0, tp1, dst0);
1114  INSERT_D2_UB(tp2, tp3, dst1);
1115  SRARI_H4_SH(out0, out1, out2, out3, 7);
1116  SAT_SH4_SH(out0, out1, out2, out3, 7);
1117  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1118  dst, dst_stride);
1119  dst += (4 * dst_stride);
1120  }
1121 }
1122 
1124  int32_t src_stride,
1125  uint8_t *dst, int32_t dst_stride,
1126  const int8_t *filter,
1127  int32_t height)
1128 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1122 to 1124).
     *
     * Horizontally filters a 16-byte-wide column with the coefficients in
     * `filter` and averages the result with dst. The loop runs
     * (height >> 1) times and handles 2 rows per iteration; each row is
     * processed as two vectors loaded from src and src + 8.
     */
1129  int32_t loop_cnt;
1130  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1131  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1132  v8i16 filt, out0, out1, out2, out3;
1133  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1134  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1135 
1136  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start reading 3 bytes to the left of src */
1137  src -= 3;
1138 
1139  /* rearranging filter */
1140  filt = LD_SH(filter);
1141  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1142 
1143  mask1 = mask0 + 2;
1144  mask2 = mask0 + 4;
1145  mask3 = mask0 + 6;
1146 
1147  for (loop_cnt = height >> 1; loop_cnt--;) {
    /* src0/src1: first row at offsets 0 and 8; src2/src3: second row */
1148  LD_SB2(src, src_stride, src0, src2);
1149  LD_SB2(src + 8, src_stride, src1, src3);
1150  src += (2 * src_stride);
1151 
1152  XORI_B4_128_SB(src0, src1, src2, src3);
    /* one shuffle per mask for each of the 4 vectors */
1153  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1154  vec12);
1155  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1156  vec13);
1157  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1158  vec14);
1159  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1160  vec15);
    /*
     * two partial sums per vector: filt0/filt1 terms accumulate in
     * vec0..vec3, filt2/filt3 terms in vec8..vec11; ADDS_* then combines them
     */
1161  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1162  vec1, vec2, vec3);
1163  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1164  vec9, vec10, vec11);
1165  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1166  vec1, vec2, vec3);
1167  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1168  vec8, vec9, vec10, vec11);
1169  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1170  out1, out2, out3);
1171  LD_UB2(dst, dst_stride, dst0, dst1);
1172  SRARI_H4_SH(out0, out1, out2, out3, 7);
1173  SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* out0/out1 belong to the first row, out2/out3 to the second */
1174  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
1175  dst += dst_stride;
1176  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
1177  dst += dst_stride;
1178  }
1179 }
1180 
1182  int32_t src_stride,
1183  uint8_t *dst, int32_t dst_stride,
1184  const int8_t *filter,
1185  int32_t height)
1186 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1180 to 1182).
     *
     * Horizontally filters a 32-byte-wide column with the coefficients in
     * `filter` and averages the result with dst. One row is handled per
     * iteration, `height` iterations in total; two 16-byte vectors are
     * written per row (dst and dst + 16).
     */
1187  uint32_t loop_cnt;
1188  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1189  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1190  v8i16 filt, out0, out1, out2, out3;
1191  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1192  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1193 
1194  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start reading 3 bytes to the left of src */
1195  src -= 3;
1196 
1197  /* rearranging filter */
1198  filt = LD_SH(filter);
1199  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1200 
1201  mask1 = mask0 + 2;
1202  mask2 = mask0 + 4;
1203  mask3 = mask0 + 6;
1204 
1205  for (loop_cnt = height; loop_cnt--;) {
    /*
     * Vectors are loaded at offsets 0, 16 and 24; src1 is built from src0
     * and src2 with __msa_sldi_b(.., 8) instead of a fourth load --
     * presumably yielding the bytes starting at offset 8.
     */
1206  src0 = LD_SB(src);
1207  src2 = LD_SB(src + 16);
1208  src3 = LD_SB(src + 24);
1209  src1 = __msa_sldi_b(src2, src0, 8);
1210  src += src_stride;
1211 
1212  XORI_B4_128_SB(src0, src1, src2, src3);
1213  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1214  vec12);
1215  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1216  vec13);
1217  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1218  vec14);
1219  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1220  vec15);
    /* filt0/filt1 terms go to vec0..vec3, filt2/filt3 terms to vec8..vec11 */
1221  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1222  vec1, vec2, vec3);
1223  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1224  vec9, vec10, vec11);
1225  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1226  vec1, vec2, vec3);
1227  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1228  vec8, vec9, vec10, vec11);
1229  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1230  out1, out2, out3);
1231  SRARI_H4_SH(out0, out1, out2, out3, 7);
1232  SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* existing dst bytes 0..15 and 16..31 of this row */
1233  LD_UB2(dst, 16, dst1, dst2);
1234  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
1235  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
1236  dst += dst_stride;
1237  }
1238 }
1239 
1241  int32_t src_stride,
1242  uint8_t *dst, int32_t dst_stride,
1243  const int8_t *filter,
1244  int32_t height)
1245 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1239 to 1241).
     *
     * Horizontally filters a 64-byte-wide column with the coefficients in
     * `filter` and averages the result with dst. The outer loop handles one
     * row per iteration (`height` iterations); the inner loop handles the
     * row's two 32-byte halves at byte offset (cnt << 5).
     */
1246  uint32_t loop_cnt, cnt;
1247  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1248  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1249  v8i16 filt, out0, out1, out2, out3;
1250  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1251  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1252 
1253  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start reading 3 bytes to the left of src */
1254  src -= 3;
1255 
1256  /* rearranging filter */
1257  filt = LD_SH(filter);
1258  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1259 
1260  mask1 = mask0 + 2;
1261  mask2 = mask0 + 4;
1262  mask3 = mask0 + 6;
1263 
1264  for (loop_cnt = height; loop_cnt--;) {
1265  for (cnt = 0; cnt < 2; ++cnt) {
    /* loads at offsets 0, 16, 24 of this half; src1 derived via sldi */
1266  src0 = LD_SB(&src[cnt << 5]);
1267  src2 = LD_SB(&src[16 + (cnt << 5)]);
1268  src3 = LD_SB(&src[24 + (cnt << 5)]);
1269  src1 = __msa_sldi_b(src2, src0, 8);
1270 
1271  XORI_B4_128_SB(src0, src1, src2, src3);
1272  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1273  vec12);
1274  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1275  vec13);
1276  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1277  vec10, vec14);
1278  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1279  vec11, vec15);
1280  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1281  vec0, vec1, vec2, vec3);
1282  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1283  vec8, vec9, vec10, vec11);
1284  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1285  vec0, vec1, vec2, vec3);
1286  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1287  vec8, vec9, vec10, vec11);
1288  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1289  out1, out2, out3);
1290  SRARI_H4_SH(out0, out1, out2, out3, 7);
1291  SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* average with and overwrite the matching 32 bytes of dst */
1292  LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
1293  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
1294  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
1295  }
1296 
    /* pointers advance once per row, after both halves are done */
1297  src += src_stride;
1298  dst += dst_stride;
1299  }
1300 }
1301 
1303  int32_t src_stride,
1304  uint8_t *dst, int32_t dst_stride,
1305  const int8_t *filter,
1306  int32_t height)
1307 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1301 to 1303).
     *
     * Vertically filters a 4-byte-wide column with the coefficients in
     * `filter`, averages the result with the existing dst contents
     * (__msa_aver_u_b) and stores it. Reading starts 3 rows above src; the
     * loop runs (height >> 2) times and handles 4 rows per iteration.
     */
1308  uint32_t loop_cnt;
1309  uint32_t tp0, tp1, tp2, tp3;
1310  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1311  v16u8 dst0, out;
1312  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1313  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
1314  v16i8 src10998, filt0, filt1, filt2, filt3;
1315  v8i16 filt, out10, out32;
1316 
1317  src -= (3 * src_stride);
1318 
1319  filt = LD_SH(filter);
1320  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1321 
    /* the first 7 rows are loaded before the loop */
1322  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1323  src += (7 * src_stride);
1324 
    /*
     * Interleave adjacent rows (srcBA_r pairs row A with row B), then join
     * two such pairs per vector: src2110 combines pairs (1,0) and (2,1), etc.
     */
1325  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1326  src54_r, src21_r);
1327  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1328  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1329  src4332, src6554);
1330  XORI_B3_128_SB(src2110, src4332, src6554);
1331 
1332  for (loop_cnt = (height >> 2); loop_cnt--;) {
1333  LD_SB4(src, src_stride, src7, src8, src9, src10);
1334  src += (4 * src_stride);
1335 
    /* gather one 32-bit word from each of the 4 dst rows into dst0 */
1336  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1337  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1338  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1339  src87_r, src98_r, src109_r);
1340  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1341  XORI_B2_128_SB(src8776, src10998);
1342  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1343  filt1, filt2, filt3);
1344  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1345  filt1, filt2, filt3);
1346  SRARI_H2_SH(out10, out32, 7);
1347  SAT_SH2_SH(out10, out32, 7);
1348  out = PCKEV_XORI128_UB(out10, out32);
1349  out = __msa_aver_u_b(out, dst0);
1350 
1351  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1352  dst += (4 * dst_stride);
1353 
    /* slide the window down by 4 rows; src6 keeps the last loaded row */
1354  src2110 = src6554;
1355  src4332 = src8776;
1356  src6554 = src10998;
1357  src6 = src10;
1358  }
1359 }
1360 
1362  int32_t src_stride,
1363  uint8_t *dst, int32_t dst_stride,
1364  const int8_t *filter,
1365  int32_t height)
1366 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1360 to 1362).
     *
     * Vertically filters an 8-byte-wide column with the coefficients in
     * `filter` and combines the result with the existing dst contents via
     * CONVERT_UB_AVG_ST8x4_UB (defined outside this listing; presumably
     * pack + average + store). Reading starts 3 rows above src; the loop
     * runs (height >> 2) times and handles 4 rows per iteration.
     */
1367  uint32_t loop_cnt;
1368  uint64_t tp0, tp1, tp2, tp3;
1369  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1370  v16u8 dst0, dst1;
1371  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1372  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1373  v8i16 filt, out0, out1, out2, out3;
1374 
1375  src -= (3 * src_stride);
1376 
1377  filt = LD_SH(filter);
1378  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1379 
    /* the first 7 rows are loaded before the loop */
1380  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1381  src += (7 * src_stride);
1382 
1383  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* srcBA_r pairs row A with the following row B */
1384  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1385  src54_r, src21_r);
1386  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1387 
1388  for (loop_cnt = (height >> 2); loop_cnt--;) {
1389  LD_SB4(src, src_stride, src7, src8, src9, src10);
1390  src += (4 * src_stride);
1391 
    /* two 64-bit dst rows per vector: rows 0,1 in dst0 and rows 2,3 in dst1 */
1392  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1393  INSERT_D2_UB(tp0, tp1, dst0);
1394  INSERT_D2_UB(tp2, tp3, dst1);
1395  XORI_B4_128_SB(src7, src8, src9, src10);
1396  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1397  src87_r, src98_r, src109_r);
    /* one output row per call, each using four consecutive row pairs */
1398  out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1399  filt1, filt2, filt3);
1400  out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1401  filt1, filt2, filt3);
1402  out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1403  filt1, filt2, filt3);
1404  out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1405  filt1, filt2, filt3);
1406  SRARI_H4_SH(out0, out1, out2, out3, 7);
1407  SAT_SH4_SH(out0, out1, out2, out3, 7);
1408  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1409  dst, dst_stride);
1410  dst += (4 * dst_stride);
1411 
    /* slide the row-pair window down by 4 rows for the next iteration */
1412  src10_r = src54_r;
1413  src32_r = src76_r;
1414  src54_r = src98_r;
1415  src21_r = src65_r;
1416  src43_r = src87_r;
1417  src65_r = src109_r;
1418  src6 = src10;
1419  }
1420 }
1421 
1423  int32_t src_stride,
1424  uint8_t *dst,
1425  int32_t dst_stride,
1426  const int8_t *filter,
1427  int32_t height,
1428  int32_t width)
1429 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1421 to 1423). The
     * callers that pass a trailing width of 16, 32 and 64 name it
     * common_vt_8t_and_aver_dst_16w_mult_msa, which matches this 7-argument
     * signature -- confirm.
     *
     * Vertically filters a block with the coefficients in `filter`, averages
     * the result with the existing dst contents and stores it. The block is
     * processed as (width >> 4) columns of 16 bytes; within a column the
     * inner loop runs (height >> 2) times and handles 4 rows per iteration.
     * Reading starts 3 rows above src.
     */
1430  const uint8_t *src_tmp;
1431  uint8_t *dst_tmp;
1432  uint32_t loop_cnt, cnt;
1433  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1434  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1435  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1436  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1437  v16i8 filt0, filt1, filt2, filt3;
1438  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1439  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
1440 
1441  src -= (3 * src_stride);
1442 
1443  filt = LD_SH(filter);
1444  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1445 
1446  for (cnt = (width >> 4); cnt--;) {
    /* per-column cursors; src/dst themselves only move sideways */
1447  src_tmp = src;
1448  dst_tmp = dst;
1449 
1450  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1451  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1452  src_tmp += (7 * src_stride);
1453 
    /* _r and _l row pairs are built by the ILVR_* and ILVL_* macros */
1454  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1455  src32_r, src54_r, src21_r);
1456  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1457  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1458  src32_l, src54_l, src21_l);
1459  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1460 
1461  for (loop_cnt = (height >> 2); loop_cnt--;) {
1462  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1463  src_tmp += (4 * src_stride);
1464 
    /* existing dst rows, needed for the averaging step below */
1465  LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1466  XORI_B4_128_SB(src7, src8, src9, src10);
1467  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1468  src87_r, src98_r, src109_r);
1469  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1470  src87_l, src98_l, src109_l);
1471  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1472  filt0, filt1, filt2, filt3);
1473  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1474  filt0, filt1, filt2, filt3);
1475  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1476  filt0, filt1, filt2, filt3);
1477  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1478  filt0, filt1, filt2, filt3);
1479  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1480  filt0, filt1, filt2, filt3);
1481  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1482  filt0, filt1, filt2, filt3);
1483  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1484  filt0, filt1, filt2, filt3);
1485  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1486  filt0, filt1, filt2, filt3);
1487  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1488  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1489  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1490  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    /* merge each row's _l and _r results into one 16-byte vector */
1491  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1492  out3_r, tmp0, tmp1, tmp2, tmp3);
1493  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1494  AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1495  dst0, dst1, dst2, dst3);
1496  ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1497  dst_tmp += (4 * dst_stride);
1498 
    /* slide both row-pair windows down by 4 rows */
1499  src10_r = src54_r;
1500  src32_r = src76_r;
1501  src54_r = src98_r;
1502  src21_r = src65_r;
1503  src43_r = src87_r;
1504  src65_r = src109_r;
1505  src10_l = src54_l;
1506  src32_l = src76_l;
1507  src54_l = src98_l;
1508  src21_l = src65_l;
1509  src43_l = src87_l;
1510  src65_l = src109_l;
1511  src6 = src10;
1512  }
1513 
    /* move on to the next 16-byte column */
1514  src += 16;
1515  dst += 16;
1516  }
1517 }
1518 
1520  int32_t src_stride,
1521  uint8_t *dst, int32_t dst_stride,
1522  const int8_t *filter,
1523  int32_t height)
1524 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1518 to 1520).
     *
     * Thin wrapper: forwards all arguments to
     * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 16.
     */
1525  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1526  filter, height, 16);
1527 }
1528 
1530  int32_t src_stride,
1531  uint8_t *dst, int32_t dst_stride,
1532  const int8_t *filter,
1533  int32_t height)
1534 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1528 to 1530).
     *
     * Thin wrapper: forwards all arguments to
     * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 32.
     */
1535  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1536  filter, height, 32);
1537 }
1538 
1540  int32_t src_stride,
1541  uint8_t *dst, int32_t dst_stride,
1542  const int8_t *filter,
1543  int32_t height)
1544 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1538 to 1540).
     *
     * Thin wrapper: forwards all arguments to
     * common_vt_8t_and_aver_dst_16w_mult_msa() with a fixed width of 64.
     */
1545  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1546  filter, height, 64);
1547 }
1548 
1550  int32_t src_stride,
1551  uint8_t *dst,
1552  int32_t dst_stride,
1553  const int8_t *filter_horiz,
1554  const int8_t *filter_vert,
1555  int32_t height)
1556 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1548 to 1550).
     *
     * Runs an 8-tap horizontal filter and then an 8-tap vertical filter over
     * a 4-byte-wide column, averages the result with the existing dst
     * contents (__msa_aver_u_b) and stores it. Reading starts 3 bytes left
     * of and 3 rows above src; the loop runs (height >> 2) times and handles
     * 4 rows per iteration.
     */
1557  uint32_t loop_cnt;
1558  uint32_t tp0, tp1, tp2, tp3;
1559  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1560  v16u8 dst0, res, mask0, mask1, mask2, mask3;
1561  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1562  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1563  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
1564  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1565 
    /* offset 16 selects the first "4 width cases" row of mc_filt_mask_arr */
1566  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1567  src -= (3 + 3 * src_stride);
1568 
1569  /* rearranging filter */
1570  filt = LD_SH(filter_horiz);
1571  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1572 
1573  mask1 = mask0 + 2;
1574  mask2 = mask0 + 4;
1575  mask3 = mask0 + 6;
1576 
    /* the first 7 rows are loaded before the loop */
1577  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1578  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1579  src += (7 * src_stride);
1580 
    /*
     * Each HORIZ_8TAP_FILT call here takes two different rows, so each
     * hz_out presumably carries two filtered rows; the odd-numbered
     * hz_out1/hz_out3 are then derived with SLDI_B2_SH(.., 8).
     */
1581  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
1582  filt_hz1, filt_hz2, filt_hz3);
1583  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
1584  filt_hz1, filt_hz2, filt_hz3);
1585  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
1586  filt_hz1, filt_hz2, filt_hz3);
1587  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
1588  filt_hz1, filt_hz2, filt_hz3);
1589  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
1590 
1591  filt = LD_SH(filter_vert);
1592  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1593 
1594  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1595  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1596 
1597  for (loop_cnt = (height >> 2); loop_cnt--;) {
1598  LD_SB4(src, src_stride, src7, src8, src9, src10);
1599  XORI_B4_128_SB(src7, src8, src9, src10);
1600  src += (4 * src_stride);
1601 
    /* gather one 32-bit word from each of the 4 dst rows into dst0 */
1602  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1603  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1604  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
1605  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1606  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1607  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1608  res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
1609  filt_vt2, filt_vt3);
1610 
1611  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
1612  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1613  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
1614  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1615  res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
1616  filt_vt2, filt_vt3);
1617 
    /* presumably: rounding shift by 7, saturate, pack; then average + store */
1618  SRARI_H2_SH(res0, res1, 7);
1619  SAT_SH2_SH(res0, res1, 7);
1620  res = PCKEV_XORI128_UB(res0, res1);
1621  res = (v16u8) __msa_aver_u_b(res, dst0);
1622  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
1623  dst += (4 * dst_stride);
1624 
    /* carry the newest horizontal result and row pairs into the next pass */
1625  hz_out5 = hz_out9;
1626  vec0 = vec2;
1627  vec1 = vec3;
1628  vec2 = vec4;
1629  }
1630 }
1631 
1633  int32_t src_stride,
1634  uint8_t *dst,
1635  int32_t dst_stride,
1636  const int8_t *filter_horiz,
1637  const int8_t *filter_vert,
1638  int32_t height)
1639 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1631 to 1633). The
     * multi-column wrappers call common_hv_8ht_8vt_and_aver_dst_8w_msa with
     * this argument list, so that is presumably the name -- confirm.
     *
     * Runs an 8-tap horizontal filter and then an 8-tap vertical filter over
     * one 8-byte-wide column and combines the result with the existing dst
     * contents via CONVERT_UB_AVG_ST8x4_UB (defined outside this listing;
     * presumably pack + average + store). Reading starts 3 bytes left of and
     * 3 rows above src; the loop runs (height >> 2) times and handles 4 rows
     * per iteration.
     */
1640  uint32_t loop_cnt;
1641  uint64_t tp0, tp1, tp2, tp3;
1642  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1643  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1644  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1645  v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
1646  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1647  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
1648  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
1649 
    /* mask0 is the "8 width cases" row of mc_filt_mask_arr */
1650  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1651  src -= (3 + 3 * src_stride);
1652 
1653  /* rearranging filter */
1654  filt = LD_SH(filter_horiz);
1655  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1656 
1657  mask1 = mask0 + 2;
1658  mask2 = mask0 + 4;
1659  mask3 = mask0 + 6;
1660 
    /* the first 7 rows are loaded and horizontally filtered before the loop */
1661  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1662  src += (7 * src_stride);
1663 
1664  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1665  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1666  filt_hz1, filt_hz2, filt_hz3);
1667  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1668  filt_hz1, filt_hz2, filt_hz3);
1669  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1670  filt_hz1, filt_hz2, filt_hz3);
1671  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1672  filt_hz1, filt_hz2, filt_hz3);
1673  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1674  filt_hz1, filt_hz2, filt_hz3);
1675  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1676  filt_hz1, filt_hz2, filt_hz3);
1677  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1678  filt_hz1, filt_hz2, filt_hz3);
1679 
1680  filt = LD_SH(filter_vert);
1681  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1682 
    /*
     * Pair up consecutive horizontal results: out0/out1/out2 hold rows
     * (0,1) (2,3) (4,5); out4/out5/out6 hold rows (1,2) (3,4) (5,6).
     */
1683  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1684  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1685  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1686 
1687  for (loop_cnt = (height >> 2); loop_cnt--;) {
1688  LD_SB4(src, src_stride, src7, src8, src9, src10);
1689  XORI_B4_128_SB(src7, src8, src9, src10);
1690  src += (4 * src_stride);
1691 
    /* two 64-bit dst rows per vector: rows 0,1 in dst0 and rows 2,3 in dst1 */
1692  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1693  INSERT_D2_UB(tp0, tp1, dst0);
1694  INSERT_D2_UB(tp2, tp3, dst1);
1695 
1696  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
1697  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1698  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1699  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
1700  filt_vt2, filt_vt3);
1701 
1702  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
1703  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1704  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1705  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
1706  filt_vt2, filt_vt3);
1707 
1708  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
1709  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1710  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1711  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
1712  filt_vt2, filt_vt3);
1713 
1714  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
1715  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1716  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1717  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
1718  filt_vt2, filt_vt3);
1719 
1720  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1721  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1722  CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
1723  dst, dst_stride);
1724  dst += (4 * dst_stride);
1725 
    /* carry the last horizontal row and the row pairs into the next pass */
1726  hz_out6 = hz_out10;
1727  out0 = out2;
1728  out1 = out3;
1729  out2 = out8;
1730  out4 = out6;
1731  out5 = out7;
1732  out6 = out9;
1733  }
1734 }
1735 
1737  int32_t src_stride,
1738  uint8_t *dst,
1739  int32_t dst_stride,
1740  const int8_t *filter_horiz,
1741  const int8_t *filter_vert,
1742  int32_t height)
1743 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1735 to 1737).
     *
     * Calls common_hv_8ht_8vt_and_aver_dst_8w_msa() twice with the same
     * strides, filters and height, advancing src and dst by 8 bytes after
     * each call (16 bytes of width in total).
     */
1744  int32_t multiple8_cnt;
1745 
1746  for (multiple8_cnt = 2; multiple8_cnt--;) {
1747  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1748  filter_horiz, filter_vert,
1749  height);
1750 
1751  src += 8;
1752  dst += 8;
1753  }
1754 }
1755 
1757  int32_t src_stride,
1758  uint8_t *dst,
1759  int32_t dst_stride,
1760  const int8_t *filter_horiz,
1761  const int8_t *filter_vert,
1762  int32_t height)
1763 {
    /*
     * NOTE(review): the line carrying this function's name is missing from
     * the listing (the embedded numbering jumps from 1755 to 1757).
     *
     * Calls common_hv_8ht_8vt_and_aver_dst_8w_msa() four times with the same
     * strides, filters and height, advancing src and dst by 8 bytes after
     * each call (32 bytes of width in total).
     */
1764  int32_t multiple8_cnt;
1765 
1766  for (multiple8_cnt = 4; multiple8_cnt--;) {
1767  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1768  filter_horiz, filter_vert,
1769  height);
1770 
1771  src += 8;
1772  dst += 8;
1773  }
1774 }
1775 
1777  int32_t src_stride,
1778  uint8_t *dst,
1779  int32_t dst_stride,
1780  const int8_t *filter_horiz,
1781  const int8_t *filter_vert,
1782  int32_t height)
1783 {
1784  int32_t multiple8_cnt;
1785 
1786  for (multiple8_cnt = 8; multiple8_cnt--;) {
1787  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1788  filter_horiz, filter_vert,
1789  height);
1790 
1791  src += 8;
1792  dst += 8;
1793  }
1794 }
1795 
1796 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1797  uint8_t *dst, int32_t dst_stride,
1798  const int8_t *filter)
1799 {
1800  v16i8 src0, src1, src2, src3, mask;
1801  v16u8 filt0, vec0, vec1, res0, res1;
1802  v8u16 vec2, vec3, filt;
1803 
1804  mask = LD_SB(&mc_filt_mask_arr[16]);
1805 
1806  /* rearranging filter */
1807  filt = LD_UH(filter);
1808  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1809 
1810  LD_SB4(src, src_stride, src0, src1, src2, src3);
1811  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1812  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1813  SRARI_H2_UH(vec2, vec3, 7);
1814  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1815  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1816 }
1817 
1818 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1819  uint8_t *dst, int32_t dst_stride,
1820  const int8_t *filter)
1821 {
1822  v16u8 vec0, vec1, vec2, vec3, filt0;
1823  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1824  v16i8 res0, res1, res2, res3;
1825  v8u16 vec4, vec5, vec6, vec7, filt;
1826 
1827  mask = LD_SB(&mc_filt_mask_arr[16]);
1828 
1829  /* rearranging filter */
1830  filt = LD_UH(filter);
1831  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1832 
1833  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1834  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1835  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1836  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1837  vec4, vec5, vec6, vec7);
1838  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1839  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1840  res0, res1, res2, res3);
1841  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1842  dst += (4 * dst_stride);
1843  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
1844 }
1845 
1846 void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1847  const uint8_t *src, ptrdiff_t src_stride,
1848  int height, int mx, int my)
1849 {
1850  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1851 
1852  if (4 == height) {
1853  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1854  } else if (8 == height) {
1855  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1856  }
1857 }
1858 
1859 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1860  uint8_t *dst, int32_t dst_stride,
1861  const int8_t *filter)
1862 {
1863  v16u8 filt0;
1864  v16i8 src0, src1, src2, src3, mask;
1865  v8u16 vec0, vec1, vec2, vec3, filt;
1866 
1867  mask = LD_SB(&mc_filt_mask_arr[0]);
1868 
1869  /* rearranging filter */
1870  filt = LD_UH(filter);
1871  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1872 
1873  LD_SB4(src, src_stride, src0, src1, src2, src3);
1874  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1875  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1876  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1877  vec0, vec1, vec2, vec3);
1878  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1879  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1880  ST8x4_UB(src0, src1, dst, dst_stride);
1881 }
1882 
1883 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1884  uint8_t *dst, int32_t dst_stride,
1885  const int8_t *filter, int32_t height)
1886 {
1887  v16u8 filt0;
1888  v16i8 src0, src1, src2, src3, mask, out0, out1;
1889  v8u16 vec0, vec1, vec2, vec3, filt;
1890 
1891  mask = LD_SB(&mc_filt_mask_arr[0]);
1892 
1893  /* rearranging filter */
1894  filt = LD_UH(filter);
1895  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1896 
1897  LD_SB4(src, src_stride, src0, src1, src2, src3);
1898  src += (4 * src_stride);
1899 
1900  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1901  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1902  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1903  vec0, vec1, vec2, vec3);
1904  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1905  LD_SB4(src, src_stride, src0, src1, src2, src3);
1906  src += (4 * src_stride);
1907 
1908  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1909  ST8x4_UB(out0, out1, dst, dst_stride);
1910  dst += (4 * dst_stride);
1911 
1912  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1913  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1914  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1915  vec0, vec1, vec2, vec3);
1916  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1917  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1918  ST8x4_UB(out0, out1, dst, dst_stride);
1919  dst += (4 * dst_stride);
1920 
1921  if (16 == height) {
1922  LD_SB4(src, src_stride, src0, src1, src2, src3);
1923  src += (4 * src_stride);
1924 
1925  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1926  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1927  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1928  vec0, vec1, vec2, vec3);
1929  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1930  LD_SB4(src, src_stride, src0, src1, src2, src3);
1931  src += (4 * src_stride);
1932 
1933  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1934  ST8x4_UB(out0, out1, dst, dst_stride);
1935 
1936  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1937  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1938  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1939  vec0, vec1, vec2, vec3);
1940  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1941  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1942  ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
1943  }
1944 }
1945 
1946 void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1947  const uint8_t *src, ptrdiff_t src_stride,
1948  int height, int mx, int my)
1949 {
1950  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1951 
1952  if (4 == height) {
1953  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1954  } else {
1955  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1956  height);
1957  }
1958 }
1959 
1960 void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1961  const uint8_t *src, ptrdiff_t src_stride,
1962  int height, int mx, int my)
1963 {
1964  uint32_t loop_cnt;
1965  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1966  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1967  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1968  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1969 
1970  mask = LD_SB(&mc_filt_mask_arr[0]);
1971 
1972  loop_cnt = (height >> 2) - 1;
1973 
1974  /* rearranging filter */
1975  filt = LD_UH(filter);
1976  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1977 
1978  LD_SB4(src, src_stride, src0, src2, src4, src6);
1979  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1980  src += (4 * src_stride);
1981 
1982  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1983  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1984  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1985  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1986  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1987  out0, out1, out2, out3);
1988  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1989  out4, out5, out6, out7);
1990  SRARI_H4_UH(out0, out1, out2, out3, 7);
1991  SRARI_H4_UH(out4, out5, out6, out7, 7);
1992  PCKEV_ST_SB(out0, out1, dst);
1993  dst += dst_stride;
1994  PCKEV_ST_SB(out2, out3, dst);
1995  dst += dst_stride;
1996  PCKEV_ST_SB(out4, out5, dst);
1997  dst += dst_stride;
1998  PCKEV_ST_SB(out6, out7, dst);
1999  dst += dst_stride;
2000 
2001  for (; loop_cnt--;) {
2002  LD_SB4(src, src_stride, src0, src2, src4, src6);
2003  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2004  src += (4 * src_stride);
2005 
2006  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2007  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2008  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2009  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2010  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2011  out0, out1, out2, out3);
2012  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2013  out4, out5, out6, out7);
2014  SRARI_H4_UH(out0, out1, out2, out3, 7);
2015  SRARI_H4_UH(out4, out5, out6, out7, 7);
2016  PCKEV_ST_SB(out0, out1, dst);
2017  dst += dst_stride;
2018  PCKEV_ST_SB(out2, out3, dst);
2019  dst += dst_stride;
2020  PCKEV_ST_SB(out4, out5, dst);
2021  dst += dst_stride;
2022  PCKEV_ST_SB(out6, out7, dst);
2023  dst += dst_stride;
2024  }
2025 }
2026 
2027 void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2028  const uint8_t *src, ptrdiff_t src_stride,
2029  int height, int mx, int my)
2030 {
2031  uint32_t loop_cnt;
2032  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2033  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2034  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2035  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2036 
2037  mask = LD_SB(&mc_filt_mask_arr[0]);
2038 
2039  /* rearranging filter */
2040  filt = LD_UH(filter);
2041  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2042 
2043  for (loop_cnt = height >> 1; loop_cnt--;) {
2044  src0 = LD_SB(src);
2045  src2 = LD_SB(src + 16);
2046  src3 = LD_SB(src + 24);
2047  src1 = __msa_sldi_b(src2, src0, 8);
2048  src += src_stride;
2049  src4 = LD_SB(src);
2050  src6 = LD_SB(src + 16);
2051  src7 = LD_SB(src + 24);
2052  src5 = __msa_sldi_b(src6, src4, 8);
2053  src += src_stride;
2054 
2055  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2056  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2057  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2058  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2059  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2060  out0, out1, out2, out3);
2061  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2062  out4, out5, out6, out7);
2063  SRARI_H4_UH(out0, out1, out2, out3, 7);
2064  SRARI_H4_UH(out4, out5, out6, out7, 7);
2065  PCKEV_ST_SB(out0, out1, dst);
2066  PCKEV_ST_SB(out2, out3, dst + 16);
2067  dst += dst_stride;
2068  PCKEV_ST_SB(out4, out5, dst);
2069  PCKEV_ST_SB(out6, out7, dst + 16);
2070  dst += dst_stride;
2071  }
2072 }
2073 
2074 void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2075  const uint8_t *src, ptrdiff_t src_stride,
2076  int height, int mx, int my)
2077 {
2078  uint32_t loop_cnt;
2079  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2080  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2081  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2082  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2083 
2084  mask = LD_SB(&mc_filt_mask_arr[0]);
2085 
2086  /* rearranging filter */
2087  filt = LD_UH(filter);
2088  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2089 
2090  for (loop_cnt = height; loop_cnt--;) {
2091  src0 = LD_SB(src);
2092  src2 = LD_SB(src + 16);
2093  src4 = LD_SB(src + 32);
2094  src6 = LD_SB(src + 48);
2095  src7 = LD_SB(src + 56);
2096  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
2097  src += src_stride;
2098 
2099  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2100  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2101  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2102  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2103  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2104  out0, out1, out2, out3);
2105  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2106  out4, out5, out6, out7);
2107  SRARI_H4_UH(out0, out1, out2, out3, 7);
2108  SRARI_H4_UH(out4, out5, out6, out7, 7);
2109  PCKEV_ST_SB(out0, out1, dst);
2110  PCKEV_ST_SB(out2, out3, dst + 16);
2111  PCKEV_ST_SB(out4, out5, dst + 32);
2112  PCKEV_ST_SB(out6, out7, dst + 48);
2113  dst += dst_stride;
2114  }
2115 }
2116 
2117 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
2118  uint8_t *dst, int32_t dst_stride,
2119  const int8_t *filter)
2120 {
2121  v16i8 src0, src1, src2, src3, src4;
2122  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2123  v16u8 filt0;
2124  v8i16 filt;
2125  v8u16 tmp0, tmp1;
2126 
2127  filt = LD_SH(filter);
2128  filt0 = (v16u8) __msa_splati_h(filt, 0);
2129 
2130  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2131  src += (5 * src_stride);
2132 
2133  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2134  src10_r, src21_r, src32_r, src43_r);
2135  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2136  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2137  SRARI_H2_UH(tmp0, tmp1, 7);
2138  SAT_UH2_UH(tmp0, tmp1, 7);
2139  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2140  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2141 }
2142 
2143 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
2144  uint8_t *dst, int32_t dst_stride,
2145  const int8_t *filter)
2146 {
2147  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2148  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
2149  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
2150  v8u16 tmp0, tmp1, tmp2, tmp3;
2151  v16u8 filt0;
2152  v8i16 filt;
2153 
2154  filt = LD_SH(filter);
2155  filt0 = (v16u8) __msa_splati_h(filt, 0);
2156 
2157  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2158  src += (8 * src_stride);
2159 
2160  src8 = LD_SB(src);
2161  src += src_stride;
2162 
2163  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2164  src32_r, src43_r);
2165  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2166  src76_r, src87_r);
2167  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2168  src87_r, src76_r, src2110, src4332, src6554, src8776);
2169  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2170  tmp0, tmp1, tmp2, tmp3);
2171  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2172  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2173  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2174  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2175  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2176 }
2177 
2178 void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2179  const uint8_t *src, ptrdiff_t src_stride,
2180  int height, int mx, int my)
2181 {
2182  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2183 
2184  if (4 == height) {
2185  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2186  } else if (8 == height) {
2187  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2188  }
2189 }
2190 
2191 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
2192  uint8_t *dst, int32_t dst_stride,
2193  const int8_t *filter)
2194 {
2195  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
2196  v16i8 out0, out1;
2197  v8u16 tmp0, tmp1, tmp2, tmp3;
2198  v8i16 filt;
2199 
2200  /* rearranging filter_y */
2201  filt = LD_SH(filter);
2202  filt0 = (v16u8) __msa_splati_h(filt, 0);
2203 
2204  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2205  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2206  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2207  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2208  tmp0, tmp1, tmp2, tmp3);
2209  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2210  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2211  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2212  ST8x4_UB(out0, out1, dst, dst_stride);
2213 }
2214 
2215 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2216  uint8_t *dst, int32_t dst_stride,
2217  const int8_t *filter, int32_t height)
2218 {
2219  uint32_t loop_cnt;
2220  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2221  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2222  v16i8 out0, out1;
2223  v8u16 tmp0, tmp1, tmp2, tmp3;
2224  v8i16 filt;
2225 
2226  /* rearranging filter_y */
2227  filt = LD_SH(filter);
2228  filt0 = (v16u8) __msa_splati_h(filt, 0);
2229 
2230  src0 = LD_UB(src);
2231  src += src_stride;
2232 
2233  for (loop_cnt = (height >> 3); loop_cnt--;) {
2234  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2235  src += (8 * src_stride);
2236 
2237  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2238  vec0, vec1, vec2, vec3);
2239  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2240  vec4, vec5, vec6, vec7);
2241  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2242  tmp0, tmp1, tmp2, tmp3);
2243  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2244  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2245  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2246  ST8x4_UB(out0, out1, dst, dst_stride);
2247  dst += (4 * dst_stride);
2248 
2249  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2250  tmp0, tmp1, tmp2, tmp3);
2251  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2252  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2253  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2254  ST8x4_UB(out0, out1, dst, dst_stride);
2255  dst += (4 * dst_stride);
2256 
2257  src0 = src8;
2258  }
2259 }
2260 
2261 void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2262  const uint8_t *src, ptrdiff_t src_stride,
2263  int height, int mx, int my)
2264 {
2265  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2266 
2267  if (4 == height) {
2268  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2269  } else {
2270  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2271  height);
2272  }
2273 }
2274 
2275 void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2276  const uint8_t *src, ptrdiff_t src_stride,
2277  int height, int mx, int my)
2278 {
2279  uint32_t loop_cnt;
2280  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2281  v16u8 src0, src1, src2, src3, src4;
2282  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2283  v8u16 tmp0, tmp1, tmp2, tmp3;
2284  v8i16 filt;
2285 
2286  /* rearranging filter_y */
2287  filt = LD_SH(filter);
2288  filt0 = (v16u8) __msa_splati_h(filt, 0);
2289 
2290  src0 = LD_UB(src);
2291  src += src_stride;
2292 
2293  for (loop_cnt = (height >> 2); loop_cnt--;) {
2294  LD_UB4(src, src_stride, src1, src2, src3, src4);
2295  src += (4 * src_stride);
2296 
2297  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2298  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2299  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2300  SRARI_H2_UH(tmp0, tmp1, 7);
2301  SAT_UH2_UH(tmp0, tmp1, 7);
2302  PCKEV_ST_SB(tmp0, tmp1, dst);
2303  dst += dst_stride;
2304 
2305  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2306  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2307  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2308  SRARI_H2_UH(tmp2, tmp3, 7);
2309  SAT_UH2_UH(tmp2, tmp3, 7);
2310  PCKEV_ST_SB(tmp2, tmp3, dst);
2311  dst += dst_stride;
2312 
2313  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2314  SRARI_H2_UH(tmp0, tmp1, 7);
2315  SAT_UH2_UH(tmp0, tmp1, 7);
2316  PCKEV_ST_SB(tmp0, tmp1, dst);
2317  dst += dst_stride;
2318 
2319  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2320  SRARI_H2_UH(tmp2, tmp3, 7);
2321  SAT_UH2_UH(tmp2, tmp3, 7);
2322  PCKEV_ST_SB(tmp2, tmp3, dst);
2323  dst += dst_stride;
2324 
2325  src0 = src4;
2326  }
2327 }
2328 
2329 void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2330  const uint8_t *src, ptrdiff_t src_stride,
2331  int height, int mx, int my)
2332 {
2333  uint32_t loop_cnt;
2334  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2335  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2336  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2337  v8u16 tmp0, tmp1, tmp2, tmp3;
2338  v8i16 filt;
2339 
2340  /* rearranging filter_y */
2341  filt = LD_SH(filter);
2342  filt0 = (v16u8) __msa_splati_h(filt, 0);
2343 
2344  src0 = LD_UB(src);
2345  src5 = LD_UB(src + 16);
2346  src += src_stride;
2347 
2348  for (loop_cnt = (height >> 2); loop_cnt--;) {
2349  LD_UB4(src, src_stride, src1, src2, src3, src4);
2350  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2351  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2352 
2353  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2354  src += (4 * src_stride);
2355 
2356  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2357  SRARI_H2_UH(tmp0, tmp1, 7);
2358  SAT_UH2_UH(tmp0, tmp1, 7);
2359  PCKEV_ST_SB(tmp0, tmp1, dst);
2360  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2361  SRARI_H2_UH(tmp2, tmp3, 7);
2362  SAT_UH2_UH(tmp2, tmp3, 7);
2363  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2364 
2365  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2366  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2367  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2368  SRARI_H2_UH(tmp0, tmp1, 7);
2369  SAT_UH2_UH(tmp0, tmp1, 7);
2370  PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
2371 
2372  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2373  SRARI_H2_UH(tmp2, tmp3, 7);
2374  SAT_UH2_UH(tmp2, tmp3, 7);
2375  PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
2376 
2377  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2378  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2379  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2380  SRARI_H2_UH(tmp0, tmp1, 7);
2381  SAT_UH2_UH(tmp0, tmp1, 7);
2382  PCKEV_ST_SB(tmp0, tmp1, dst + 16);
2383 
2384  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2385  SRARI_H2_UH(tmp2, tmp3, 7);
2386  SAT_UH2_UH(tmp2, tmp3, 7);
2387  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
2388 
2389  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2390  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2391  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2392  SRARI_H2_UH(tmp0, tmp1, 7);
2393  SAT_UH2_UH(tmp0, tmp1, 7);
2394  PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2395 
2396  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2397  SRARI_H2_UH(tmp2, tmp3, 7);
2398  SAT_UH2_UH(tmp2, tmp3, 7);
2399  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2400  dst += (4 * dst_stride);
2401 
2402  src0 = src4;
2403  src5 = src9;
2404  }
2405 }
2406 
2407 void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2408  const uint8_t *src, ptrdiff_t src_stride,
2409  int height, int mx, int my)
2410 {
2411  uint32_t loop_cnt;
2412  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2413  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2414  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2415  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2416  v8i16 filt;
2417 
2418  /* rearranging filter_y */
2419  filt = LD_SH(filter);
2420  filt0 = (v16u8) __msa_splati_h(filt, 0);
2421 
2422  LD_UB4(src, 16, src0, src3, src6, src9);
2423  src += src_stride;
2424 
2425  for (loop_cnt = (height >> 1); loop_cnt--;) {
2426  LD_UB2(src, src_stride, src1, src2);
2427  LD_UB2(src + 16, src_stride, src4, src5);
2428  LD_UB2(src + 32, src_stride, src7, src8);
2429  LD_UB2(src + 48, src_stride, src10, src11);
2430  src += (2 * src_stride);
2431 
2432  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2433  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2434  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2435  SRARI_H2_UH(tmp0, tmp1, 7);
2436  SAT_UH2_UH(tmp0, tmp1, 7);
2437  PCKEV_ST_SB(tmp0, tmp1, dst);
2438 
2439  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2440  SRARI_H2_UH(tmp2, tmp3, 7);
2441  SAT_UH2_UH(tmp2, tmp3, 7);
2442  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2443 
2444  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2445  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2446  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2447  SRARI_H2_UH(tmp4, tmp5, 7);
2448  SAT_UH2_UH(tmp4, tmp5, 7);
2449  PCKEV_ST_SB(tmp4, tmp5, dst + 16);
2450 
2451  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2452  SRARI_H2_UH(tmp6, tmp7, 7);
2453  SAT_UH2_UH(tmp6, tmp7, 7);
2454  PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
2455 
2456  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2457  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2458  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2459  SRARI_H2_UH(tmp0, tmp1, 7);
2460  SAT_UH2_UH(tmp0, tmp1, 7);
2461  PCKEV_ST_SB(tmp0, tmp1, dst + 32);
2462 
2463  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2464  SRARI_H2_UH(tmp2, tmp3, 7);
2465  SAT_UH2_UH(tmp2, tmp3, 7);
2466  PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
2467 
2468  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2469  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2470  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2471  SRARI_H2_UH(tmp4, tmp5, 7);
2472  SAT_UH2_UH(tmp4, tmp5, 7);
2473  PCKEV_ST_SB(tmp4, tmp5, dst + 48);
2474 
2475  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2476  SRARI_H2_UH(tmp6, tmp7, 7);
2477  SAT_UH2_UH(tmp6, tmp7, 7);
2478  PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
2479  dst += (2 * dst_stride);
2480 
2481  src0 = src2;
2482  src3 = src5;
2483  src6 = src8;
2484  src9 = src11;
2485  }
2486 }
2487 
2488 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
2489  uint8_t *dst, int32_t dst_stride,
2490  const int8_t *filter_horiz, const int8_t *filter_vert)
2491 {
2492  v16i8 src0, src1, src2, src3, src4, mask;
2493  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
2494  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
2495 
2496  mask = LD_SB(&mc_filt_mask_arr[16]);
2497 
2498  /* rearranging filter */
2499  filt = LD_UH(filter_horiz);
2500  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2501 
2502  filt = LD_UH(filter_vert);
2503  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2504 
2505  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2506  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2507  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2508  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2509  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
2510  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
2511 
2512  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2513  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
2514  SRARI_H2_UH(tmp0, tmp1, 7);
2515  SAT_UH2_UH(tmp0, tmp1, 7);
2516  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
2517  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2518 }
2519 
2520 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
2521  uint8_t *dst, int32_t dst_stride,
2522  const int8_t *filter_horiz, const int8_t *filter_vert)
2523 {
2524  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
2525  v16i8 res0, res1, res2, res3;
2526  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2527  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2528  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
2529 
2530  mask = LD_SB(&mc_filt_mask_arr[16]);
2531 
2532  /* rearranging filter */
2533  filt = LD_UH(filter_horiz);
2534  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2535 
2536  filt = LD_UH(filter_vert);
2537  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2538 
2539  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2540  src += (8 * src_stride);
2541  src8 = LD_SB(src);
2542 
2543  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2544  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2545  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
2546  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
2547  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
2548  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
2549  hz_out3, hz_out5, 8);
2550  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2551 
2552  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2553  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2554  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2555  vec4, vec5, vec6, vec7);
2556  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2557  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2558  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2559  res0, res1, res2, res3);
2560  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2561  dst += (4 * dst_stride);
2562  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
2563 }
2564 
2565 void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2566  const uint8_t *src, ptrdiff_t src_stride,
2567  int height, int mx, int my)
2568 {
2569  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2570  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2571 
2572  if (4 == height) {
2573  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2574  filter_horiz, filter_vert);
2575  } else if (8 == height) {
2576  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2577  filter_horiz, filter_vert);
2578  }
2579 }
2580 
2581 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2582  uint8_t *dst, int32_t dst_stride,
2583  const int8_t *filter_horiz, const int8_t *filter_vert)
2584 {
2585  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2586  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2587  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2588  v8i16 filt;
2589 
2590  mask = LD_SB(&mc_filt_mask_arr[0]);
2591 
2592  /* rearranging filter */
2593  filt = LD_SH(filter_horiz);
2594  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2595 
2596  filt = LD_SH(filter_vert);
2597  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2598 
2599  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2600 
2601  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2602  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2603  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2604  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2605 
2606  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2607  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2608  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2609 
2610  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2611  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2612  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2613 
2614  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2615  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2616  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2617 
2618  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2619  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2620  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2621  ST8x4_UB(out0, out1, dst, dst_stride);
2622 }
2623 
2624 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2625  uint8_t *dst, int32_t dst_stride,
2626  const int8_t *filter_horiz, const int8_t *filter_vert,
2627  int32_t height)
2628 {
2629  uint32_t loop_cnt;
2630  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2631  v16u8 filt_hz, filt_vt, vec0;
2632  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2633  v8i16 filt;
2634 
2635  mask = LD_SB(&mc_filt_mask_arr[0]);
2636 
2637  /* rearranging filter */
2638  filt = LD_SH(filter_horiz);
2639  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2640 
2641  filt = LD_SH(filter_vert);
2642  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2643 
2644  src0 = LD_SB(src);
2645  src += src_stride;
2646 
2647  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2648 
2649  for (loop_cnt = (height >> 3); loop_cnt--;) {
2650  LD_SB4(src, src_stride, src1, src2, src3, src4);
2651  src += (4 * src_stride);
2652 
2653  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2654  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2655  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2656 
2657  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2658  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2659  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2660 
2661  SRARI_H2_UH(tmp1, tmp2, 7);
2662  SAT_UH2_UH(tmp1, tmp2, 7);
2663 
2664  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2665  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2666  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2667 
2668  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2669  LD_SB4(src, src_stride, src1, src2, src3, src4);
2670  src += (4 * src_stride);
2671  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2672  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2673 
2674  SRARI_H2_UH(tmp3, tmp4, 7);
2675  SAT_UH2_UH(tmp3, tmp4, 7);
2676  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2677  ST8x4_UB(out0, out1, dst, dst_stride);
2678  dst += (4 * dst_stride);
2679 
2680  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2681  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2682  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2683 
2684  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2685  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2686  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2687 
2688  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2689  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2690  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2691 
2692  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2693  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2694  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2695 
2696  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2697  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2698  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2699  ST8x4_UB(out0, out1, dst, dst_stride);
2700  dst += (4 * dst_stride);
2701  }
2702 }
2703 
2704 void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2705  const uint8_t *src, ptrdiff_t src_stride,
2706  int height, int mx, int my)
2707 {
2708  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2709  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2710 
2711  if (4 == height) {
2712  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2713  filter_horiz, filter_vert);
2714  } else {
2715  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2716  filter_horiz, filter_vert, height);
2717  }
2718 }
2719 
2720 void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2721  const uint8_t *src, ptrdiff_t src_stride,
2722  int height, int mx, int my)
2723 {
2724  uint32_t loop_cnt;
2725  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2726  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2727  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2728  v16u8 filt_hz, filt_vt, vec0, vec1;
2729  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2730  v8i16 filt;
2731 
2732  mask = LD_SB(&mc_filt_mask_arr[0]);
2733 
2734  /* rearranging filter */
2735  filt = LD_SH(filter_horiz);
2736  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2737 
2738  filt = LD_SH(filter_vert);
2739  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2740 
2741  LD_SB2(src, 8, src0, src1);
2742  src += src_stride;
2743 
2744  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2745  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2746 
2747 
2748  for (loop_cnt = (height >> 2); loop_cnt--;) {
2749  LD_SB4(src, src_stride, src0, src2, src4, src6);
2750  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2751  src += (4 * src_stride);
2752 
2753  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2754  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2755  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2756  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2757  SRARI_H2_UH(tmp1, tmp2, 7);
2758  SAT_UH2_UH(tmp1, tmp2, 7);
2759  PCKEV_ST_SB(tmp1, tmp2, dst);
2760  dst += dst_stride;
2761 
2762  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2763  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2764  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2765  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2766  SRARI_H2_UH(tmp1, tmp2, 7);
2767  SAT_UH2_UH(tmp1, tmp2, 7);
2768  PCKEV_ST_SB(tmp1, tmp2, dst);
2769  dst += dst_stride;
2770 
2771  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2772  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2773  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2774  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2775  SRARI_H2_UH(tmp1, tmp2, 7);
2776  SAT_UH2_UH(tmp1, tmp2, 7);
2777  PCKEV_ST_SB(tmp1, tmp2, dst);
2778  dst += dst_stride;
2779 
2780  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2781  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2782  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2783  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2784  SRARI_H2_UH(tmp1, tmp2, 7);
2785  SAT_UH2_UH(tmp1, tmp2, 7);
2786  PCKEV_ST_SB(tmp1, tmp2, dst);
2787  dst += dst_stride;
2788  }
2789 }
2790 
/**
 * 32-wide bilinear put with both horizontal and vertical interpolation.
 *
 * Implemented as two side-by-side calls to ff_put_bilin_16hv_msa(), each
 * covering a 16-pixel-wide column of the block.
 *
 * @param dst         destination block
 * @param dst_stride  byte distance between destination rows
 * @param src         source block
 * @param src_stride  byte distance between source rows
 * @param height      number of rows, forwarded unchanged
 * @param mx          horizontal filter selector, forwarded unchanged
 * @param my          vertical filter selector, forwarded unchanged
 */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t multiple8_cnt;

    /* two 16-pixel columns */
    for (multiple8_cnt = 2; multiple8_cnt--;) {
        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);

        src += 16;
        dst += 16;
    }
}
2804 
/**
 * 64-wide bilinear put with both horizontal and vertical interpolation.
 *
 * Implemented as four side-by-side calls to ff_put_bilin_16hv_msa(), each
 * covering a 16-pixel-wide column of the block.
 *
 * @param dst         destination block
 * @param dst_stride  byte distance between destination rows
 * @param src         source block
 * @param src_stride  byte distance between source rows
 * @param height      number of rows, forwarded unchanged
 * @param mx          horizontal filter selector, forwarded unchanged
 * @param my          vertical filter selector, forwarded unchanged
 */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t multiple8_cnt;

    /* four 16-pixel columns */
    for (multiple8_cnt = 4; multiple8_cnt--;) {
        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);

        src += 16;
        dst += 16;
    }
}
2818 
2820  int32_t src_stride,
2821  uint8_t *dst, int32_t dst_stride,
2822  const int8_t *filter)
2823 {
2824  uint32_t tp0, tp1, tp2, tp3;
2825  v16i8 src0, src1, src2, src3, mask;
2826  v16u8 filt0, dst0, vec0, vec1, res;
2827  v8u16 vec2, vec3, filt;
2828 
2829  mask = LD_SB(&mc_filt_mask_arr[16]);
2830 
2831  /* rearranging filter */
2832  filt = LD_UH(filter);
2833  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2834 
2835  LD_SB4(src, src_stride, src0, src1, src2, src3);
2836  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2837  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2838  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2839  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
2840  SRARI_H2_UH(vec2, vec3, 7);
2841 
2842  res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
2843  res = (v16u8) __msa_aver_u_b(res, dst0);
2844 
2845  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
2846 }
2847 
2849  int32_t src_stride,
2850  uint8_t *dst, int32_t dst_stride,
2851  const int8_t *filter)
2852 {
2853  uint32_t tp0, tp1, tp2, tp3;
2854  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2855  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
2856  v16u8 dst0, dst1;
2857  v8u16 vec4, vec5, vec6, vec7, filt;
2858 
2859  mask = LD_SB(&mc_filt_mask_arr[16]);
2860 
2861  /* rearranging filter */
2862  filt = LD_UH(filter);
2863  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2864 
2865  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2866  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2867  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2868  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
2869  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
2870  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2871  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
2872  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
2873  vec6, vec7);
2874  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2875  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
2876  res2, res3);
2877  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
2878  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
2879  ST4x8_UB(res0, res2, dst, dst_stride);
2880 }
2881 
2882 void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2883  const uint8_t *src, ptrdiff_t src_stride,
2884  int height, int mx, int my)
2885 {
2886  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2887 
2888  if (4 == height) {
2889  common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2890  filter);
2891  } else if (8 == height) {
2892  common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2893  filter);
2894  }
2895 }
2896 
2898  int32_t src_stride,
2899  uint8_t *dst, int32_t dst_stride,
2900  const int8_t *filter)
2901 {
2902  int64_t tp0, tp1, tp2, tp3;
2903  v16i8 src0, src1, src2, src3, mask;
2904  v16u8 filt0, dst0, dst1;
2905  v8u16 vec0, vec1, vec2, vec3, filt;
2906 
2907  mask = LD_SB(&mc_filt_mask_arr[0]);
2908 
2909  /* rearranging filter */
2910  filt = LD_UH(filter);
2911  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2912 
2913  LD_SB4(src, src_stride, src0, src1, src2, src3);
2914  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2915  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2916  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2917  vec0, vec1, vec2, vec3);
2918  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2919  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2920  INSERT_D2_UB(tp0, tp1, dst0);
2921  INSERT_D2_UB(tp2, tp3, dst1);
2922  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2923 }
2924 
2926  int32_t src_stride,
2927  uint8_t *dst,
2928  int32_t dst_stride,
2929  const int8_t *filter,
2930  int32_t height)
2931 {
2932  int64_t tp0, tp1, tp2, tp3;
2933  v16i8 src0, src1, src2, src3, mask;
2934  v16u8 filt0, dst0, dst1;
2935  v8u16 vec0, vec1, vec2, vec3, filt;
2936 
2937  mask = LD_SB(&mc_filt_mask_arr[0]);
2938 
2939  /* rearranging filter */
2940  filt = LD_UH(filter);
2941  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2942 
2943  LD_SB4(src, src_stride, src0, src1, src2, src3);
2944  src += (4 * src_stride);
2945  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2946  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2947  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2948  vec2, vec3);
2949  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2950  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2951  INSERT_D2_UB(tp0, tp1, dst0);
2952  INSERT_D2_UB(tp2, tp3, dst1);
2953  LD_SB4(src, src_stride, src0, src1, src2, src3);
2954  src += (4 * src_stride);
2955  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2956  dst += (4 * dst_stride);
2957 
2958  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2959  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2960  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2961  vec2, vec3);
2962  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2963  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2964  INSERT_D2_UB(tp0, tp1, dst0);
2965  INSERT_D2_UB(tp2, tp3, dst1);
2966  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2967  dst += (4 * dst_stride);
2968 
2969  if (16 == height) {
2970  LD_SB4(src, src_stride, src0, src1, src2, src3);
2971  src += (4 * src_stride);
2972 
2973  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2974  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2975  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2976  vec1, vec2, vec3);
2977  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2978  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2979  INSERT_D2_UB(tp0, tp1, dst0);
2980  INSERT_D2_UB(tp2, tp3, dst1);
2981  LD_SB4(src, src_stride, src0, src1, src2, src3);
2982  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2983  dst += (4 * dst_stride);
2984 
2985  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2986  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2987  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2988  vec1, vec2, vec3);
2989  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2990  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2991  INSERT_D2_UB(tp0, tp1, dst0);
2992  INSERT_D2_UB(tp2, tp3, dst1);
2993  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2994  }
2995 }
2996 
2997 void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2998  const uint8_t *src, ptrdiff_t src_stride,
2999  int height, int mx, int my)
3000 {
3001  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3002 
3003  if (4 == height) {
3004  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3005  filter);
3006  } else {
3007  common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3008  filter, height);
3009  }
3010 }
3011 
3012 void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3013  const uint8_t *src, ptrdiff_t src_stride,
3014  int height, int mx, int my)
3015 {
3016  uint32_t loop_cnt;
3017  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3018  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3019  v16u8 filt0, dst0, dst1, dst2, dst3;
3020  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3021  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3022 
3023  mask = LD_SB(&mc_filt_mask_arr[0]);
3024 
3025  /* rearranging filter */
3026  filt = LD_UH(filter);
3027  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3028 
3029  LD_SB4(src, src_stride, src0, src2, src4, src6);
3030  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3031  src += (4 * src_stride);
3032 
3033  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3034  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3035  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3036  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3037  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3038  res2, res3);
3039  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3040  res6, res7);
3041  SRARI_H4_UH(res0, res1, res2, res3, 7);
3042  SRARI_H4_UH(res4, res5, res6, res7, 7);
3043  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3044  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3045  dst += dst_stride;
3046  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3047  dst += dst_stride;
3048  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3049  dst += dst_stride;
3050  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3051  dst += dst_stride;
3052 
3053  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3054  LD_SB4(src, src_stride, src0, src2, src4, src6);
3055  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3056  src += (4 * src_stride);
3057 
3058  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3059  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3060  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3061  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3062  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3063  res1, res2, res3);
3064  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3065  res5, res6, res7);
3066  SRARI_H4_UH(res0, res1, res2, res3, 7);
3067  SRARI_H4_UH(res4, res5, res6, res7, 7);
3068  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3069  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3070  dst += dst_stride;
3071  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3072  dst += dst_stride;
3073  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3074  dst += dst_stride;
3075  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3076  dst += dst_stride;
3077  }
3078 }
3079 
3080 void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3081  const uint8_t *src, ptrdiff_t src_stride,
3082  int height, int mx, int my)
3083 {
3084  uint32_t loop_cnt;
3085  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3086  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3087  v16u8 filt0, dst0, dst1, dst2, dst3;
3088  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3089  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3090 
3091  mask = LD_SB(&mc_filt_mask_arr[0]);
3092 
3093  /* rearranging filter */
3094  filt = LD_UH(filter);
3095  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3096 
3097  for (loop_cnt = (height >> 1); loop_cnt--;) {
3098  src0 = LD_SB(src);
3099  src2 = LD_SB(src + 16);
3100  src3 = LD_SB(src + 24);
3101  src1 = __msa_sldi_b(src2, src0, 8);
3102  src += src_stride;
3103  src4 = LD_SB(src);
3104  src6 = LD_SB(src + 16);
3105  src7 = LD_SB(src + 24);
3106  src5 = __msa_sldi_b(src6, src4, 8);
3107  src += src_stride;
3108 
3109  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3110  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3111  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3112  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3113  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3114  res0, res1, res2, res3);
3115  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3116  res4, res5, res6, res7);
3117  SRARI_H4_UH(res0, res1, res2, res3, 7);
3118  SRARI_H4_UH(res4, res5, res6, res7, 7);
3119  LD_UB2(dst, 16, dst0, dst1);
3120  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3121  PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
3122  dst += dst_stride;
3123  LD_UB2(dst, 16, dst2, dst3);
3124  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3125  PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
3126  dst += dst_stride;
3127  }
3128 }
3129 
3130 void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3131  const uint8_t *src, ptrdiff_t src_stride,
3132  int height, int mx, int my)
3133 {
3134  uint32_t loop_cnt;
3135  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3136  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3137  v16u8 filt0, dst0, dst1, dst2, dst3;
3138  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3139  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3140 
3141  mask = LD_SB(&mc_filt_mask_arr[0]);
3142 
3143  /* rearranging filter */
3144  filt = LD_UH(filter);
3145  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3146 
3147  for (loop_cnt = height; loop_cnt--;) {
3148  LD_SB4(src, 16, src0, src2, src4, src6);
3149  src7 = LD_SB(src + 56);
3150  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
3151  src += src_stride;
3152 
3153  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3154  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3155  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3156  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3157  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3158  out0, out1, out2, out3);
3159  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3160  out4, out5, out6, out7);
3161  SRARI_H4_UH(out0, out1, out2, out3, 7);
3162  SRARI_H4_UH(out4, out5, out6, out7, 7);
3163  LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3164  PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
3165  PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
3166  PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
3167  PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
3168  dst += dst_stride;
3169  }
3170 }
3171 
3173  int32_t src_stride,
3174  uint8_t *dst, int32_t dst_stride,
3175  const int8_t *filter)
3176 {
3177  uint32_t tp0, tp1, tp2, tp3;
3178  v16i8 src0, src1, src2, src3, src4;
3179  v16u8 dst0, out, filt0, src2110, src4332;
3180  v16i8 src10_r, src32_r, src21_r, src43_r;
3181  v8i16 filt;
3182  v8u16 tmp0, tmp1;
3183 
3184  filt = LD_SH(filter);
3185  filt0 = (v16u8) __msa_splati_h(filt, 0);
3186 
3187  LD_SB4(src, src_stride, src0, src1, src2, src3);
3188  src += (4 * src_stride);
3189 
3190  src4 = LD_SB(src);
3191  src += src_stride;
3192 
3193  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3194  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3195  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3196  src10_r, src21_r, src32_r, src43_r);
3197  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3198  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3199  SRARI_H2_UH(tmp0, tmp1, 7);
3200  SAT_UH2_UH(tmp0, tmp1, 7);
3201 
3202  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3203  out = __msa_aver_u_b(out, dst0);
3204 
3205  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3206 }
3207 
3209  int32_t src_stride,
3210  uint8_t *dst, int32_t dst_stride,
3211  const int8_t *filter)
3212 {
3213  uint32_t tp0, tp1, tp2, tp3;
3214  v16u8 dst0, dst1;
3215  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3216  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3217  v16u8 src2110, src4332, src6554, src8776, filt0;
3218  v8u16 tmp0, tmp1, tmp2, tmp3;
3219  v8i16 filt;
3220 
3221  filt = LD_SH(filter);
3222  filt0 = (v16u8) __msa_splati_h(filt, 0);
3223 
3224  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3225  src += (8 * src_stride);
3226  src8 = LD_SB(src);
3227 
3228  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3229  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3230  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3231  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3232  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3233  src32_r, src43_r);
3234  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3235  src76_r, src87_r);
3236  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3237  src87_r, src76_r, src2110, src4332, src6554, src8776);
3238  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3239  tmp0, tmp1, tmp2, tmp3);
3240  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3241  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3242  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3243  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3244  ST4x8_UB(src2110, src4332, dst, dst_stride);
3245 }
3246 
3247 void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3248  const uint8_t *src, ptrdiff_t src_stride,
3249  int height, int mx, int my)
3250 {
3251  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3252 
3253  if (4 == height) {
3254  common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3255  filter);
3256  } else if (8 == height) {
3257  common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3258  filter);
3259  }
3260 }
3261 
3263  int32_t src_stride,
3264  uint8_t *dst,
3265  int32_t dst_stride,
3266  const int8_t *filter)
3267 {
3268  int64_t tp0, tp1, tp2, tp3;
3269  v16u8 src0, src1, src2, src3, src4;
3270  v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
3271  v8u16 tmp0, tmp1, tmp2, tmp3;
3272  v8i16 filt;
3273 
3274  /* rearranging filter_y */
3275  filt = LD_SH(filter);
3276  filt0 = (v16u8) __msa_splati_h(filt, 0);
3277 
3278  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3279  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3280  INSERT_D2_UB(tp0, tp1, dst0);
3281  INSERT_D2_UB(tp2, tp3, dst1);
3282  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3283  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3284  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3285  tmp0, tmp1, tmp2, tmp3);
3286  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3287  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3288  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3289 }
3290 
3292  int32_t src_stride,
3293  uint8_t *dst,
3294  int32_t dst_stride,
3295  const int8_t *filter,
3296  int32_t height)
3297 {
3298  uint32_t loop_cnt;
3299  int64_t tp0, tp1, tp2, tp3;
3300  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3301  v16u8 dst0, dst1, dst2, dst3;
3302  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3303  v8u16 tmp0, tmp1, tmp2, tmp3;
3304  v8i16 filt;
3305 
3306  /* rearranging filter_y */
3307  filt = LD_SH(filter);
3308  filt0 = (v16u8) __msa_splati_h(filt, 0);
3309 
3310  src0 = LD_UB(src);
3311  src += src_stride;
3312 
3313  for (loop_cnt = (height >> 3); loop_cnt--;) {
3314  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3315  src += (8 * src_stride);
3316 
3317  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3318  INSERT_D2_UB(tp0, tp1, dst0);
3319  INSERT_D2_UB(tp2, tp3, dst1);
3320  LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3321  INSERT_D2_UB(tp0, tp1, dst2);
3322  INSERT_D2_UB(tp2, tp3, dst3);
3323 
3324  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3325  vec0, vec1, vec2, vec3);
3326  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3327  vec4, vec5, vec6, vec7);
3328  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3329  tmp0, tmp1, tmp2, tmp3);
3330  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3331  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3332  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3333  dst += (4 * dst_stride);
3334 
3335  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3336  tmp0, tmp1, tmp2, tmp3);
3337  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3338  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3339  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
3340  dst += (4 * dst_stride);
3341 
3342  src0 = src8;
3343  }
3344 }
3345 
3346 void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3347  const uint8_t *src, ptrdiff_t src_stride,
3348  int height, int mx, int my)
3349 {
3350  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3351 
3352  if (4 == height) {
3353  common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3354  filter);
3355  } else {
3356  common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3357  filter, height);
3358  }
3359 }
3360 
3361 void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3362  const uint8_t *src, ptrdiff_t src_stride,
3363  int height, int mx, int my)
3364 {
3365  uint32_t loop_cnt;
3366  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3367  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3368  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3369  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3370 
3371  /* rearranging filter_y */
3372  filt = LD_UH(filter);
3373  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3374 
3375  src0 = LD_UB(src);
3376  src += src_stride;
3377 
3378  for (loop_cnt = (height >> 2); loop_cnt--;) {
3379  LD_UB4(src, src_stride, src1, src2, src3, src4);
3380  src += (4 * src_stride);
3381 
3382  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3383  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3384  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3385  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3386  SRARI_H2_UH(tmp0, tmp1, 7);
3387  SAT_UH2_UH(tmp0, tmp1, 7);
3388  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3389  dst += dst_stride;
3390 
3391  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3392  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3393  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3394  SRARI_H2_UH(tmp2, tmp3, 7);
3395  SAT_UH2_UH(tmp2, tmp3, 7);
3396  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
3397  dst += dst_stride;
3398 
3399  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3400  SRARI_H2_UH(tmp0, tmp1, 7);
3401  SAT_UH2_UH(tmp0, tmp1, 7);
3402  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3403  dst += dst_stride;
3404 
3405  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3406  SRARI_H2_UH(tmp2, tmp3, 7);
3407  SAT_UH2_UH(tmp2, tmp3, 7);
3408  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
3409  dst += dst_stride;
3410 
3411  src0 = src4;
3412  }
3413 }
3414 
3415 void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3416  const uint8_t *src, ptrdiff_t src_stride,
3417  int height, int mx, int my)
3418 {
3419  uint32_t loop_cnt;
3420  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3421  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3422  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3423  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3424  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3425 
3426  /* rearranging filter_y */
3427  filt = LD_UH(filter);
3428  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3429 
3430  LD_UB2(src, 16, src0, src5);
3431  src += src_stride;
3432 
3433  for (loop_cnt = (height >> 2); loop_cnt--;) {
3434  LD_UB4(src, src_stride, src1, src2, src3, src4);
3435  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3436  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3437  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3438 
3439  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3440  LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3441  src += (4 * src_stride);
3442 
3443  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3444  SRARI_H2_UH(tmp0, tmp1, 7);
3445  SAT_UH2_UH(tmp0, tmp1, 7);
3446  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3447 
3448  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3449  SRARI_H2_UH(tmp2, tmp3, 7);
3450  SAT_UH2_UH(tmp2, tmp3, 7);
3451  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3452 
3453  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3454  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3455  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3456  SRARI_H2_UH(tmp0, tmp1, 7);
3457  SAT_UH2_UH(tmp0, tmp1, 7);
3458  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
3459 
3460  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3461  SRARI_H2_UH(tmp2, tmp3, 7);
3462  SAT_UH2_UH(tmp2, tmp3, 7);
3463  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
3464 
3465  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3466  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3467  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3468  SRARI_H2_UH(tmp0, tmp1, 7);
3469  SAT_UH2_UH(tmp0, tmp1, 7);
3470  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
3471 
3472  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3473  SRARI_H2_UH(tmp2, tmp3, 7);
3474  SAT_UH2_UH(tmp2, tmp3, 7);
3475  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
3476 
3477  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3478  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3479  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3480  SRARI_H2_UH(tmp0, tmp1, 7);
3481  SAT_UH2_UH(tmp0, tmp1, 7);
3482  PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
3483 
3484  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3485  SRARI_H2_UH(tmp2, tmp3, 7);
3486  SAT_UH2_UH(tmp2, tmp3, 7);
3487  PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
3488  dst += (4 * dst_stride);
3489 
3490  src0 = src4;
3491  src5 = src9;
3492  }
3493 }
3494 
3495 void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3496  const uint8_t *src, ptrdiff_t src_stride,
3497  int height, int mx, int my)
3498 {
3499  uint32_t loop_cnt;
3500  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3501  v16u8 src0, src1, src2, src3, src4, src5;
3502  v16u8 src6, src7, src8, src9, src10, src11, filt0;
3503  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3504  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3505  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3506  v8u16 filt;
3507 
3508  /* rearranging filter_y */
3509  filt = LD_UH(filter);
3510  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3511 
3512  LD_UB4(src, 16, src0, src3, src6, src9);
3513  src += src_stride;
3514 
3515  for (loop_cnt = (height >> 1); loop_cnt--;) {
3516  LD_UB2(src, src_stride, src1, src2);
3517  LD_UB2(dst, dst_stride, dst0, dst1);
3518  LD_UB2(src + 16, src_stride, src4, src5);
3519  LD_UB2(dst + 16, dst_stride, dst2, dst3);
3520  LD_UB2(src + 32, src_stride, src7, src8);
3521  LD_UB2(dst + 32, dst_stride, dst4, dst5);
3522  LD_UB2(src + 48, src_stride, src10, src11);
3523  LD_UB2(dst + 48, dst_stride, dst6, dst7);
3524  src += (2 * src_stride);
3525 
3526  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3527  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3528  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3529  SRARI_H2_UH(tmp0, tmp1, 7);
3530  SAT_UH2_UH(tmp0, tmp1, 7);
3531  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3532 
3533  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3534  SRARI_H2_UH(tmp2, tmp3, 7);
3535  SAT_UH2_UH(tmp2, tmp3, 7);
3536  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3537 
3538  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3539  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3540  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3541  SRARI_H2_UH(tmp4, tmp5, 7);
3542  SAT_UH2_UH(tmp4, tmp5, 7);
3543  PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
3544 
3545  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3546  SRARI_H2_UH(tmp6, tmp7, 7);
3547  SAT_UH2_UH(tmp6, tmp7, 7);
3548  PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
3549 
3550  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3551  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3552  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3553  SRARI_H2_UH(tmp0, tmp1, 7);
3554  SAT_UH2_UH(tmp0, tmp1, 7);
3555  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
3556 
3557  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3558  SRARI_H2_UH(tmp2, tmp3, 7);
3559  SAT_UH2_UH(tmp2, tmp3, 7);
3560  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
3561 
3562  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3563  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3564  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3565  SRARI_H2_UH(tmp4, tmp5, 7);
3566  SAT_UH2_UH(tmp4, tmp5, 7);
3567  PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
3568 
3569  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3570  SRARI_H2_UH(tmp6, tmp7, 7);
3571  SAT_UH2_UH(tmp6, tmp7, 7);
3572  PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
3573  dst += (2 * dst_stride);
3574 
3575  src0 = src2;
3576  src3 = src5;
3577  src6 = src8;
3578  src9 = src11;
3579  }
3580 }
3581 
3583  int32_t src_stride,
3584  uint8_t *dst,
3585  int32_t dst_stride,
3586  const int8_t *filter_horiz,
3587  const int8_t *filter_vert)
3588 {
3589  uint32_t tp0, tp1, tp2, tp3;
3590  v16i8 src0, src1, src2, src3, src4, mask;
3591  v16u8 filt_hz, filt_vt, vec0, vec1;
3592  v16u8 dst0, out;
3593  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3594 
3595  mask = LD_SB(&mc_filt_mask_arr[16]);
3596 
3597  /* rearranging filter */
3598  filt = LD_UH(filter_horiz);
3599  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3600 
3601  filt = LD_UH(filter_vert);
3602  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3603 
3604  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3605 
3606  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3607  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3608  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3609  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3610  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3611  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3612 
3613  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3614  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3615 
3616  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3617  SRARI_H2_UH(tmp0, tmp1, 7);
3618  SAT_UH2_UH(tmp0, tmp1, 7);
3619 
3620  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3621  out = __msa_aver_u_b(out, dst0);
3622 
3623  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3624 }
3625 
3627  int32_t src_stride,
3628  uint8_t *dst,
3629  int32_t dst_stride,
3630  const int8_t *filter_horiz,
3631  const int8_t *filter_vert)
3632 {
3633  uint32_t tp0, tp1, tp2, tp3;
3634  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3635  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
3636  v16u8 dst0, dst1;
3637  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3638  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3639  v8i16 filt;
3640 
3641  mask = LD_SB(&mc_filt_mask_arr[16]);
3642 
3643  /* rearranging filter */
3644  filt = LD_SH(filter_horiz);
3645  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3646 
3647  filt = LD_SH(filter_vert);
3648  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3649 
3650  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3651  src += (8 * src_stride);
3652  src8 = LD_SB(src);
3653 
3654  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3655  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3656  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
3657  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
3658  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
3659  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
3660  hz_out3, hz_out5, 8);
3661  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3662 
3663  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3664  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3665  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3666  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3667  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3668  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3669  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3670  tmp0, tmp1, tmp2, tmp3);
3671  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3672  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3673  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
3674  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
3675  ST4x8_UB(res0, res1, dst, dst_stride);
3676 }
3677 
3678 void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3679  const uint8_t *src, ptrdiff_t src_stride,
3680  int height, int mx, int my)
3681 {
3682  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3683  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3684 
3685  if (4 == height) {
3686  common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3687  filter_horiz, filter_vert);
3688  } else if (8 == height) {
3689  common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3690  filter_horiz, filter_vert);
3691  }
3692 }
3693 
3695  int32_t src_stride,
3696  uint8_t *dst,
3697  int32_t dst_stride,
3698  const int8_t *filter_horiz,
3699  const int8_t *filter_vert)
3700 {
3701  uint64_t tp0, tp1, tp2, tp3;
3702  v16i8 src0, src1, src2, src3, src4, mask;
3703  v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
3704  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3705  v8i16 filt;
3706 
3707  mask = LD_SB(&mc_filt_mask_arr[0]);
3708 
3709  /* rearranging filter */
3710  filt = LD_SH(filter_horiz);
3711  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3712 
3713  filt = LD_SH(filter_vert);
3714  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3715 
3716  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3717  src += (5 * src_stride);
3718 
3719  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3720  INSERT_D2_UB(tp0, tp1, dst0);
3721  INSERT_D2_UB(tp2, tp3, dst1);
3722  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3723  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3724  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3725  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3726 
3727  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3728  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3729  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3730 
3731  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3732  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3733  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3734 
3735  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3736  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3737  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3738 
3739  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3740  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3741  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3742 }
3743 
3745  int32_t src_stride,
3746  uint8_t *dst,
3747  int32_t dst_stride,
3748  const int8_t *filter_horiz,
3749  const int8_t *filter_vert,
3750  int32_t height)
3751 {
3752  uint32_t loop_cnt;
3753  uint64_t tp0, tp1, tp2, tp3;
3754  v16i8 src0, src1, src2, src3, src4, mask;
3755  v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
3756  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3757  v8i16 filt;
3758 
3759  mask = LD_SB(&mc_filt_mask_arr[0]);
3760 
3761  /* rearranging filter */
3762  filt = LD_SH(filter_horiz);
3763  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3764 
3765  filt = LD_SH(filter_vert);
3766  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3767 
3768  src0 = LD_SB(src);
3769  src += src_stride;
3770 
3771  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3772 
3773  for (loop_cnt = (height >> 2); loop_cnt--;) {
3774  LD_SB4(src, src_stride, src1, src2, src3, src4);
3775  src += (4 * src_stride);
3776 
3777  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3778  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3779  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3780 
3781  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3782  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3783  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3784 
3785  SRARI_H2_UH(tmp0, tmp1, 7);
3786  SAT_UH2_UH(tmp0, tmp1, 7);
3787 
3788  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3789  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3790  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3791 
3792  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3793  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3794  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3795 
3796  SRARI_H2_UH(tmp2, tmp3, 7);
3797  SAT_UH2_UH(tmp2, tmp3, 7);
3798  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3799  INSERT_D2_UB(tp0, tp1, dst0);
3800  INSERT_D2_UB(tp2, tp3, dst1);
3801  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3802  dst += (4 * dst_stride);
3803  }
3804 }
3805 
3806 void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3807  const uint8_t *src, ptrdiff_t src_stride,
3808  int height, int mx, int my)
3809 {
3810  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3811  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3812 
3813  if (4 == height) {
3814  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3815  filter_horiz, filter_vert);
3816  } else {
3818  dst, dst_stride,
3819  filter_horiz, filter_vert,
3820  height);
3821  }
3822 }
3823 
3824 void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3825  const uint8_t *src, ptrdiff_t src_stride,
3826  int height, int mx, int my)
3827 {
3828  uint32_t loop_cnt;
3829  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3830  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3831  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3832  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3833  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3834  v8i16 filt;
3835 
3836  mask = LD_SB(&mc_filt_mask_arr[0]);
3837 
3838  /* rearranging filter */
3839  filt = LD_SH(filter_horiz);
3840  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3841 
3842  filt = LD_SH(filter_vert);
3843  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3844 
3845  LD_SB2(src, 8, src0, src1);
3846  src += src_stride;
3847 
3848  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3849  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3850 
3851  for (loop_cnt = (height >> 2); loop_cnt--;) {
3852  LD_SB4(src, src_stride, src0, src2, src4, src6);
3853  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3854  src += (4 * src_stride);
3855  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3856 
3857  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3858  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3859  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3860  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3861  SRARI_H2_UH(tmp0, tmp1, 7);
3862  SAT_UH2_UH(tmp0, tmp1, 7);
3863  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3864  dst += dst_stride;
3865 
3866  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3867  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3868  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3869  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3870  SRARI_H2_UH(tmp0, tmp1, 7);
3871  SAT_UH2_UH(tmp0, tmp1, 7);
3872  PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
3873  dst += dst_stride;
3874 
3875  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3876  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
3877  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3878  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3879  SRARI_H2_UH(tmp0, tmp1, 7);
3880  SAT_UH2_UH(tmp0, tmp1, 7);
3881  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3882  dst += dst_stride;
3883 
3884  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
3885  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
3886  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3887  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3888  SRARI_H2_UH(tmp0, tmp1, 7);
3889  SAT_UH2_UH(tmp0, tmp1, 7);
3890  PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
3891  dst += dst_stride;
3892  }
3893 }
3894 
/*
 * 32-pixel-wide horizontal + vertical bilinear "avg" filter, done as two
 * side-by-side 16-pixel columns via ff_avg_bilin_16hv_msa.
 */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3908 
/*
 * 64-pixel-wide horizontal + vertical bilinear "avg" filter, done as four
 * side-by-side 16-pixel columns via ff_avg_bilin_16hv_msa.
 */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3922 
/*
 * Copy an 8-byte-wide block from src to dst.
 * Heights that are a multiple of 8 move eight rows per step, other multiples
 * of 4 move four rows per step; any other height copies nothing.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t batches;
    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;

    if (0 != height % 4) {
        return;
    }

    if (0 != height % 8) {
        /* four rows per step */
        batches = height / 4;
        while (batches--) {
            LD4(src, src_stride, r0, r1, r2, r3);
            src += (4 * src_stride);

            SD4(r0, r1, r2, r3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
        return;
    }

    /* eight rows per step: both loads are issued before the stores */
    batches = height >> 3;
    while (batches--) {
        LD4(src, src_stride, r0, r1, r2, r3);
        src += (4 * src_stride);
        LD4(src, src_stride, r4, r5, r6, r7);
        src += (4 * src_stride);

        SD4(r0, r1, r2, r3, dst, dst_stride);
        dst += (4 * dst_stride);
        SD4(r4, r5, r6, r7, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3952 
3953 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
3954  uint8_t *dst, int32_t dst_stride,
3955  int32_t height)
3956 {
3957  int32_t cnt;
3958  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3959 
3960  if (8 == height) {
3961  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3962  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3963  } else if (16 == height) {
3964  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3965  src += (8 * src_stride);
3966  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3967  dst += (8 * dst_stride);
3968  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3969  src += (8 * src_stride);
3970  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3971  dst += (8 * dst_stride);
3972  } else if (32 == height) {
3973  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3974  src += (8 * src_stride);
3975  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3976  dst += (8 * dst_stride);
3977  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3978  src += (8 * src_stride);
3979  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3980  dst += (8 * dst_stride);
3981  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3982  src += (8 * src_stride);
3983  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3984  dst += (8 * dst_stride);
3985  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3986  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3987  } else if (0 == height % 4) {
3988  for (cnt = (height >> 2); cnt--;) {
3989  LD_UB4(src, src_stride, src0, src1, src2, src3);
3990  src += (4 * src_stride);
3991  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3992  dst += (4 * dst_stride);
3993  }
3994  }
3995 }
3996 
3997 static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
3998  uint8_t *dst, int32_t dst_stride,
3999  int32_t height)
4000 {
4001  int32_t cnt;
4002  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4003 
4004  if (0 == height % 8) {
4005  for (cnt = (height >> 3); cnt--;) {
4006  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4007  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4008  LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4009  src7);
4010  src += (8 * src_stride);
4011  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4012  dst_stride);
4013  dst += (8 * dst_stride);
4014  }
4015  } else if (0 == height % 4) {
4016  for (cnt = (height >> 2); cnt--;) {
4017  LD_UB4(src, src_stride, src0, src1, src2, src3);
4018  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4019  src += (4 * src_stride);
4020  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4021  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4022  dst += (4 * dst_stride);
4023  }
4024  }
4025 }
4026 
4027 static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
4028  uint8_t *dst, int32_t dst_stride,
4029  int32_t height)
4030 {
4031  int32_t cnt;
4032  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4033  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4034 
4035  for (cnt = (height >> 2); cnt--;) {
4036  LD_UB4(src, 16, src0, src1, src2, src3);
4037  src += src_stride;
4038  LD_UB4(src, 16, src4, src5, src6, src7);
4039  src += src_stride;
4040  LD_UB4(src, 16, src8, src9, src10, src11);
4041  src += src_stride;
4042  LD_UB4(src, 16, src12, src13, src14, src15);
4043  src += src_stride;
4044 
4045  ST_UB4(src0, src1, src2, src3, dst, 16);
4046  dst += dst_stride;
4047  ST_UB4(src4, src5, src6, src7, dst, 16);
4048  dst += dst_stride;
4049  ST_UB4(src8, src9, src10, src11, dst, 16);
4050  dst += dst_stride;
4051  ST_UB4(src12, src13, src14, src15, dst, 16);
4052  dst += dst_stride;
4053  }
4054 }
4055 
4056 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4057  uint8_t *dst, int32_t dst_stride,
4058  int32_t height)
4059 {
4060  uint32_t tp0, tp1, tp2, tp3;
4061  v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
4062 
4063  if (8 == height) {
4064  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4065  src += 4 * src_stride;
4066  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4067  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4068  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
4069  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4070  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4071  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4072  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
4073  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4074  ST4x8_UB(dst0, dst1, dst, dst_stride);
4075  } else if (4 == height) {
4076  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4077  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4078  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4079  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4080  dst0 = __msa_aver_u_b(src0, dst0);
4081  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
4082  }
4083 }
4084 
4085 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4086  uint8_t *dst, int32_t dst_stride,
4087  int32_t height)
4088 {
4089  int32_t cnt;
4090  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4091  v16u8 src0, src1, src2, src3;
4092  v16u8 dst0, dst1, dst2, dst3;
4093 
4094  if (0 == (height % 8)) {
4095  for (cnt = (height >> 3); cnt--;) {
4096  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4097  src += 4 * src_stride;
4098  LD4(src, src_stride, tp4, tp5, tp6, tp7);
4099  src += 4 * src_stride;
4100  INSERT_D2_UB(tp0, tp1, src0);
4101  INSERT_D2_UB(tp2, tp3, src1);
4102  INSERT_D2_UB(tp4, tp5, src2);
4103  INSERT_D2_UB(tp6, tp7, src3);
4104  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4105  LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4106  INSERT_D2_UB(tp0, tp1, dst0);
4107  INSERT_D2_UB(tp2, tp3, dst1);
4108  INSERT_D2_UB(tp4, tp5, dst2);
4109  INSERT_D2_UB(tp6, tp7, dst3);
4110  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4111  dst1, dst2, dst3);
4112  ST8x8_UB(dst0, dst1, dst2, dst3, dst, dst_stride);
4113  dst += 8 * dst_stride;
4114  }
4115  } else if (4 == height) {
4116  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4117  INSERT_D2_UB(tp0, tp1, src0);
4118  INSERT_D2_UB(tp2, tp3, src1);
4119  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4120  INSERT_D2_UB(tp0, tp1, dst0);
4121  INSERT_D2_UB(tp2, tp3, dst1);
4122  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4123  ST8x4_UB(dst0, dst1, dst, dst_stride);
4124  }
4125 }
4126 
4127 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4128  uint8_t *dst, int32_t dst_stride,
4129  int32_t height)
4130 {
4131  int32_t cnt;
4132  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4133  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4134 
4135  if (0 == (height % 8)) {
4136  for (cnt = (height / 8); cnt--;) {
4137  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4138  src += (8 * src_stride);
4139  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4140 
4141  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4142  dst0, dst1, dst2, dst3);
4143  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4144  dst4, dst5, dst6, dst7);
4145  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4146  dst += (8 * dst_stride);
4147  }
4148  } else if (0 == (height % 4)) {
4149  for (cnt = (height / 4); cnt--;) {
4150  LD_UB4(src, src_stride, src0, src1, src2, src3);
4151  src += (4 * src_stride);
4152  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4153 
4154  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4155  dst0, dst1, dst2, dst3);
4156  ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4157  dst += (4 * dst_stride);
4158  }
4159  }
4160 }
4161 
4162 static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
4163  uint8_t *dst, int32_t dst_stride,
4164  int32_t height)
4165 {
4166  int32_t cnt;
4167  uint8_t *dst_dup = dst;
4168  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4169  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4170  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4171  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4172 
4173  if (0 == (height % 8)) {
4174  for (cnt = (height / 8); cnt--;) {
4175  LD_UB4(src, src_stride, src0, src2, src4, src6);
4176  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4177  src += (4 * src_stride);
4178  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4179  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4180  dst_dup += (4 * dst_stride);
4181  LD_UB4(src, src_stride, src8, src10, src12, src14);
4182  LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4183  src += (4 * src_stride);
4184  LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4185  LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4186  dst_dup += (4 * dst_stride);
4187 
4188  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4189  dst0, dst1, dst2, dst3);
4190  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4191  dst4, dst5, dst6, dst7);
4192  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4193  dst8, dst9, dst10, dst11);
4194  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4195  dst12, dst13, dst14, dst15);
4196 
4197  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4198  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4199  dst += (4 * dst_stride);
4200  ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4201  ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4202  dst += (4 * dst_stride);
4203  }
4204  } else if (0 == (height % 4)) {
4205  for (cnt = (height / 4); cnt--;) {
4206  LD_UB4(src, src_stride, src0, src2, src4, src6);
4207  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4208  src += (4 * src_stride);
4209  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4210  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4211  dst_dup += (4 * dst_stride);
4212 
4213  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4214  dst0, dst1, dst2, dst3);
4215  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4216  dst4, dst5, dst6, dst7);
4217 
4218  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4219  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4220  dst += (4 * dst_stride);
4221  }
4222  }
4223 }
4224 
4225 static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
4226  uint8_t *dst, int32_t dst_stride,
4227  int32_t height)
4228 {
4229  int32_t cnt;
4230  uint8_t *dst_dup = dst;
4231  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4232  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4233  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4234  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4235 
4236  for (cnt = (height / 4); cnt--;) {
4237  LD_UB4(src, 16, src0, src1, src2, src3);
4238  src += src_stride;
4239  LD_UB4(src, 16, src4, src5, src6, src7);
4240  src += src_stride;
4241  LD_UB4(src, 16, src8, src9, src10, src11);
4242  src += src_stride;
4243  LD_UB4(src, 16, src12, src13, src14, src15);
4244  src += src_stride;
4245 
4246  LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4247  dst_dup += dst_stride;
4248  LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4249  dst_dup += dst_stride;
4250  LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4251  dst_dup += dst_stride;
4252  LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4253  dst_dup += dst_stride;
4254 
4255  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4256  dst0, dst1, dst2, dst3);
4257  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4258  dst4, dst5, dst6, dst7);
4259  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4260  dst8, dst9, dst10, dst11);
4261  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4262  dst12, dst13, dst14, dst15);
4263 
4264  ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4265  dst += dst_stride;
4266  ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4267  dst += dst_stride;
4268  ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4269  dst += dst_stride;
4270  ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4271  dst += dst_stride;
4272  }
4273 }
4274 
4275 static const int8_t vp9_subpel_filters_msa[3][15][8] = {
4276  [FILTER_8TAP_REGULAR] = {
4277  {0, 1, -5, 126, 8, -3, 1, 0},
4278  {-1, 3, -10, 122, 18, -6, 2, 0},
4279  {-1, 4, -13, 118, 27, -9, 3, -1},
4280  {-1, 4, -16, 112, 37, -11, 4, -1},
4281  {-1, 5, -18, 105, 48, -14, 4, -1},
4282  {-1, 5, -19, 97, 58, -16, 5, -1},
4283  {-1, 6, -19, 88, 68, -18, 5, -1},
4284  {-1, 6, -19, 78, 78, -19, 6, -1},
4285  {-1, 5, -18, 68, 88, -19, 6, -1},
4286  {-1, 5, -16, 58, 97, -19, 5, -1},
4287  {-1, 4, -14, 48, 105, -18, 5, -1},
4288  {-1, 4, -11, 37, 112, -16, 4, -1},
4289  {-1, 3, -9, 27, 118, -13, 4, -1},
4290  {0, 2, -6, 18, 122, -10, 3, -1},
4291  {0, 1, -3, 8, 126, -5, 1, 0},
4292  }, [FILTER_8TAP_SHARP] = {
4293  {-1, 3, -7, 127, 8, -3, 1, 0},
4294  {-2, 5, -13, 125, 17, -6, 3, -1},
4295  {-3, 7, -17, 121, 27, -10, 5, -2},
4296  {-4, 9, -20, 115, 37, -13, 6, -2},
4297  {-4, 10, -23, 108, 48, -16, 8, -3},
4298  {-4, 10, -24, 100, 59, -19, 9, -3},
4299  {-4, 11, -24, 90, 70, -21, 10, -4},
4300  {-4, 11, -23, 80, 80, -23, 11, -4},
4301  {-4, 10, -21, 70, 90, -24, 11, -4},
4302  {-3, 9, -19, 59, 100, -24, 10, -4},
4303  {-3, 8, -16, 48, 108, -23, 10, -4},
4304  {-2, 6, -13, 37, 115, -20, 9, -4},
4305  {-2, 5, -10, 27, 121, -17, 7, -3},
4306  {-1, 3, -6, 17, 125, -13, 5, -2},
4307  {0, 1, -3, 8, 127, -7, 3, -1},
4308  }, [FILTER_8TAP_SMOOTH] = {
4309  {-3, -1, 32, 64, 38, 1, -3, 0},
4310  {-2, -2, 29, 63, 41, 2, -3, 0},
4311  {-2, -2, 26, 63, 43, 4, -4, 0},
4312  {-2, -3, 24, 62, 46, 5, -4, 0},
4313  {-2, -3, 21, 60, 49, 7, -4, 0},
4314  {-1, -4, 18, 59, 51, 9, -4, 0},
4315  {-1, -4, 16, 57, 53, 12, -4, -1},
4316  {-1, -4, 14, 55, 55, 14, -4, -1},
4317  {-1, -4, 12, 53, 57, 16, -4, -1},
4318  {0, -4, 9, 51, 59, 18, -4, -1},
4319  {0, -4, 7, 49, 60, 21, -3, -2},
4320  {0, -4, 5, 46, 62, 24, -3, -2},
4321  {0, -4, 4, 43, 63, 26, -2, -2},
4322  {0, -3, 2, 41, 63, 29, -2, -2},
4323  {0, -3, 1, 38, 64, 32, -1, -3},
4324  }
4325 };
4326 
/*
 * Generates the six public 8-tap entry points for one block width and one
 * filter family: ff_{put,avg}_8tap_<type>_<SIZE>{h,v,hv}_msa.
 *
 * SIZE       block width, pasted into the names of the common_* workers
 * type       filter family name used in the generated function names
 * type_idx   first index into vp9_subpel_filters_msa
 *
 * The h variants pick the tap row with mx - 1, the v variants with my - 1,
 * and the hv variants use both.
 */
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                          \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride,                \
                               vp9_subpel_filters_msa[type_idx][mx - 1], h);  \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride,                \
                               vp9_subpel_filters_msa[type_idx][my - 1], h);  \
}                                                                             \
                                                                              \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride,           \
                                    vp9_subpel_filters_msa[type_idx][mx - 1], \
                                    vp9_subpel_filters_msa[type_idx][my - 1], \
                                    h);                                       \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(                                  \
        src, srcstride, dst, dststride,                                       \
        vp9_subpel_filters_msa[type_idx][mx - 1], h);                         \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                        const uint8_t *src,                   \
                                        ptrdiff_t srcstride,                  \
                                        int h, int mx, int my)                \
{                                                                             \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(                                  \
        src, srcstride, dst, dststride,                                       \
        vp9_subpel_filters_msa[type_idx][my - 1], h);                         \
}                                                                             \
                                                                              \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,   \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my)               \
{                                                                             \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(                             \
        src, srcstride, dst, dststride,                                       \
        vp9_subpel_filters_msa[type_idx][mx - 1],                             \
        vp9_subpel_filters_msa[type_idx][my - 1], h);                         \
}
4394 
/*
 * Generates ff_copy<SIZE>_msa and ff_avg<SIZE>_msa, thin public wrappers
 * that reorder the arguments for copy_width<SIZE>_msa / avg_width<SIZE>_msa.
 * mx and my are accepted but not used.
 */
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                               \
void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,            \
                         const uint8_t *src, ptrdiff_t srcstride,      \
                         int h, int mx, int my)                        \
{                                                                      \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);         \
}                                                                      \
                                                                       \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,             \
                        const uint8_t *src, ptrdiff_t srcstride,       \
                        int h, int mx, int my)                         \
{                                                                      \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);          \
}
4411 
/*
 * Generates only ff_avg<SIZE>_msa, a thin public wrapper that reorders the
 * arguments for avg_width<SIZE>_msa.  mx and my are accepted but not used.
 */
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                                    \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,             \
                        const uint8_t *src, ptrdiff_t srcstride,       \
                        int h, int mx, int my)                         \
{                                                                      \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);          \
}
4420 
4426 
4432 
4438 
4444 
4445 #undef VP9_8TAP_MIPS_MSA_FUNC
4446 #undef VP9_COPY_AVG_MIPS_MSA_FUNC
4447 #undef VP9_AVG_MIPS_MSA_FUNC
void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2261
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3130
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1776
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1022
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3678
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:746
#define SLDI_B2_SH(...)
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4395
#define VSHF_B4_SH(...)
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:2624
void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3806
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:523
#define XORI_B2_128_SB(...)
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:159
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3923
#define PCKEV_XORI128_UB(in0, in1)
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3582
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2819
#define LD_SB(...)
#define XORI_B3_128_SB(...)
#define SLDI_B3_UH(...)
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)
Definition: vp9_mc_msa.c:4327
#define ILVR_D2_UB(...)
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2565
void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2027
#define LD_UB4(...)
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3361
#define DPADD_SB4_SH(...)
#define ILVR_B2_SB(...)
#define src
Definition: vp8dsp.c:254
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4027
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3208
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1549
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1756
#define LD_SB2(...)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1067
static const int8_t vp9_subpel_filters_msa[3][15][8]
Definition: vp9_mc_msa.c:4275
#define XORI_B4_128_UB(...)
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2925
#define PCKEV_ST_SB(in0, in1, pdst)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
Definition: vp9_mc_msa.c:102
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1818
#define ILVR_D2_SB(...)
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:3744
uint8_t
#define LD4(psrc, stride, out0, out1, out2, out3)
#define LD_UB2(...)
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3,dst0, dst1,pdst, stride)
Definition: vp9_mc_msa.c:148
#define SRARI_H4_SH(...)
#define XORI_B2_128_UB(...)
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2805
#define SPLATI_H4_SH(...)
#define ILVL_B2_SB(...)
#define height
#define LD_SH(...)
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3172
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2191
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:754
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:268
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:139
#define LD_UB5(...)
void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2704
#define ILVR_D3_SB(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2275
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:657
#define PCKEV_B2_SB(...)
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:471
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2720
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:970
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2407
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2897
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one per link. The additional frames can be left buffered in the filter.
static const uint16_t mask[17]
Definition: lzw.c:38
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1632
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1859
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,filt_h0, filt_h1, filt_h2, filt_h3)
Definition: vp9_mc_msa.c:66
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:762
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1361
#define XORI_B7_128_SB(...)
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3895
#define XORI_B4_128_SB(...)
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:304
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:936
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1123
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:1422
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:237
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:414
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,dst0, dst1, pdst, stride)
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2581
#define SRARI_H2_SH(...)
#define ILVR_B4_UB(...)
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3080
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:953
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2143
#define LD_UB8(...)
#define width
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2178
#define SRARI_H2_UH(...)
#define VSHF_B2_UH(...)
int32_t
void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2997
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3415
#define PCKEV_B4_SB(...)
#define AVER_UB2_UB(...)
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2488
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4127
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:316
#define ST_UB(...)
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1736
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1529
#define SAT_SH4_SH(...)
#define SPLATI_H4_SB(...)
#define LD_SB4(...)
#define PCKEV_B4_UB(...)
#define INSERT_W4_UB(...)
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2520
void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2074
#define ST_UB8(...)
#define AVER_UB4_UB(...)
void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1946
#define ST_UB4(...)
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1181
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:578
#define src1
Definition: h264pred.c:139
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:987
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2329
#define ILVL_B4_SB(...)
#define SAT_SH2_SH(...)
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2117
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3953
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:355
static const int8_t vp9_bilinear_filters_msa[15][2]
Definition: vp9_mc_msa.c:34
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4085
static void avg_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4225
#define ILVR_D4_UB(...)
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1846
static void avg_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4162
#define DOTP_SB4_SH(...)
#define DOTP_UB2_UH(...)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,filt0, filt1, filt2, filt3)
Definition: vp9_mc_msa.c:52
void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1960
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3997
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:130
#define SRARI_H4_UH(...)
#define src0
Definition: h264pred.c:138
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:3291
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:838
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3012
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2215
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3262
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:188
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1302
#define ST4x8_UB(in0, in1, pdst, stride)
#define LD_SB7(...)
#define LD_SB5(...)
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3247
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1240
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3694
#define INSERT_D2_UB(...)
#define ILVEV_B2_SH(...)
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ILVEV_B2_UB(...)
#define ST8x4_UB(in0, in1, pdst, stride)
#define ILVL_B2_UB(...)
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2882
#define SAT_UH2_UH(...)
void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3346
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)
#define SAT_UH4_UH(...)
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3495
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
Definition: vp9_mc_msa.c:83
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:226
#define SLDI_B3_SB(...)
#define LD_UB(...)
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1082
#define DOTP_UB4_UH(...)
#define VSHF_B2_UB(...)
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3909
#define ILVR_B4_SB(...)
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1883
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1519
FILE * out
Definition: movenc.c:54
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3626
void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3824
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2848
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp9_mc_msa.c:25
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1796
#define LD_UH(...)
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4412
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2791
#define PCKEV_B2_UB(...)
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4056
#define ILVR_B2_UB(...)
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1539
#define ADDS_SH4_SH(...)