FFmpeg
vp9_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * Byte-shuffle index table, three rows of 16 entries.  The filter routines
 * in this file load one row as their first mask and obtain the following
 * masks by adding 2, 4 and 6 to it.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* offset 0: 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,

    /* offset 16: 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20,

    /* offset 32: 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33 
/*
 * Two-tap bilinear coefficient pairs.  The first coefficient falls from 120
 * to 8 in steps of 8 while the second rises from 8 to 120, so every pair
 * sums to 128.
 */
static const int8_t vp9_bilinear_filters_msa[15][2] = {
    { 120,   8 }, { 112,  16 }, { 104,  24 },
    {  96,  32 }, {  88,  40 }, {  80,  48 },
    {  72,  56 }, {  64,  64 }, {  56,  72 },
    {  48,  80 }, {  40,  88 }, {  32,  96 },
    {  24, 104 }, {  16, 112 }, {   8, 120 }
};
51 
/*
 * Combine four byte vectors with four coefficient vectors into one v8i16.
 *
 * vec0/filt0 and vec1/filt1 are accumulated into one partial sum with
 * __msa_dotp_s_h followed by __msa_dpadd_s_h; vec2/filt2 and vec3/filt3 form
 * a second partial sum the same way.  The two partial sums are merged with
 * __msa_adds_s_h and the result is the value of this GNU statement
 * expression.
 *
 * Macro hygiene: the temporaries carry an _m suffix so that a caller variable
 * named tmp0 or tmp1 passed as an argument is not shadowed inside the
 * expansion, and every argument is parenthesised before it is cast so that
 * the cast applies to the whole argument expression.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                         \
                            filt0, filt1, filt2, filt3)                     \
( {                                                                         \
    v8i16 dp_lo_m, dp_hi_m;                                                 \
                                                                            \
    dp_lo_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));              \
    dp_lo_m = __msa_dpadd_s_h(dp_lo_m, (v16i8) (vec1), (v16i8) (filt1));    \
    dp_hi_m = __msa_dotp_s_h((v16i8) (vec2), (v16i8) (filt2));              \
    dp_hi_m = __msa_dpadd_s_h(dp_hi_m, (v16i8) (vec3), (v16i8) (filt3));    \
    dp_lo_m = __msa_adds_s_h(dp_lo_m, dp_hi_m);                             \
                                                                            \
    dp_lo_m;                                                                \
} )
65 
/*
 * Horizontal 8-tap filter of the source pair src0/src1.
 *
 * The sources are shuffled once per mask (mask0..mask3) with VSHF_B4_SB, the
 * four shuffled vectors are fed to FILT_8TAP_DPADD_S_H together with
 * filt_h0..filt_h3, and the sum is passed through __msa_srari_h(.., 7) and
 * then __msa_sat_s_h(.., 7).  The macro is a statement expression yielding
 * the resulting v8i16.
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,             \
                        filt_h0, filt_h1, filt_h2, filt_h3)                 \
( {                                                                         \
    v8i16 filtered_m;                                                       \
    v16i8 shuf0_m, shuf1_m, shuf2_m, shuf3_m;                               \
                                                                            \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                      \
               shuf0_m, shuf1_m, shuf2_m, shuf3_m);                         \
    filtered_m = FILT_8TAP_DPADD_S_H(shuf0_m, shuf1_m, shuf2_m, shuf3_m,    \
                                     filt_h0, filt_h1, filt_h2, filt_h3);   \
                                                                            \
    /* shift by 7 first, then saturate */                                   \
    filtered_m = __msa_sat_s_h(__msa_srari_h(filtered_m, 7), 7);            \
                                                                            \
    filtered_m;                                                             \
} )
82 
/*
 * Horizontal 8-tap sums for four source vectors in the 4-wide layout.
 *
 * For every mask the sources are shuffled pairwise (src0 with src1, src2
 * with src3).  The mask0/mask1 shuffles are accumulated with filt0/filt1,
 * the mask2/mask3 shuffles with filt2/filt3, and the two accumulations are
 * merged with ADDS_SH2_SH into out0 (from src0/src1) and out1 (from
 * src2/src3).  No rounding shift or saturation is applied here; the callers
 * in this file do that afterwards.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 sh0a_m, sh0b_m, sh1a_m, sh1b_m;                                   \
    v16i8 sh2a_m, sh2b_m, sh3a_m, sh3b_m;                                   \
    v8i16 first_a_m, first_b_m, second_a_m, second_b_m;                     \
                                                                            \
    /* gather the shuffled inputs for all four masks */                     \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, sh0a_m, sh0b_m);       \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, sh1a_m, sh1b_m);       \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, sh2a_m, sh2b_m);       \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, sh3a_m, sh3b_m);       \
                                                                            \
    /* filt0/filt1 contribution */                                          \
    DOTP_SB2_SH(sh0a_m, sh0b_m, filt0, filt0, first_a_m, first_b_m);        \
    DPADD_SB2_SH(sh1a_m, sh1b_m, filt1, filt1, first_a_m, first_b_m);       \
                                                                            \
    /* filt2/filt3 contribution */                                          \
    DOTP_SB2_SH(sh2a_m, sh2b_m, filt2, filt2, second_a_m, second_b_m);      \
    DPADD_SB2_SH(sh3a_m, sh3b_m, filt3, filt3, second_a_m, second_b_m);     \
                                                                            \
    ADDS_SH2_SH(first_a_m, second_a_m, first_b_m, second_b_m, out0, out1);  \
}
101 
/*
 * Horizontal 8-tap sums for four source vectors in the 8-wide layout.
 *
 * Each source vector is shuffled against itself with every mask.  The
 * mask0/mask1 shuffles are accumulated with filt0/filt1, the mask2/mask3
 * shuffles with filt2/filt3, and the two accumulations are merged with
 * ADDS_SH4_SH so that outN holds the sum for srcN.  No rounding shift or
 * saturation is applied here; the callers in this file do that afterwards.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 sh0_m, sh1_m, sh2_m, sh3_m;                                       \
    v8i16 first0_m, first1_m, first2_m, first3_m;                           \
    v8i16 second0_m, second1_m, second2_m, second3_m;                       \
                                                                            \
    /* filt0 on the mask0 shuffle */                                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, sh0_m, sh1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, sh2_m, sh3_m);         \
    DOTP_SB4_SH(sh0_m, sh1_m, sh2_m, sh3_m, filt0, filt0, filt0, filt0,     \
                first0_m, first1_m, first2_m, first3_m);                    \
                                                                            \
    /* plus filt1 on the mask1 shuffle */                                   \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, sh0_m, sh1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, sh2_m, sh3_m);         \
    DPADD_SB4_SH(sh0_m, sh1_m, sh2_m, sh3_m, filt1, filt1, filt1, filt1,    \
                 first0_m, first1_m, first2_m, first3_m);                   \
                                                                            \
    /* filt2 on the mask2 shuffle */                                        \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, sh0_m, sh1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, sh2_m, sh3_m);         \
    DOTP_SB4_SH(sh0_m, sh1_m, sh2_m, sh3_m, filt2, filt2, filt2, filt2,     \
                second0_m, second1_m, second2_m, second3_m);                \
                                                                            \
    /* plus filt3 on the mask3 shuffle */                                   \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, sh0_m, sh1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, sh2_m, sh3_m);         \
    DPADD_SB4_SH(sh0_m, sh1_m, sh2_m, sh3_m, filt3, filt3, filt3, filt3,    \
                 second0_m, second1_m, second2_m, second3_m);               \
                                                                            \
    ADDS_SH4_SH(first0_m, second0_m, first1_m, second1_m, first2_m,         \
                second2_m, first3_m, second3_m, out0, out1, out2, out3);    \
}
129 
/*
 * Pack in1/in0 with PCKEV_XORI128_UB (defined elsewhere), average the packed
 * bytes with dst using __msa_aver_u_b, and store the 16-byte result at pdst.
 *
 * dst is parenthesised before the cast so the cast applies to the whole
 * argument expression rather than only its first operand.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)                        \
{                                                                           \
    v16u8 tmp_m;                                                            \
                                                                            \
    tmp_m = PCKEV_XORI128_UB(in1, in0);                                     \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) (dst));                           \
    ST_UB(tmp_m, (pdst));                                                   \
}
138 
/*
 * Pack in0/in1 with __msa_pckev_b, average the packed bytes with dst using
 * __msa_aver_u_b, and store the 16-byte result at pdst.
 *
 * in0, in1 and dst are parenthesised before being cast so the casts apply to
 * the whole argument expressions rather than only their first operands.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                                \
{                                                                           \
    v16u8 tmp_m;                                                            \
                                                                            \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));            \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) (dst));                           \
    ST_UB(tmp_m, (pdst));                                                   \
}
147 
/*
 * Pack in0..in3 into two byte vectors (PCKEV_B2_UB), average them with
 * dst0/dst1 (AVER_UB2_UB) and write the result through ST_D4 using element
 * indices 0, 1, 0, 1, starting at pdst with the given stride.
 */
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,                  \
                           pdst, stride)                                    \
{                                                                           \
    v16u8 packed01_m, packed23_m;                                           \
    uint8_t *store_ptr_m = (uint8_t *) (pdst);                              \
                                                                            \
    PCKEV_B2_UB(in1, in0, in3, in2, packed01_m, packed23_m);                \
    AVER_UB2_UB(packed01_m, dst0, packed23_m, dst1,                         \
                packed01_m, packed23_m);                                    \
    ST_D4(packed01_m, packed23_m, 0, 1, 0, 1, store_ptr_m, stride);         \
}
158 
159 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16u8 mask0, mask1, mask2, mask3, out;
164  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 3;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176  mask3 = mask0 + 6;
177 
178  LD_SB4(src, src_stride, src0, src1, src2, src3);
179  XORI_B4_128_SB(src0, src1, src2, src3);
180  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
181  mask3, filt0, filt1, filt2, filt3, out0, out1);
182  SRARI_H2_SH(out0, out1, 7);
183  SAT_SH2_SH(out0, out1, 7);
184  out = PCKEV_XORI128_UB(out0, out1);
185  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
186 }
187 
188 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
189  uint8_t *dst, int32_t dst_stride,
190  const int8_t *filter)
191 {
192  v16i8 filt0, filt1, filt2, filt3;
193  v16i8 src0, src1, src2, src3;
194  v16u8 mask0, mask1, mask2, mask3, out;
195  v8i16 filt, out0, out1, out2, out3;
196 
197  mask0 = LD_UB(&mc_filt_mask_arr[16]);
198  src -= 3;
199 
200  /* rearranging filter */
201  filt = LD_SH(filter);
202  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
203 
204  mask1 = mask0 + 2;
205  mask2 = mask0 + 4;
206  mask3 = mask0 + 6;
207 
208  LD_SB4(src, src_stride, src0, src1, src2, src3);
209  XORI_B4_128_SB(src0, src1, src2, src3);
210  src += (4 * src_stride);
211  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
212  mask3, filt0, filt1, filt2, filt3, out0, out1);
213  LD_SB4(src, src_stride, src0, src1, src2, src3);
214  XORI_B4_128_SB(src0, src1, src2, src3);
215  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
216  mask3, filt0, filt1, filt2, filt3, out2, out3);
217  SRARI_H4_SH(out0, out1, out2, out3, 7);
218  SAT_SH4_SH(out0, out1, out2, out3, 7);
219  out = PCKEV_XORI128_UB(out0, out1);
220  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
221  out = PCKEV_XORI128_UB(out2, out3);
222  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
223 }
224 
/*
 * Dispatch the 4-wide horizontal 8-tap filter by block height.
 * Heights 4 and 8 are handled; any other height leaves dst untouched.
 */
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* no other height is handled here */
        break;
    }
}
235 
236 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
237  uint8_t *dst, int32_t dst_stride,
238  const int8_t *filter)
239 {
240  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
241  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
242  v8i16 filt, out0, out1, out2, out3;
243 
244  mask0 = LD_UB(&mc_filt_mask_arr[0]);
245  src -= 3;
246 
247  /* rearranging filter */
248  filt = LD_SH(filter);
249  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
250 
251  mask1 = mask0 + 2;
252  mask2 = mask0 + 4;
253  mask3 = mask0 + 6;
254 
255  LD_SB4(src, src_stride, src0, src1, src2, src3);
256  XORI_B4_128_SB(src0, src1, src2, src3);
257  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
258  mask3, filt0, filt1, filt2, filt3, out0, out1,
259  out2, out3);
260  SRARI_H4_SH(out0, out1, out2, out3, 7);
261  SAT_SH4_SH(out0, out1, out2, out3, 7);
262  tmp0 = PCKEV_XORI128_UB(out0, out1);
263  tmp1 = PCKEV_XORI128_UB(out2, out3);
264  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
265 }
266 
267 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
268  uint8_t *dst, int32_t dst_stride,
269  const int8_t *filter, int32_t height)
270 {
271  uint32_t loop_cnt;
272  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
273  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
274  v8i16 filt, out0, out1, out2, out3;
275 
276  mask0 = LD_UB(&mc_filt_mask_arr[0]);
277  src -= 3;
278 
279  /* rearranging filter */
280  filt = LD_SH(filter);
281  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
282 
283  mask1 = mask0 + 2;
284  mask2 = mask0 + 4;
285  mask3 = mask0 + 6;
286 
287  for (loop_cnt = (height >> 2); loop_cnt--;) {
288  LD_SB4(src, src_stride, src0, src1, src2, src3);
289  XORI_B4_128_SB(src0, src1, src2, src3);
290  src += (4 * src_stride);
291  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
292  mask3, filt0, filt1, filt2, filt3, out0,
293  out1, out2, out3);
294  SRARI_H4_SH(out0, out1, out2, out3, 7);
295  SAT_SH4_SH(out0, out1, out2, out3, 7);
296  tmp0 = PCKEV_XORI128_UB(out0, out1);
297  tmp1 = PCKEV_XORI128_UB(out2, out3);
298  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
299  dst += (4 * dst_stride);
300  }
301 }
302 
/*
 * Dispatch the 8-wide horizontal 8-tap filter by block height: height 4 uses
 * the dedicated 8x4 routine, every other height the four-rows-per-iteration
 * loop.
 */
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (height != 4) {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
}
314 
315 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
316  uint8_t *dst, int32_t dst_stride,
317  const int8_t *filter, int32_t height)
318 {
319  uint32_t loop_cnt;
320  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
321  v16u8 mask0, mask1, mask2, mask3, out;
322  v8i16 filt, out0, out1, out2, out3;
323 
324  mask0 = LD_UB(&mc_filt_mask_arr[0]);
325  src -= 3;
326 
327  /* rearranging filter */
328  filt = LD_SH(filter);
329  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
330 
331  mask1 = mask0 + 2;
332  mask2 = mask0 + 4;
333  mask3 = mask0 + 6;
334 
335  for (loop_cnt = (height >> 1); loop_cnt--;) {
336  LD_SB2(src, src_stride, src0, src2);
337  LD_SB2(src + 8, src_stride, src1, src3);
338  XORI_B4_128_SB(src0, src1, src2, src3);
339  src += (2 * src_stride);
340  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
341  mask3, filt0, filt1, filt2, filt3, out0,
342  out1, out2, out3);
343  SRARI_H4_SH(out0, out1, out2, out3, 7);
344  SAT_SH4_SH(out0, out1, out2, out3, 7);
345  out = PCKEV_XORI128_UB(out0, out1);
346  ST_UB(out, dst);
347  dst += dst_stride;
348  out = PCKEV_XORI128_UB(out2, out3);
349  ST_UB(out, dst);
350  dst += dst_stride;
351  }
352 }
353 
354 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
355  uint8_t *dst, int32_t dst_stride,
356  const int8_t *filter, int32_t height)
357 {
358  uint32_t loop_cnt;
359  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
360  v16u8 mask0, mask1, mask2, mask3, out;
361  v8i16 filt, out0, out1, out2, out3;
362 
363  mask0 = LD_UB(&mc_filt_mask_arr[0]);
364  src -= 3;
365 
366  /* rearranging filter */
367  filt = LD_SH(filter);
368  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
369 
370  mask1 = mask0 + 2;
371  mask2 = mask0 + 4;
372  mask3 = mask0 + 6;
373 
374  for (loop_cnt = (height >> 1); loop_cnt--;) {
375  src0 = LD_SB(src);
376  src2 = LD_SB(src + 16);
377  src3 = LD_SB(src + 24);
378  src1 = __msa_sldi_b(src2, src0, 8);
379  src += src_stride;
380  XORI_B4_128_SB(src0, src1, src2, src3);
381  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
382  mask3, filt0, filt1, filt2, filt3, out0,
383  out1, out2, out3);
384  SRARI_H4_SH(out0, out1, out2, out3, 7);
385  SAT_SH4_SH(out0, out1, out2, out3, 7);
386 
387  src0 = LD_SB(src);
388  src2 = LD_SB(src + 16);
389  src3 = LD_SB(src + 24);
390  src1 = __msa_sldi_b(src2, src0, 8);
391  src += src_stride;
392 
393  out = PCKEV_XORI128_UB(out0, out1);
394  ST_UB(out, dst);
395  out = PCKEV_XORI128_UB(out2, out3);
396  ST_UB(out, dst + 16);
397  dst += dst_stride;
398 
399  XORI_B4_128_SB(src0, src1, src2, src3);
400  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
401  mask3, filt0, filt1, filt2, filt3, out0,
402  out1, out2, out3);
403  SRARI_H4_SH(out0, out1, out2, out3, 7);
404  SAT_SH4_SH(out0, out1, out2, out3, 7);
405  out = PCKEV_XORI128_UB(out0, out1);
406  ST_UB(out, dst);
407  out = PCKEV_XORI128_UB(out2, out3);
408  ST_UB(out, dst + 16);
409  dst += dst_stride;
410  }
411 }
412 
413 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
414  uint8_t *dst, int32_t dst_stride,
415  const int8_t *filter, int32_t height)
416 {
417  int32_t loop_cnt;
418  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
419  v16u8 mask0, mask1, mask2, mask3, out;
420  v8i16 filt, out0, out1, out2, out3;
421 
422  mask0 = LD_UB(&mc_filt_mask_arr[0]);
423  src -= 3;
424 
425  /* rearranging filter */
426  filt = LD_SH(filter);
427  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
428 
429  mask1 = mask0 + 2;
430  mask2 = mask0 + 4;
431  mask3 = mask0 + 6;
432 
433  for (loop_cnt = height; loop_cnt--;) {
434  src0 = LD_SB(src);
435  src2 = LD_SB(src + 16);
436  src3 = LD_SB(src + 24);
437  src1 = __msa_sldi_b(src2, src0, 8);
438 
439  XORI_B4_128_SB(src0, src1, src2, src3);
440  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
441  mask2, mask3, filt0, filt1, filt2, filt3,
442  out0, out1, out2, out3);
443  SRARI_H4_SH(out0, out1, out2, out3, 7);
444  SAT_SH4_SH(out0, out1, out2, out3, 7);
445  out = PCKEV_XORI128_UB(out0, out1);
446  ST_UB(out, dst);
447  out = PCKEV_XORI128_UB(out2, out3);
448  ST_UB(out, dst + 16);
449 
450  src0 = LD_SB(src + 32);
451  src2 = LD_SB(src + 48);
452  src3 = LD_SB(src + 56);
453  src1 = __msa_sldi_b(src2, src0, 8);
454  src += src_stride;
455 
456  XORI_B4_128_SB(src0, src1, src2, src3);
457  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
458  mask2, mask3, filt0, filt1, filt2, filt3,
459  out0, out1, out2, out3);
460  SRARI_H4_SH(out0, out1, out2, out3, 7);
461  SAT_SH4_SH(out0, out1, out2, out3, 7);
462  out = PCKEV_XORI128_UB(out0, out1);
463  ST_UB(out, dst + 32);
464  out = PCKEV_XORI128_UB(out2, out3);
465  ST_UB(out, dst + 48);
466  dst += dst_stride;
467  }
468 }
469 
470 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
471  uint8_t *dst, int32_t dst_stride,
472  const int8_t *filter, int32_t height)
473 {
474  uint32_t loop_cnt;
475  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
476  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
477  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
478  v16i8 src10998, filt0, filt1, filt2, filt3;
479  v16u8 out;
480  v8i16 filt, out10, out32;
481 
482  src -= (3 * src_stride);
483 
484  filt = LD_SH(filter);
485  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
486 
487  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
488  src += (7 * src_stride);
489 
490  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
491  src54_r, src21_r);
492  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
493  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
494  src4332, src6554);
495  XORI_B3_128_SB(src2110, src4332, src6554);
496 
497  for (loop_cnt = (height >> 2); loop_cnt--;) {
498  LD_SB4(src, src_stride, src7, src8, src9, src10);
499  src += (4 * src_stride);
500 
501  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
502  src87_r, src98_r, src109_r);
503  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
504  XORI_B2_128_SB(src8776, src10998);
505  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
506  filt1, filt2, filt3);
507  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
508  filt1, filt2, filt3);
509  SRARI_H2_SH(out10, out32, 7);
510  SAT_SH2_SH(out10, out32, 7);
511  out = PCKEV_XORI128_UB(out10, out32);
512  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
513  dst += (4 * dst_stride);
514 
515  src2110 = src6554;
516  src4332 = src8776;
517  src6554 = src10998;
518  src6 = src10;
519  }
520 }
521 
522 static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
523  uint8_t *dst, int32_t dst_stride,
524  const int8_t *filter, int32_t height)
525 {
526  uint32_t loop_cnt;
527  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
528  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
529  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
530  v16u8 tmp0, tmp1;
531  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
532 
533  src -= (3 * src_stride);
534 
535  filt = LD_SH(filter);
536  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
537 
538  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
539  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
540  src += (7 * src_stride);
541  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
542  src54_r, src21_r);
543  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
544 
545  for (loop_cnt = (height >> 2); loop_cnt--;) {
546  LD_SB4(src, src_stride, src7, src8, src9, src10);
547  XORI_B4_128_SB(src7, src8, src9, src10);
548  src += (4 * src_stride);
549 
550  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
551  src87_r, src98_r, src109_r);
552  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
553  filt1, filt2, filt3);
554  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
555  filt1, filt2, filt3);
556  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
557  filt1, filt2, filt3);
558  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
559  filt1, filt2, filt3);
560  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
561  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
562  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
563  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
564  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
565  dst += (4 * dst_stride);
566 
567  src10_r = src54_r;
568  src32_r = src76_r;
569  src54_r = src98_r;
570  src21_r = src65_r;
571  src43_r = src87_r;
572  src65_r = src109_r;
573  src6 = src10;
574  }
575 }
576 
577 static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
578  uint8_t *dst, int32_t dst_stride,
579  const int8_t *filter, int32_t height)
580 {
581  uint32_t loop_cnt;
582  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
583  v16i8 filt0, filt1, filt2, filt3;
584  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
585  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
586  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
587  v16u8 tmp0, tmp1, tmp2, tmp3;
588  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
589 
590  src -= (3 * src_stride);
591 
592  filt = LD_SH(filter);
593  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
594 
595  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
596  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
597  src += (7 * src_stride);
598  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
599  src54_r, src21_r);
600  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
601  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
602  src54_l, src21_l);
603  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
604 
605  for (loop_cnt = (height >> 2); loop_cnt--;) {
606  LD_SB4(src, src_stride, src7, src8, src9, src10);
607  XORI_B4_128_SB(src7, src8, src9, src10);
608  src += (4 * src_stride);
609 
610  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
611  src87_r, src98_r, src109_r);
612  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
613  src87_l, src98_l, src109_l);
614  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
615  filt1, filt2, filt3);
616  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
617  filt1, filt2, filt3);
618  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
619  filt1, filt2, filt3);
620  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
621  filt1, filt2, filt3);
622  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
623  filt1, filt2, filt3);
624  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
625  filt1, filt2, filt3);
626  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
627  filt1, filt2, filt3);
628  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
629  filt1, filt2, filt3);
630  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
631  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
632  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
633  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
634  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
635  out3_r, tmp0, tmp1, tmp2, tmp3);
636  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
637  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
638  dst += (4 * dst_stride);
639 
640  src10_r = src54_r;
641  src32_r = src76_r;
642  src54_r = src98_r;
643  src21_r = src65_r;
644  src43_r = src87_r;
645  src65_r = src109_r;
646  src10_l = src54_l;
647  src32_l = src76_l;
648  src54_l = src98_l;
649  src21_l = src65_l;
650  src43_l = src87_l;
651  src65_l = src109_l;
652  src6 = src10;
653  }
654 }
655 
656 static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
657  uint8_t *dst, int32_t dst_stride,
658  const int8_t *filter, int32_t height,
659  int32_t width)
660 {
661  const uint8_t *src_tmp;
662  uint8_t *dst_tmp;
663  uint32_t loop_cnt, cnt;
664  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
665  v16i8 filt0, filt1, filt2, filt3;
666  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
667  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
668  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
669  v16u8 tmp0, tmp1, tmp2, tmp3;
670  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
671 
672  src -= (3 * src_stride);
673 
674  filt = LD_SH(filter);
675  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
676 
677  for (cnt = (width >> 4); cnt--;) {
678  src_tmp = src;
679  dst_tmp = dst;
680 
681  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
682  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
683  src_tmp += (7 * src_stride);
684  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
685  src32_r, src54_r, src21_r);
686  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
687  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
688  src32_l, src54_l, src21_l);
689  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
690 
691  for (loop_cnt = (height >> 2); loop_cnt--;) {
692  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
693  XORI_B4_128_SB(src7, src8, src9, src10);
694  src_tmp += (4 * src_stride);
695  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
696  src87_r, src98_r, src109_r);
697  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
698  src87_l, src98_l, src109_l);
699  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
700  filt0, filt1, filt2, filt3);
701  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
702  filt0, filt1, filt2, filt3);
703  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
704  filt0, filt1, filt2, filt3);
705  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
706  filt0, filt1, filt2, filt3);
707  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
708  filt0, filt1, filt2, filt3);
709  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
710  filt0, filt1, filt2, filt3);
711  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
712  filt0, filt1, filt2, filt3);
713  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
714  filt0, filt1, filt2, filt3);
715  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
716  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
717  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
718  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
719  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
720  out3_r, tmp0, tmp1, tmp2, tmp3);
721  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
722  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
723  dst_tmp += (4 * dst_stride);
724 
725  src10_r = src54_r;
726  src32_r = src76_r;
727  src54_r = src98_r;
728  src21_r = src65_r;
729  src43_r = src87_r;
730  src65_r = src109_r;
731  src10_l = src54_l;
732  src32_l = src76_l;
733  src54_l = src98_l;
734  src21_l = src65_l;
735  src43_l = src87_l;
736  src65_l = src109_l;
737  src6 = src10;
738  }
739 
740  src += 16;
741  dst += 16;
742  }
743 }
744 
/*
 * Vertical 8-tap filter for 32-byte-wide blocks; forwards to the
 * multiple-of-16 routine with a fixed width of 32.
 */
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 32;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, block_width);
}
752 
/*
 * Vertical 8-tap filter for 64-byte-wide blocks; forwards to the
 * multiple-of-16 routine with a fixed width of 64.
 */
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    const int32_t block_width = 64;

    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
                              height, block_width);
}
760 
761 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
762  uint8_t *dst, int32_t dst_stride,
763  const int8_t *filter_horiz,
764  const int8_t *filter_vert,
765  int32_t height)
766 {
767  uint32_t loop_cnt;
768  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
769  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
770  v16u8 mask0, mask1, mask2, mask3, out;
771  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
772  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
773  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
774 
775  mask0 = LD_UB(&mc_filt_mask_arr[16]);
776  src -= (3 + 3 * src_stride);
777 
778  /* rearranging filter */
779  filt = LD_SH(filter_horiz);
780  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
781 
782  mask1 = mask0 + 2;
783  mask2 = mask0 + 4;
784  mask3 = mask0 + 6;
785 
786  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
787  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
788  src += (7 * src_stride);
789 
790  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
791  filt_hz1, filt_hz2, filt_hz3);
792  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
793  filt_hz1, filt_hz2, filt_hz3);
794  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
795  filt_hz1, filt_hz2, filt_hz3);
796  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
797  filt_hz1, filt_hz2, filt_hz3);
798  SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
799 
800  filt = LD_SH(filter_vert);
801  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
802 
803  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
804  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
805 
806  for (loop_cnt = (height >> 2); loop_cnt--;) {
807  LD_SB4(src, src_stride, src7, src8, src9, src10);
808  XORI_B4_128_SB(src7, src8, src9, src10);
809  src += (4 * src_stride);
810 
811  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
812  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
813  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
814  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
815  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
816  filt_vt2, filt_vt3);
817 
818  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
819  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
820  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
821  out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
822  tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
823  filt_vt2, filt_vt3);
824  SRARI_H2_SH(tmp0, tmp1, 7);
825  SAT_SH2_SH(tmp0, tmp1, 7);
826  out = PCKEV_XORI128_UB(tmp0, tmp1);
827  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
828  dst += (4 * dst_stride);
829 
830  hz_out5 = hz_out9;
831  out0 = out2;
832  out1 = out3;
833  out2 = out4;
834  }
835 }
836 
/**
 * Two-dimensional 8-tap filter (horizontal pass, then vertical pass) for an
 * 8-pixel-wide column; the result is stored to dst without averaging.
 *
 * The source pointer is moved back by 3 columns and 3 rows inside the
 * function. Seven rows are horizontally filtered before the loop; each loop
 * iteration horizontally filters four more rows and emits four output rows.
 *
 * @param src          source pixels; reading starts 3 columns left of and
 *                     3 rows above this pointer
 * @param src_stride   offset between consecutive source rows
 * @param dst          destination pixels
 * @param dst_stride   offset between consecutive destination rows
 * @param filter_horiz horizontal filter taps, read with one vector load
 * @param filter_vert  vertical filter taps, read with one vector load
 * @param height       row count; height >> 2 iterations run, so a remainder
 *                     of up to 3 rows is not written
 */
837 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
838  uint8_t *dst, int32_t dst_stride,
839  const int8_t *filter_horiz,
840  const int8_t *filter_vert,
841  int32_t height)
842 {
843  uint32_t loop_cnt;
844  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
845  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
846  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
847  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
848  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
849  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
850  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
851 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
852  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns and 3 rows so the 8-tap windows start there */
853  src -= (3 + 3 * src_stride);
854 
855  /* rearranging filter */
856  filt = LD_SH(filter_horiz);
     /* halfwords 0..3 of the loaded taps go to filt_hz0..filt_hz3
      * (presumably one splatted pair of taps each) */
857  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
858 
     /* same pattern with every index raised by 2, 4 and 6 */
859  mask1 = mask0 + 2;
860  mask2 = mask0 + 4;
861  mask3 = mask0 + 6;
862 
     /* prologue: the first 7 rows are consumed before the loop */
863  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
864  src += (7 * src_stride);
865 
     /* xor with 128; presumably converts the unsigned pixels to a signed
      * range for the signed dot products (undone by PCKEV_XORI128_UB) */
866  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
     /* horizontal pass, one source row per call */
867  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
868  filt_hz1, filt_hz2, filt_hz3);
869  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
870  filt_hz1, filt_hz2, filt_hz3);
871  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
872  filt_hz1, filt_hz2, filt_hz3);
873  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
874  filt_hz1, filt_hz2, filt_hz3);
875  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
876  filt_hz1, filt_hz2, filt_hz3);
877  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
878  filt_hz1, filt_hz2, filt_hz3);
879  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
880  filt_hz1, filt_hz2, filt_hz3);
881 
882  filt = LD_SH(filter_vert);
883  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
884 
     /* byte-interleave pairs of filtered rows for the vertical pass:
      * out0/out1/out2 pair rows (0,1) (2,3) (4,5),
      * out4/out5/out6 pair rows (1,2) (3,4) (5,6) */
885  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
886  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
887  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
888 
     /* four output rows per iteration */
889  for (loop_cnt = (height >> 2); loop_cnt--;) {
890  LD_SB4(src, src_stride, src7, src8, src9, src10);
891  src += (4 * src_stride);
892 
893  XORI_B4_128_SB(src7, src8, src9, src10);
894 
     /* output row 0: row pairs (0,1) (2,3) (4,5) (6,7) */
895  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
896  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
897  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
898  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
899  filt_vt2, filt_vt3);
900 
     /* output row 1: row pairs (1,2) (3,4) (5,6) (7,8) */
901  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
902  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
903  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
904  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
905  filt_vt2, filt_vt3);
906 
     /* output row 2: row pairs (2,3) (4,5) (6,7) (8,9) */
907  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
908  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
909  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
910  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
911  filt_vt1, filt_vt2, filt_vt3);
912 
     /* output row 3: row pairs (3,4) (5,6) (7,8) (9,10) */
913  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
914  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
915  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
916  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
917  filt_vt2, filt_vt3);
     /* shift by 7 (SRARI: presumably a rounding shift), saturate, pack to
      * bytes and xor 128 back, then store doublewords 0/1 of vec0 and vec1 */
918  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
919  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
920  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
921  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
922  ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
923  dst += (4 * dst_stride);
924 
     /* slide the vertical window down by four rows for the next pass */
925  hz_out6 = hz_out10;
926  out0 = out2;
927  out1 = out3;
928  out2 = out8;
929  out4 = out6;
930  out5 = out7;
931  out6 = out9;
932  }
933 }
934 
/**
 * 16-pixel-wide 2-D 8-tap filter.
 *
 * Runs common_hv_8ht_8vt_8w_msa() on two adjacent 8-pixel columns; every
 * other argument is forwarded unchanged.
 */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_offset;

    /* column offsets 0 and 8 */
    for (col_offset = 0; col_offset < 16; col_offset += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride,
                                 dst + col_offset, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
951 
/**
 * 32-pixel-wide 2-D 8-tap filter.
 *
 * Runs common_hv_8ht_8vt_8w_msa() on four adjacent 8-pixel columns; every
 * other argument is forwarded unchanged.
 */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_offset;

    /* column offsets 0, 8, 16 and 24 */
    for (col_offset = 0; col_offset < 32; col_offset += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride,
                                 dst + col_offset, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
968 
/**
 * 64-pixel-wide 2-D 8-tap filter.
 *
 * Runs common_hv_8ht_8vt_8w_msa() on eight adjacent 8-pixel columns; every
 * other argument is forwarded unchanged.
 */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col_offset;

    /* column offsets 0, 8, ..., 56 */
    for (col_offset = 0; col_offset < 64; col_offset += 8) {
        common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride,
                                 dst + col_offset, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
985 
/**
 * Horizontal 8-tap filter for a 4x4 block; the filtered result is averaged
 * with the pixels already in dst and written back.
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; four 32-bit words are read and rewritten
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 */
986 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
987  int32_t src_stride,
988  uint8_t *dst, int32_t dst_stride,
989  const int8_t *filter)
990 {
991  uint32_t tp0, tp1, tp2, tp3;
992  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
993  v16u8 dst0, res;
994  v16u8 mask0, mask1, mask2, mask3;
995  v8i16 filt, res0, res1;
996 
     /* second row of mc_filt_mask_arr: a "4 width cases" pattern */
997  mask0 = LD_UB(&mc_filt_mask_arr[16]);
     /* step back 3 columns so the 8-tap window starts there */
998  src -= 3;
999 
1000  /* rearranging filter */
1001  filt = LD_SH(filter);
1002  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1003 
     /* same pattern with every index raised by 2, 4 and 6 */
1004  mask1 = mask0 + 2;
1005  mask2 = mask0 + 4;
1006  mask3 = mask0 + 6;
1007 
     /* four source rows, xor 128 (presumably unsigned -> signed range) */
1008  LD_SB4(src, src_stride, src0, src1, src2, src3);
1009  XORI_B4_128_SB(src0, src1, src2, src3);
     /* four rows in, two result vectors out */
1010  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1011  mask3, filt0, filt1, filt2, filt3, res0, res1);
     /* gather the four existing destination words into one vector */
1012  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1013  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     /* shift by 7 (SRARI: presumably rounding), saturate, pack to bytes */
1014  SRARI_H2_SH(res0, res1, 7);
1015  SAT_SH2_SH(res0, res1, 7);
1016  res = PCKEV_XORI128_UB(res0, res1);
     /* average with the old destination pixels, then store words 0..3 */
1017  res = (v16u8) __msa_aver_u_b(res, dst0);
1018  ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
1019 }
1020 
/**
 * Horizontal 8-tap filter for a 4x8 block; the filtered result is averaged
 * with the pixels already in dst and written back.
 *
 * The eight rows are handled as two groups of four source rows.
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; eight 32-bit words are read and rewritten
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 */
1021 static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
1022  int32_t src_stride,
1023  uint8_t *dst, int32_t dst_stride,
1024  const int8_t *filter)
1025 {
1026  uint32_t tp0, tp1, tp2, tp3;
1027  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1028  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
1029  v16u8 dst0, dst1;
1030  v8i16 filt, vec0, vec1, vec2, vec3;
1031 
     /* second row of mc_filt_mask_arr: a "4 width cases" pattern */
1032  mask0 = LD_UB(&mc_filt_mask_arr[16]);
     /* step back 3 columns so the 8-tap window starts there */
1033  src -= 3;
1034 
1035  /* rearranging filter */
1036  filt = LD_SH(filter);
1037  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1038 
1039  mask1 = mask0 + 2;
1040  mask2 = mask0 + 4;
1041  mask3 = mask0 + 6;
1042 
     /* source rows 0..3 */
1043  LD_SB4(src, src_stride, src0, src1, src2, src3);
1044  XORI_B4_128_SB(src0, src1, src2, src3);
1045  src += (4 * src_stride);
     /* existing destination rows 0..3 -> dst0, rows 4..7 -> dst1 */
1046  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1047  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1048  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
1049  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1050  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1051  mask3, filt0, filt1, filt2, filt3, vec0, vec1);
     /* source rows 4..7 */
1052  LD_SB4(src, src_stride, src0, src1, src2, src3);
1053  XORI_B4_128_SB(src0, src1, src2, src3);
1054  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1055  mask3, filt0, filt1, filt2, filt3, vec2, vec3);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1056  SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
1057  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
     /* pack each result to bytes, then merge res0/res1 into res0 and
      * res2/res3 into res2 */
1058  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1059  res0, res1, res2, res3);
1060  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
     /* undo the earlier xor 128, average with dst and store eight words */
1061  XORI_B2_128_UB(res0, res2);
1062  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
1063  ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1064 }
1065 
/**
 * Horizontal 8-tap filter with destination averaging for 4-pixel-wide blocks.
 *
 * Dispatches on height to the 4x4 or 4x8 implementation. Any other height is
 * ignored: nothing is read or written.
 */
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
    case 8:
        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                          filter);
        break;
    default:
        /* heights other than 4 and 8 are not handled here */
        break;
    }
}
1080 
/**
 * Horizontal 8-tap filter for an 8-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; 64-bit values are read from each row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     row count; height >> 2 iterations of four rows run, so a
 *                   remainder of up to 3 rows is not processed
 */
1081 static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
1082  int32_t src_stride,
1083  uint8_t *dst, int32_t dst_stride,
1084  const int8_t *filter,
1085  int32_t height)
1086 {
1087  int32_t loop_cnt;
     /* NOTE(review): the sibling 8-wide vertical and 2-D functions declare
      * these temporaries as uint64_t; confirm the signed type is intended */
1088  int64_t tp0, tp1, tp2, tp3;
1089  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1090  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1091  v8i16 filt, out0, out1, out2, out3;
1092 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
1093  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns so the 8-tap window starts there */
1094  src -= 3;
1095 
1096  /* rearranging filter */
1097  filt = LD_SH(filter);
1098  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1099 
1100  mask1 = mask0 + 2;
1101  mask2 = mask0 + 4;
1102  mask3 = mask0 + 6;
1103 
     /* four rows per iteration */
1104  for (loop_cnt = (height >> 2); loop_cnt--;) {
1105  LD_SB4(src, src_stride, src0, src1, src2, src3);
1106  XORI_B4_128_SB(src0, src1, src2, src3);
1107  src += (4 * src_stride);
1108  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1109  mask3, filt0, filt1, filt2, filt3, out0,
1110  out1, out2, out3);
     /* existing destination rows: 0,1 -> dst0 and 2,3 -> dst1 */
1111  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1112  INSERT_D2_UB(tp0, tp1, dst0);
1113  INSERT_D2_UB(tp2, tp3, dst1);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1114  SRARI_H4_SH(out0, out1, out2, out3, 7);
1115  SAT_SH4_SH(out0, out1, out2, out3, 7);
     /* presumably packs to bytes, averages with dst0/dst1 and stores the
      * four rows, as the macro name suggests */
1116  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1117  dst, dst_stride);
1118  dst += (4 * dst_stride);
1119  }
1120 }
1121 
/**
 * Horizontal 8-tap filter for a 16-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; one vector is read and rewritten per row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     row count; height >> 1 iterations of two rows run, so an
 *                   odd final row is not processed
 */
1122 static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
1123  int32_t src_stride,
1124  uint8_t *dst, int32_t dst_stride,
1125  const int8_t *filter,
1126  int32_t height)
1127 {
1128  int32_t loop_cnt;
1129  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1130  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1131  v8i16 filt, out0, out1, out2, out3;
1132  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1133  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1134 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
1135  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns so the 8-tap window starts there */
1136  src -= 3;
1137 
1138  /* rearranging filter */
1139  filt = LD_SH(filter);
1140  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1141 
1142  mask1 = mask0 + 2;
1143  mask2 = mask0 + 4;
1144  mask3 = mask0 + 6;
1145 
     /* two rows per iteration */
1146  for (loop_cnt = height >> 1; loop_cnt--;) {
     /* src0/src1: first row at byte offsets 0 and 8;
      * src2/src3: second row at the same offsets */
1147  LD_SB2(src, src_stride, src0, src2);
1148  LD_SB2(src + 8, src_stride, src1, src3);
1149  src += (2 * src_stride);
1150 
1151  XORI_B4_128_SB(src0, src1, src2, src3);
     /* shuffle each input with mask0..mask3; one result per mask */
1152  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1153  vec12);
1154  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1155  vec13);
1156  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1157  vec14);
1158  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1159  vec15);
     /* two partial sums per input: filt0 + filt1 terms in vec0..vec3 and
      * filt2 + filt3 terms in vec8..vec11, combined by ADDS below */
1160  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1161  vec1, vec2, vec3);
1162  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1163  vec9, vec10, vec11);
1164  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1165  vec1, vec2, vec3);
1166  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1167  vec8, vec9, vec10, vec11);
1168  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1169  out1, out2, out3);
     /* existing destination pixels of both rows */
1170  LD_UB2(dst, dst_stride, dst0, dst1);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1171  SRARI_H4_SH(out0, out1, out2, out3, 7);
1172  SAT_SH4_SH(out0, out1, out2, out3, 7);
     /* presumably packs, xors 128 back, averages with dst and stores,
      * as the macro name suggests; one row per call */
1173  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
1174  dst += dst_stride;
1175  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
1176  dst += dst_stride;
1177  }
1178 }
1179 
/**
 * Horizontal 8-tap filter for a 32-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * One row is processed per loop iteration.
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; two vectors are read and rewritten per row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     number of rows to process
 */
1180 static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
1181  int32_t src_stride,
1182  uint8_t *dst, int32_t dst_stride,
1183  const int8_t *filter,
1184  int32_t height)
1185 {
1186  uint32_t loop_cnt;
1187  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1188  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1189  v8i16 filt, out0, out1, out2, out3;
1190  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1191  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1192 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
1193  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns so the 8-tap window starts there */
1194  src -= 3;
1195 
1196  /* rearranging filter */
1197  filt = LD_SH(filter);
1198  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1199 
1200  mask1 = mask0 + 2;
1201  mask2 = mask0 + 4;
1202  mask3 = mask0 + 6;
1203 
1204  for (loop_cnt = height; loop_cnt--;) {
     /* loads at byte offsets 0, 16 and 24; src1 is built from src0 and
      * src2 with an 8-byte slide (presumably the bytes at offset 8) */
1205  src0 = LD_SB(src);
1206  src2 = LD_SB(src + 16);
1207  src3 = LD_SB(src + 24);
1208  src1 = __msa_sldi_b(src2, src0, 8);
1209  src += src_stride;
1210 
1211  XORI_B4_128_SB(src0, src1, src2, src3);
     /* shuffle each input with mask0..mask3; one result per mask */
1212  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1213  vec12);
1214  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1215  vec13);
1216  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1217  vec14);
1218  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1219  vec15);
     /* two partial sums per input: filt0 + filt1 terms in vec0..vec3 and
      * filt2 + filt3 terms in vec8..vec11, combined by ADDS below */
1220  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1221  vec1, vec2, vec3);
1222  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1223  vec9, vec10, vec11);
1224  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1225  vec1, vec2, vec3);
1226  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1227  vec8, vec9, vec10, vec11);
1228  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1229  out1, out2, out3);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1230  SRARI_H4_SH(out0, out1, out2, out3, 7);
1231  SAT_SH4_SH(out0, out1, out2, out3, 7);
     /* existing destination pixels at byte offsets 0 and 16 of this row */
1232  LD_UB2(dst, 16, dst1, dst2);
1233  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
1234  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
1235  dst += dst_stride;
1236  }
1237 }
1238 
/**
 * Horizontal 8-tap filter for a 64-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * Each row is processed as two 32-byte halves (offset cnt << 5).
 *
 * @param src        source pixels; reading starts 3 columns to the left
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; four vectors are read and rewritten per row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     number of rows to process
 */
1239 static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
1240  int32_t src_stride,
1241  uint8_t *dst, int32_t dst_stride,
1242  const int8_t *filter,
1243  int32_t height)
1244 {
1245  uint32_t loop_cnt, cnt;
1246  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1247  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1248  v8i16 filt, out0, out1, out2, out3;
1249  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1250  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1251 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
1252  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns so the 8-tap window starts there */
1253  src -= 3;
1254 
1255  /* rearranging filter */
1256  filt = LD_SH(filter);
1257  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1258 
1259  mask1 = mask0 + 2;
1260  mask2 = mask0 + 4;
1261  mask3 = mask0 + 6;
1262 
1263  for (loop_cnt = height; loop_cnt--;) {
     /* cnt selects the left (0) or right (1) 32-byte half of the row */
1264  for (cnt = 0; cnt < 2; ++cnt) {
     /* loads at offsets 0, 16 and 24 within the half; src1 is built from
      * src0 and src2 with an 8-byte slide (presumably offset 8) */
1265  src0 = LD_SB(&src[cnt << 5]);
1266  src2 = LD_SB(&src[16 + (cnt << 5)]);
1267  src3 = LD_SB(&src[24 + (cnt << 5)]);
1268  src1 = __msa_sldi_b(src2, src0, 8);
1269 
1270  XORI_B4_128_SB(src0, src1, src2, src3);
1271  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1272  vec12);
1273  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1274  vec13);
1275  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1276  vec10, vec14);
1277  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1278  vec11, vec15);
     /* two partial sums per input: filt0 + filt1 terms in vec0..vec3 and
      * filt2 + filt3 terms in vec8..vec11, combined by ADDS below */
1279  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1280  vec0, vec1, vec2, vec3);
1281  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1282  vec8, vec9, vec10, vec11);
1283  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1284  vec0, vec1, vec2, vec3);
1285  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1286  vec8, vec9, vec10, vec11);
1287  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1288  out1, out2, out3);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1289  SRARI_H4_SH(out0, out1, out2, out3, 7);
1290  SAT_SH4_SH(out0, out1, out2, out3, 7);
     /* existing destination pixels of this half, then average and store */
1291  LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
1292  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
1293  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
1294  }
1295 
1296  src += src_stride;
1297  dst += dst_stride;
1298  }
1299 }
1300 
/**
 * Vertical 8-tap filter for a 4-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * Seven rows are loaded before the loop; each iteration loads four more rows
 * and produces four output rows.
 *
 * @param src        source pixels; reading starts 3 rows above this pointer
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; 32-bit words are read and rewritten per row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     row count; height >> 2 iterations run, so a remainder of
 *                   up to 3 rows is not processed
 */
1301 static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
1302  int32_t src_stride,
1303  uint8_t *dst, int32_t dst_stride,
1304  const int8_t *filter,
1305  int32_t height)
1306 {
1307  uint32_t loop_cnt;
1308  uint32_t tp0, tp1, tp2, tp3;
1309  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1310  v16u8 dst0, out;
1311  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1312  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
1313  v16i8 src10998, filt0, filt1, filt2, filt3;
1314  v8i16 filt, out10, out32;
1315 
     /* step back 3 rows so the 8-tap window starts there */
1316  src -= (3 * src_stride);
1317 
1318  filt = LD_SH(filter);
1319  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1320 
     /* prologue: the first 7 rows are consumed before the loop */
1321  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1322  src += (7 * src_stride);
1323 
     /* interleave adjacent rows (srcBA_r pairs rows A and B), then join two
      * interleaved pairs per vector (src2110 = pairs 10 and 21, and so on) */
1324  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1325  src54_r, src21_r);
1326  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1327  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1328  src4332, src6554);
     /* xor with 128, presumably unsigned -> signed range */
1329  XORI_B3_128_SB(src2110, src4332, src6554);
1330 
     /* four output rows per iteration */
1331  for (loop_cnt = (height >> 2); loop_cnt--;) {
1332  LD_SB4(src, src_stride, src7, src8, src9, src10);
1333  src += (4 * src_stride);
1334 
     /* gather the four existing destination words into one vector */
1335  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1336  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1337  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1338  src87_r, src98_r, src109_r);
1339  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1340  XORI_B2_128_SB(src8776, src10998);
     /* judging by the names, out10 carries output rows 0 and 1 and out32
      * carries rows 2 and 3 */
1341  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1342  filt1, filt2, filt3);
1343  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1344  filt1, filt2, filt3);
     /* shift by 7 (SRARI: presumably rounding), saturate, pack to bytes,
      * average with the old destination pixels */
1345  SRARI_H2_SH(out10, out32, 7);
1346  SAT_SH2_SH(out10, out32, 7);
1347  out = PCKEV_XORI128_UB(out10, out32);
1348  out = __msa_aver_u_b(out, dst0);
1349 
1350  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1351  dst += (4 * dst_stride);
1352 
     /* slide the window down by four rows */
1353  src2110 = src6554;
1354  src4332 = src8776;
1355  src6554 = src10998;
1356  src6 = src10;
1357  }
1358 }
1359 
/**
 * Vertical 8-tap filter for an 8-pixel-wide column; the filtered result is
 * averaged with the pixels already in dst and written back.
 *
 * Seven rows are loaded before the loop; each iteration loads four more rows
 * and produces four output rows.
 *
 * @param src        source pixels; reading starts 3 rows above this pointer
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; 64-bit values are read from each row
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     row count; height >> 2 iterations run, so a remainder of
 *                   up to 3 rows is not processed
 */
1360 static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
1361  int32_t src_stride,
1362  uint8_t *dst, int32_t dst_stride,
1363  const int8_t *filter,
1364  int32_t height)
1365 {
1366  uint32_t loop_cnt;
1367  uint64_t tp0, tp1, tp2, tp3;
1368  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1369  v16u8 dst0, dst1;
1370  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1371  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1372  v8i16 filt, out0, out1, out2, out3;
1373 
     /* step back 3 rows so the 8-tap window starts there */
1374  src -= (3 * src_stride);
1375 
1376  filt = LD_SH(filter);
1377  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1378 
     /* prologue: the first 7 rows are consumed before the loop */
1379  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1380  src += (7 * src_stride);
1381 
     /* xor with 128 (presumably unsigned -> signed range), then interleave
      * adjacent rows: srcBA_r pairs rows A and B */
1382  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1383  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1384  src54_r, src21_r);
1385  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1386 
     /* four output rows per iteration */
1387  for (loop_cnt = (height >> 2); loop_cnt--;) {
1388  LD_SB4(src, src_stride, src7, src8, src9, src10);
1389  src += (4 * src_stride);
1390 
     /* existing destination rows: 0,1 -> dst0 and 2,3 -> dst1 */
1391  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1392  INSERT_D2_UB(tp0, tp1, dst0);
1393  INSERT_D2_UB(tp2, tp3, dst1);
1394  XORI_B4_128_SB(src7, src8, src9, src10);
1395  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1396  src87_r, src98_r, src109_r);
     /* output row k uses the four row pairs that start at row k */
1397  out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1398  filt1, filt2, filt3);
1399  out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1400  filt1, filt2, filt3);
1401  out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1402  filt1, filt2, filt3);
1403  out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1404  filt1, filt2, filt3);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1405  SRARI_H4_SH(out0, out1, out2, out3, 7);
1406  SAT_SH4_SH(out0, out1, out2, out3, 7);
     /* presumably packs to bytes, averages with dst0/dst1 and stores the
      * four rows, as the macro name suggests */
1407  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1408  dst, dst_stride);
1409  dst += (4 * dst_stride);
1410 
     /* slide the window down by four rows */
1411  src10_r = src54_r;
1412  src32_r = src76_r;
1413  src54_r = src98_r;
1414  src21_r = src65_r;
1415  src43_r = src87_r;
1416  src65_r = src109_r;
1417  src6 = src10;
1418  }
1419 }
1420 
/**
 * Vertical 8-tap filter for widths that are multiples of 16; the filtered
 * result is averaged with the pixels already in dst and written back.
 *
 * The block is processed as width >> 4 strips of 16 columns. Within a strip,
 * seven rows are loaded up front and each inner iteration loads four more
 * rows and produces four output rows.
 *
 * @param src        source pixels; reading starts 3 rows above this pointer
 * @param src_stride offset between consecutive source rows
 * @param dst        destination; one vector per row and strip is read and
 *                   rewritten
 * @param dst_stride offset between consecutive destination rows
 * @param filter     filter taps, read with one vector load
 * @param height     row count; height >> 2 inner iterations run, so a
 *                   remainder of up to 3 rows is not processed
 * @param width      block width; a remainder of width / 16 is not processed
 */
1421 static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
1422  int32_t src_stride,
1423  uint8_t *dst,
1424  int32_t dst_stride,
1425  const int8_t *filter,
1426  int32_t height,
1427  int32_t width)
1428 {
1429  const uint8_t *src_tmp;
1430  uint8_t *dst_tmp;
1431  uint32_t loop_cnt, cnt;
1432  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1433  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1434  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1435  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1436  v16i8 filt0, filt1, filt2, filt3;
1437  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1438  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
1439 
     /* step back 3 rows so the 8-tap window starts there */
1440  src -= (3 * src_stride);
1441 
1442  filt = LD_SH(filter);
1443  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1444 
     /* one pass per 16-column strip */
1445  for (cnt = (width >> 4); cnt--;) {
     /* row cursors for this strip; src/dst only move sideways */
1446  src_tmp = src;
1447  dst_tmp = dst;
1448 
     /* prologue: the first 7 rows of the strip */
1449  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1450  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1451  src_tmp += (7 * src_stride);
1452 
     /* interleave adjacent rows; the _r and _l variants presumably cover
      * the right (low) and left (high) halves of the 16 bytes */
1453  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1454  src32_r, src54_r, src21_r);
1455  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1456  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1457  src32_l, src54_l, src21_l);
1458  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1459 
     /* four output rows per iteration */
1460  for (loop_cnt = (height >> 2); loop_cnt--;) {
1461  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1462  src_tmp += (4 * src_stride);
1463 
     /* existing destination pixels of the four rows */
1464  LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1465  XORI_B4_128_SB(src7, src8, src9, src10);
1466  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1467  src87_r, src98_r, src109_r);
1468  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1469  src87_l, src98_l, src109_l);
     /* output row k uses the four row pairs that start at row k, once
      * for each half */
1470  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1471  filt0, filt1, filt2, filt3);
1472  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1473  filt0, filt1, filt2, filt3);
1474  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1475  filt0, filt1, filt2, filt3);
1476  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1477  filt0, filt1, filt2, filt3);
1478  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1479  filt0, filt1, filt2, filt3);
1480  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1481  filt0, filt1, filt2, filt3);
1482  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1483  filt0, filt1, filt2, filt3);
1484  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1485  filt0, filt1, filt2, filt3);
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1486  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1487  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1488  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1489  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
     /* pack both halves of each row into one byte vector, xor 128 back,
      * average with the old destination pixels and store */
1490  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1491  out3_r, tmp0, tmp1, tmp2, tmp3);
1492  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1493  AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1494  dst0, dst1, dst2, dst3);
1495  ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1496  dst_tmp += (4 * dst_stride);
1497 
     /* slide the window down by four rows */
1498  src10_r = src54_r;
1499  src32_r = src76_r;
1500  src54_r = src98_r;
1501  src21_r = src65_r;
1502  src43_r = src87_r;
1503  src65_r = src109_r;
1504  src10_l = src54_l;
1505  src32_l = src76_l;
1506  src54_l = src98_l;
1507  src21_l = src65_l;
1508  src43_l = src87_l;
1509  src65_l = src109_l;
1510  src6 = src10;
1511  }
1512 
     /* move to the next 16-column strip */
1513  src += 16;
1514  dst += 16;
1515  }
1516 }
1517 
/**
 * Vertical 8-tap filter with destination averaging for 16-pixel-wide blocks.
 *
 * Forwards to common_vt_8t_and_aver_dst_16w_mult_msa() with a width of 16.
 */
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 16;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
}
1527 
/**
 * Vertical 8-tap filter with destination averaging for 32-pixel-wide blocks.
 *
 * Forwards to common_vt_8t_and_aver_dst_16w_mult_msa() with a width of 32.
 */
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 32;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
}
1537 
/**
 * Vertical 8-tap filter with destination averaging for 64-pixel-wide blocks.
 *
 * Forwards to common_vt_8t_and_aver_dst_16w_mult_msa() with a width of 64.
 */
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height)
{
    const int32_t block_width = 64;

    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height, block_width);
}
1547 
/**
 * Two-dimensional 8-tap filter (horizontal pass, then vertical pass) for a
 * 4-pixel-wide column; the filtered result is averaged with the pixels
 * already in dst and written back.
 *
 * Each HORIZ_8TAP_FILT call here takes two source rows. Seven rows are
 * horizontally filtered before the loop; each iteration handles four more
 * rows and emits four output rows.
 *
 * @param src          source pixels; reading starts 3 columns left of and
 *                     3 rows above this pointer
 * @param src_stride   offset between consecutive source rows
 * @param dst          destination; 32-bit words are read and rewritten
 * @param dst_stride   offset between consecutive destination rows
 * @param filter_horiz horizontal filter taps, read with one vector load
 * @param filter_vert  vertical filter taps, read with one vector load
 * @param height       row count; height >> 2 iterations run, so a remainder
 *                     of up to 3 rows is not processed
 */
1548 static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
1549  int32_t src_stride,
1550  uint8_t *dst,
1551  int32_t dst_stride,
1552  const int8_t *filter_horiz,
1553  const int8_t *filter_vert,
1554  int32_t height)
1555 {
1556  uint32_t loop_cnt;
1557  uint32_t tp0, tp1, tp2, tp3;
1558  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1559  v16u8 dst0, res, mask0, mask1, mask2, mask3;
1560  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1561  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1562  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
1563  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1564 
     /* second row of mc_filt_mask_arr: a "4 width cases" pattern */
1565  mask0 = LD_UB(&mc_filt_mask_arr[16]);
     /* step back 3 columns and 3 rows so the 8-tap windows start there */
1566  src -= (3 + 3 * src_stride);
1567 
1568  /* rearranging filter */
1569  filt = LD_SH(filter_horiz);
1570  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1571 
1572  mask1 = mask0 + 2;
1573  mask2 = mask0 + 4;
1574  mask3 = mask0 + 6;
1575 
     /* prologue: the first 7 rows are consumed before the loop */
1576  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1577  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1578  src += (7 * src_stride);
1579 
     /* horizontal pass on row pairs (0,1) (2,3) (4,5) (5,6); the odd-numbered
      * hz_out1/hz_out3 are then derived by 8-byte slides of neighbours */
1580  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
1581  filt_hz1, filt_hz2, filt_hz3);
1582  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
1583  filt_hz1, filt_hz2, filt_hz3);
1584  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
1585  filt_hz1, filt_hz2, filt_hz3);
1586  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
1587  filt_hz1, filt_hz2, filt_hz3);
1588  SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
1589 
1590  filt = LD_SH(filter_vert);
1591  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1592 
     /* byte-interleave neighbouring hz results for the vertical pass */
1593  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1594  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1595 
     /* four output rows per iteration */
1596  for (loop_cnt = (height >> 2); loop_cnt--;) {
1597  LD_SB4(src, src_stride, src7, src8, src9, src10);
1598  XORI_B4_128_SB(src7, src8, src9, src10);
1599  src += (4 * src_stride);
1600 
     /* gather the four existing destination words into one vector */
1601  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
1602  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     /* res0: presumably output rows 0 and 1 */
1603  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
1604  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1605  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1606  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1607  res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
1608  filt_vt2, filt_vt3);
1609 
     /* res1: presumably output rows 2 and 3 */
1610  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
1611  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1612  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
1613  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1614  res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
1615  filt_vt2, filt_vt3);
1616 
     /* shift by 7 (SRARI: presumably rounding), saturate, pack to bytes,
      * average with the old destination pixels and store words 0..3 */
1617  SRARI_H2_SH(res0, res1, 7);
1618  SAT_SH2_SH(res0, res1, 7);
1619  res = PCKEV_XORI128_UB(res0, res1);
1620  res = (v16u8) __msa_aver_u_b(res, dst0);
1621  ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
1622  dst += (4 * dst_stride);
1623 
     /* slide the vertical window down by four rows */
1624  hz_out5 = hz_out9;
1625  vec0 = vec2;
1626  vec1 = vec3;
1627  vec2 = vec4;
1628  }
1629 }
1630 
/**
 * Two-dimensional 8-tap filter (horizontal pass, then vertical pass) for an
 * 8-pixel-wide column; the filtered result is averaged with the pixels
 * already in dst and written back.
 *
 * Seven rows are horizontally filtered before the loop; each iteration
 * horizontally filters four more rows and emits four output rows.
 *
 * @param src          source pixels; reading starts 3 columns left of and
 *                     3 rows above this pointer
 * @param src_stride   offset between consecutive source rows
 * @param dst          destination; 64-bit values are read from each row
 * @param dst_stride   offset between consecutive destination rows
 * @param filter_horiz horizontal filter taps, read with one vector load
 * @param filter_vert  vertical filter taps, read with one vector load
 * @param height       row count; height >> 2 iterations run, so a remainder
 *                     of up to 3 rows is not processed
 */
1631 static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
1632  int32_t src_stride,
1633  uint8_t *dst,
1634  int32_t dst_stride,
1635  const int8_t *filter_horiz,
1636  const int8_t *filter_vert,
1637  int32_t height)
1638 {
1639  uint32_t loop_cnt;
1640  uint64_t tp0, tp1, tp2, tp3;
1641  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1642  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1643  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1644  v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
1645  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1646  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
1647  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
1648 
     /* first row of mc_filt_mask_arr: the "8 width cases" pattern */
1649  mask0 = LD_UB(&mc_filt_mask_arr[0]);
     /* step back 3 columns and 3 rows so the 8-tap windows start there */
1650  src -= (3 + 3 * src_stride);
1651 
1652  /* rearranging filter */
1653  filt = LD_SH(filter_horiz);
1654  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1655 
1656  mask1 = mask0 + 2;
1657  mask2 = mask0 + 4;
1658  mask3 = mask0 + 6;
1659 
     /* prologue: the first 7 rows are consumed before the loop */
1660  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1661  src += (7 * src_stride);
1662 
1663  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
     /* horizontal pass, one source row per call */
1664  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1665  filt_hz1, filt_hz2, filt_hz3);
1666  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1667  filt_hz1, filt_hz2, filt_hz3);
1668  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1669  filt_hz1, filt_hz2, filt_hz3);
1670  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1671  filt_hz1, filt_hz2, filt_hz3);
1672  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1673  filt_hz1, filt_hz2, filt_hz3);
1674  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1675  filt_hz1, filt_hz2, filt_hz3);
1676  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1677  filt_hz1, filt_hz2, filt_hz3);
1678 
1679  filt = LD_SH(filter_vert);
1680  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1681 
     /* byte-interleave pairs of filtered rows for the vertical pass:
      * out0/out1/out2 pair rows (0,1) (2,3) (4,5),
      * out4/out5/out6 pair rows (1,2) (3,4) (5,6) */
1682  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1683  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1684  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1685 
     /* four output rows per iteration */
1686  for (loop_cnt = (height >> 2); loop_cnt--;) {
1687  LD_SB4(src, src_stride, src7, src8, src9, src10);
1688  XORI_B4_128_SB(src7, src8, src9, src10);
1689  src += (4 * src_stride);
1690 
     /* existing destination rows: 0,1 -> dst0 and 2,3 -> dst1 */
1691  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1692  INSERT_D2_UB(tp0, tp1, dst0);
1693  INSERT_D2_UB(tp2, tp3, dst1);
1694 
     /* output row 0: row pairs (0,1) (2,3) (4,5) (6,7) */
1695  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
1696  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1697  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1698  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
1699  filt_vt2, filt_vt3);
1700 
     /* output row 1: row pairs (1,2) (3,4) (5,6) (7,8) */
1701  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
1702  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1703  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1704  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
1705  filt_vt2, filt_vt3);
1706 
     /* output row 2: row pairs (2,3) (4,5) (6,7) (8,9) */
1707  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
1708  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1709  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1710  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
1711  filt_vt2, filt_vt3);
1712 
     /* output row 3: row pairs (3,4) (5,6) (7,8) (9,10) */
1713  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
1714  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1715  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1716  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
1717  filt_vt2, filt_vt3);
1718 
     /* shift by 7 (SRARI: presumably rounding) and saturate */
1719  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1720  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     /* presumably packs to bytes, averages with dst0/dst1 and stores the
      * four rows, as the macro name suggests */
1721  CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1,
1722  dst, dst_stride);
1723  dst += (4 * dst_stride);
1724 
     /* slide the vertical window down by four rows */
1725  hz_out6 = hz_out10;
1726  out0 = out2;
1727  out1 = out3;
1728  out2 = out8;
1729  out4 = out6;
1730  out5 = out7;
1731  out6 = out9;
1732  }
1733 }
1734 
/*
 * 16-pixel-wide variant of the 8ht/8vt "and_aver_dst" kernel.
 *
 * The block is split into two 8-column strips and each strip is handed to
 * common_hv_8ht_8vt_and_aver_dst_8w_msa(); strides, filters and height are
 * forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 16; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1754 
/*
 * 32-pixel-wide variant of the 8ht/8vt "and_aver_dst" kernel.
 *
 * The block is split into four 8-column strips and each strip is handed to
 * common_hv_8ht_8vt_and_aver_dst_8w_msa(); strides, filters and height are
 * forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 32; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1774 
/*
 * 64-pixel-wide variant of the 8ht/8vt "and_aver_dst" kernel.
 *
 * The block is split into eight 8-column strips and each strip is handed to
 * common_hv_8ht_8vt_and_aver_dst_8w_msa(); strides, filters and height are
 * forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
                                                   int32_t height)
{
    int32_t col;

    for (col = 0; col < 64; col += 8) {
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                              dst + col, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
    }
}
1794 
1795 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1796  uint8_t *dst, int32_t dst_stride,
1797  const int8_t *filter)
1798 {
1799  v16i8 src0, src1, src2, src3, mask;
1800  v16u8 filt0, vec0, vec1, res0, res1;
1801  v8u16 vec2, vec3, filt;
1802 
1803  mask = LD_SB(&mc_filt_mask_arr[16]);
1804 
1805  /* rearranging filter */
1806  filt = LD_UH(filter);
1807  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1808 
1809  LD_SB4(src, src_stride, src0, src1, src2, src3);
1810  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1811  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1812  SRARI_H2_UH(vec2, vec3, 7);
1813  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1814  ST_W2(res0, 0, 1, dst, dst_stride);
1815  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1816 }
1817 
1818 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1819  uint8_t *dst, int32_t dst_stride,
1820  const int8_t *filter)
1821 {
1822  v16u8 vec0, vec1, vec2, vec3, filt0;
1823  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1824  v16i8 res0, res1, res2, res3;
1825  v8u16 vec4, vec5, vec6, vec7, filt;
1826 
1827  mask = LD_SB(&mc_filt_mask_arr[16]);
1828 
1829  /* rearranging filter */
1830  filt = LD_UH(filter);
1831  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1832 
1833  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1834  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1835  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1836  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1837  vec4, vec5, vec6, vec7);
1838  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1839  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1840  res0, res1, res2, res3);
1841  ST_W2(res0, 0, 1, dst, dst_stride);
1842  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1843  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1844  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1845 }
1846 
1847 void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1848  const uint8_t *src, ptrdiff_t src_stride,
1849  int height, int mx, int my)
1850 {
1851  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1852 
1853  if (4 == height) {
1854  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1855  } else if (8 == height) {
1856  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1857  }
1858 }
1859 
1860 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1861  uint8_t *dst, int32_t dst_stride,
1862  const int8_t *filter)
1863 {
1864  v16u8 filt0;
1865  v16i8 src0, src1, src2, src3, mask;
1866  v8u16 vec0, vec1, vec2, vec3, filt;
1867 
1868  mask = LD_SB(&mc_filt_mask_arr[0]);
1869 
1870  /* rearranging filter */
1871  filt = LD_UH(filter);
1872  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1873 
1874  LD_SB4(src, src_stride, src0, src1, src2, src3);
1875  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1876  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1877  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1878  vec0, vec1, vec2, vec3);
1879  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1880  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1881  ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1882 }
1883 
1884 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1885  uint8_t *dst, int32_t dst_stride,
1886  const int8_t *filter, int32_t height)
1887 {
1888  v16u8 filt0;
1889  v16i8 src0, src1, src2, src3, mask, out0, out1;
1890  v8u16 vec0, vec1, vec2, vec3, filt;
1891 
1892  mask = LD_SB(&mc_filt_mask_arr[0]);
1893 
1894  /* rearranging filter */
1895  filt = LD_UH(filter);
1896  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1897 
1898  LD_SB4(src, src_stride, src0, src1, src2, src3);
1899  src += (4 * src_stride);
1900 
1901  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1902  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1903  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1904  vec0, vec1, vec2, vec3);
1905  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1906  LD_SB4(src, src_stride, src0, src1, src2, src3);
1907  src += (4 * src_stride);
1908 
1909  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1910  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1911 
1912  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1913  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1914  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1915  vec0, vec1, vec2, vec3);
1916  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1917  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1918  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1919  dst += (8 * dst_stride);
1920 
1921  if (16 == height) {
1922  LD_SB4(src, src_stride, src0, src1, src2, src3);
1923  src += (4 * src_stride);
1924 
1925  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1926  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1927  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1928  vec0, vec1, vec2, vec3);
1929  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1930  LD_SB4(src, src_stride, src0, src1, src2, src3);
1931  src += (4 * src_stride);
1932 
1933  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1934  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1935 
1936  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1937  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1938  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1939  vec0, vec1, vec2, vec3);
1940  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1941  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1942  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1943  }
1944 }
1945 
1946 void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1947  const uint8_t *src, ptrdiff_t src_stride,
1948  int height, int mx, int my)
1949 {
1950  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1951 
1952  if (4 == height) {
1953  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1954  } else {
1955  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1956  height);
1957  }
1958 }
1959 
1960 void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1961  const uint8_t *src, ptrdiff_t src_stride,
1962  int height, int mx, int my)
1963 {
1964  uint32_t loop_cnt;
1965  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1966  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1967  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1968  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1969 
1970  mask = LD_SB(&mc_filt_mask_arr[0]);
1971 
1972  loop_cnt = (height >> 2) - 1;
1973 
1974  /* rearranging filter */
1975  filt = LD_UH(filter);
1976  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1977 
1978  LD_SB4(src, src_stride, src0, src2, src4, src6);
1979  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1980  src += (4 * src_stride);
1981 
1982  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1983  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1984  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1985  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1986  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1987  out0, out1, out2, out3);
1988  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1989  out4, out5, out6, out7);
1990  SRARI_H4_UH(out0, out1, out2, out3, 7);
1991  SRARI_H4_UH(out4, out5, out6, out7, 7);
1992  PCKEV_ST_SB(out0, out1, dst);
1993  dst += dst_stride;
1994  PCKEV_ST_SB(out2, out3, dst);
1995  dst += dst_stride;
1996  PCKEV_ST_SB(out4, out5, dst);
1997  dst += dst_stride;
1998  PCKEV_ST_SB(out6, out7, dst);
1999  dst += dst_stride;
2000 
2001  for (; loop_cnt--;) {
2002  LD_SB4(src, src_stride, src0, src2, src4, src6);
2003  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2004  src += (4 * src_stride);
2005 
2006  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2007  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2008  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2009  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2010  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2011  out0, out1, out2, out3);
2012  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2013  out4, out5, out6, out7);
2014  SRARI_H4_UH(out0, out1, out2, out3, 7);
2015  SRARI_H4_UH(out4, out5, out6, out7, 7);
2016  PCKEV_ST_SB(out0, out1, dst);
2017  dst += dst_stride;
2018  PCKEV_ST_SB(out2, out3, dst);
2019  dst += dst_stride;
2020  PCKEV_ST_SB(out4, out5, dst);
2021  dst += dst_stride;
2022  PCKEV_ST_SB(out6, out7, dst);
2023  dst += dst_stride;
2024  }
2025 }
2026 
2027 void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2028  const uint8_t *src, ptrdiff_t src_stride,
2029  int height, int mx, int my)
2030 {
2031  uint32_t loop_cnt;
2032  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2033  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2034  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2035  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2036 
2037  mask = LD_SB(&mc_filt_mask_arr[0]);
2038 
2039  /* rearranging filter */
2040  filt = LD_UH(filter);
2041  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2042 
2043  for (loop_cnt = height >> 1; loop_cnt--;) {
2044  src0 = LD_SB(src);
2045  src2 = LD_SB(src + 16);
2046  src3 = LD_SB(src + 24);
2047  src1 = __msa_sldi_b(src2, src0, 8);
2048  src += src_stride;
2049  src4 = LD_SB(src);
2050  src6 = LD_SB(src + 16);
2051  src7 = LD_SB(src + 24);
2052  src5 = __msa_sldi_b(src6, src4, 8);
2053  src += src_stride;
2054 
2055  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2056  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2057  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2058  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2059  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2060  out0, out1, out2, out3);
2061  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2062  out4, out5, out6, out7);
2063  SRARI_H4_UH(out0, out1, out2, out3, 7);
2064  SRARI_H4_UH(out4, out5, out6, out7, 7);
2065  PCKEV_ST_SB(out0, out1, dst);
2066  PCKEV_ST_SB(out2, out3, dst + 16);
2067  dst += dst_stride;
2068  PCKEV_ST_SB(out4, out5, dst);
2069  PCKEV_ST_SB(out6, out7, dst + 16);
2070  dst += dst_stride;
2071  }
2072 }
2073 
2074 void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2075  const uint8_t *src, ptrdiff_t src_stride,
2076  int height, int mx, int my)
2077 {
2078  uint32_t loop_cnt;
2079  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2080  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2081  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2082  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2083 
2084  mask = LD_SB(&mc_filt_mask_arr[0]);
2085 
2086  /* rearranging filter */
2087  filt = LD_UH(filter);
2088  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2089 
2090  for (loop_cnt = height; loop_cnt--;) {
2091  src0 = LD_SB(src);
2092  src2 = LD_SB(src + 16);
2093  src4 = LD_SB(src + 32);
2094  src6 = LD_SB(src + 48);
2095  src7 = LD_SB(src + 56);
2096  SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
2097  src += src_stride;
2098 
2099  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2100  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2101  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2102  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2103  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2104  out0, out1, out2, out3);
2105  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2106  out4, out5, out6, out7);
2107  SRARI_H4_UH(out0, out1, out2, out3, 7);
2108  SRARI_H4_UH(out4, out5, out6, out7, 7);
2109  PCKEV_ST_SB(out0, out1, dst);
2110  PCKEV_ST_SB(out2, out3, dst + 16);
2111  PCKEV_ST_SB(out4, out5, dst + 32);
2112  PCKEV_ST_SB(out6, out7, dst + 48);
2113  dst += dst_stride;
2114  }
2115 }
2116 
2117 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
2118  uint8_t *dst, int32_t dst_stride,
2119  const int8_t *filter)
2120 {
2121  v16i8 src0, src1, src2, src3, src4;
2122  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2123  v16u8 filt0;
2124  v8i16 filt;
2125  v8u16 tmp0, tmp1;
2126 
2127  filt = LD_SH(filter);
2128  filt0 = (v16u8) __msa_splati_h(filt, 0);
2129 
2130  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2131  src += (5 * src_stride);
2132 
2133  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2134  src10_r, src21_r, src32_r, src43_r);
2135  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2136  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2137  SRARI_H2_UH(tmp0, tmp1, 7);
2138  SAT_UH2_UH(tmp0, tmp1, 7);
2139  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2140  ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
2141 }
2142 
2143 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
2144  uint8_t *dst, int32_t dst_stride,
2145  const int8_t *filter)
2146 {
2147  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2148  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
2149  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
2150  v8u16 tmp0, tmp1, tmp2, tmp3;
2151  v16u8 filt0;
2152  v8i16 filt;
2153 
2154  filt = LD_SH(filter);
2155  filt0 = (v16u8) __msa_splati_h(filt, 0);
2156 
2157  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2158  src += (8 * src_stride);
2159 
2160  src8 = LD_SB(src);
2161  src += src_stride;
2162 
2163  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2164  src32_r, src43_r);
2165  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2166  src76_r, src87_r);
2167  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2168  src87_r, src76_r, src2110, src4332, src6554, src8776);
2169  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2170  tmp0, tmp1, tmp2, tmp3);
2171  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2172  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2173  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2174  ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2175 }
2176 
2177 void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2178  const uint8_t *src, ptrdiff_t src_stride,
2179  int height, int mx, int my)
2180 {
2181  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2182 
2183  if (4 == height) {
2184  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2185  } else if (8 == height) {
2186  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2187  }
2188 }
2189 
2190 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
2191  uint8_t *dst, int32_t dst_stride,
2192  const int8_t *filter)
2193 {
2194  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
2195  v16i8 out0, out1;
2196  v8u16 tmp0, tmp1, tmp2, tmp3;
2197  v8i16 filt;
2198 
2199  /* rearranging filter_y */
2200  filt = LD_SH(filter);
2201  filt0 = (v16u8) __msa_splati_h(filt, 0);
2202 
2203  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2204  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2205  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2206  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2207  tmp0, tmp1, tmp2, tmp3);
2208  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2209  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2210  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2211  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2212 }
2213 
2214 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2215  uint8_t *dst, int32_t dst_stride,
2216  const int8_t *filter, int32_t height)
2217 {
2218  uint32_t loop_cnt;
2219  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2220  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2221  v16i8 out0, out1;
2222  v8u16 tmp0, tmp1, tmp2, tmp3;
2223  v8i16 filt;
2224 
2225  /* rearranging filter_y */
2226  filt = LD_SH(filter);
2227  filt0 = (v16u8) __msa_splati_h(filt, 0);
2228 
2229  src0 = LD_UB(src);
2230  src += src_stride;
2231 
2232  for (loop_cnt = (height >> 3); loop_cnt--;) {
2233  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2234  src += (8 * src_stride);
2235 
2236  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2237  vec0, vec1, vec2, vec3);
2238  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2239  vec4, vec5, vec6, vec7);
2240  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2241  tmp0, tmp1, tmp2, tmp3);
2242  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2243  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2244  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2245  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2246 
2247  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2248  tmp0, tmp1, tmp2, tmp3);
2249  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2250  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2251  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2252  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2253  dst += (8 * dst_stride);
2254 
2255  src0 = src8;
2256  }
2257 }
2258 
2259 void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2260  const uint8_t *src, ptrdiff_t src_stride,
2261  int height, int mx, int my)
2262 {
2263  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2264 
2265  if (4 == height) {
2266  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2267  } else {
2268  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2269  height);
2270  }
2271 }
2272 
2273 void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2274  const uint8_t *src, ptrdiff_t src_stride,
2275  int height, int mx, int my)
2276 {
2277  uint32_t loop_cnt;
2278  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2279  v16u8 src0, src1, src2, src3, src4;
2280  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2281  v8u16 tmp0, tmp1, tmp2, tmp3;
2282  v8i16 filt;
2283 
2284  /* rearranging filter_y */
2285  filt = LD_SH(filter);
2286  filt0 = (v16u8) __msa_splati_h(filt, 0);
2287 
2288  src0 = LD_UB(src);
2289  src += src_stride;
2290 
2291  for (loop_cnt = (height >> 2); loop_cnt--;) {
2292  LD_UB4(src, src_stride, src1, src2, src3, src4);
2293  src += (4 * src_stride);
2294 
2295  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2296  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2297  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2298  SRARI_H2_UH(tmp0, tmp1, 7);
2299  SAT_UH2_UH(tmp0, tmp1, 7);
2300  PCKEV_ST_SB(tmp0, tmp1, dst);
2301  dst += dst_stride;
2302 
2303  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2304  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2305  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2306  SRARI_H2_UH(tmp2, tmp3, 7);
2307  SAT_UH2_UH(tmp2, tmp3, 7);
2308  PCKEV_ST_SB(tmp2, tmp3, dst);
2309  dst += dst_stride;
2310 
2311  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2312  SRARI_H2_UH(tmp0, tmp1, 7);
2313  SAT_UH2_UH(tmp0, tmp1, 7);
2314  PCKEV_ST_SB(tmp0, tmp1, dst);
2315  dst += dst_stride;
2316 
2317  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2318  SRARI_H2_UH(tmp2, tmp3, 7);
2319  SAT_UH2_UH(tmp2, tmp3, 7);
2320  PCKEV_ST_SB(tmp2, tmp3, dst);
2321  dst += dst_stride;
2322 
2323  src0 = src4;
2324  }
2325 }
2326 
2327 void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2328  const uint8_t *src, ptrdiff_t src_stride,
2329  int height, int mx, int my)
2330 {
2331  uint32_t loop_cnt;
2332  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2333  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2334  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2335  v8u16 tmp0, tmp1, tmp2, tmp3;
2336  v8i16 filt;
2337 
2338  /* rearranging filter_y */
2339  filt = LD_SH(filter);
2340  filt0 = (v16u8) __msa_splati_h(filt, 0);
2341 
2342  src0 = LD_UB(src);
2343  src5 = LD_UB(src + 16);
2344  src += src_stride;
2345 
2346  for (loop_cnt = (height >> 2); loop_cnt--;) {
2347  LD_UB4(src, src_stride, src1, src2, src3, src4);
2348  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2349  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2350 
2351  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2352  src += (4 * src_stride);
2353 
2354  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2355  SRARI_H2_UH(tmp0, tmp1, 7);
2356  SAT_UH2_UH(tmp0, tmp1, 7);
2357  PCKEV_ST_SB(tmp0, tmp1, dst);
2358  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2359  SRARI_H2_UH(tmp2, tmp3, 7);
2360  SAT_UH2_UH(tmp2, tmp3, 7);
2361  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2362 
2363  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2364  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2365  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2366  SRARI_H2_UH(tmp0, tmp1, 7);
2367  SAT_UH2_UH(tmp0, tmp1, 7);
2368  PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
2369 
2370  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2371  SRARI_H2_UH(tmp2, tmp3, 7);
2372  SAT_UH2_UH(tmp2, tmp3, 7);
2373  PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
2374 
2375  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2376  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2377  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2378  SRARI_H2_UH(tmp0, tmp1, 7);
2379  SAT_UH2_UH(tmp0, tmp1, 7);
2380  PCKEV_ST_SB(tmp0, tmp1, dst + 16);
2381 
2382  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2383  SRARI_H2_UH(tmp2, tmp3, 7);
2384  SAT_UH2_UH(tmp2, tmp3, 7);
2385  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
2386 
2387  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2388  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2389  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2390  SRARI_H2_UH(tmp0, tmp1, 7);
2391  SAT_UH2_UH(tmp0, tmp1, 7);
2392  PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2393 
2394  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2395  SRARI_H2_UH(tmp2, tmp3, 7);
2396  SAT_UH2_UH(tmp2, tmp3, 7);
2397  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2398  dst += (4 * dst_stride);
2399 
2400  src0 = src4;
2401  src5 = src9;
2402  }
2403 }
2404 
2405 void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2406  const uint8_t *src, ptrdiff_t src_stride,
2407  int height, int mx, int my)
2408 {
2409  uint32_t loop_cnt;
2410  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2411  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2412  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2413  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2414  v8i16 filt;
2415 
2416  /* rearranging filter_y */
2417  filt = LD_SH(filter);
2418  filt0 = (v16u8) __msa_splati_h(filt, 0);
2419 
2420  LD_UB4(src, 16, src0, src3, src6, src9);
2421  src += src_stride;
2422 
2423  for (loop_cnt = (height >> 1); loop_cnt--;) {
2424  LD_UB2(src, src_stride, src1, src2);
2425  LD_UB2(src + 16, src_stride, src4, src5);
2426  LD_UB2(src + 32, src_stride, src7, src8);
2427  LD_UB2(src + 48, src_stride, src10, src11);
2428  src += (2 * src_stride);
2429 
2430  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2431  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2432  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2433  SRARI_H2_UH(tmp0, tmp1, 7);
2434  SAT_UH2_UH(tmp0, tmp1, 7);
2435  PCKEV_ST_SB(tmp0, tmp1, dst);
2436 
2437  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2438  SRARI_H2_UH(tmp2, tmp3, 7);
2439  SAT_UH2_UH(tmp2, tmp3, 7);
2440  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2441 
2442  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2443  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2444  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2445  SRARI_H2_UH(tmp4, tmp5, 7);
2446  SAT_UH2_UH(tmp4, tmp5, 7);
2447  PCKEV_ST_SB(tmp4, tmp5, dst + 16);
2448 
2449  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2450  SRARI_H2_UH(tmp6, tmp7, 7);
2451  SAT_UH2_UH(tmp6, tmp7, 7);
2452  PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
2453 
2454  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2455  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2456  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2457  SRARI_H2_UH(tmp0, tmp1, 7);
2458  SAT_UH2_UH(tmp0, tmp1, 7);
2459  PCKEV_ST_SB(tmp0, tmp1, dst + 32);
2460 
2461  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2462  SRARI_H2_UH(tmp2, tmp3, 7);
2463  SAT_UH2_UH(tmp2, tmp3, 7);
2464  PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
2465 
2466  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2467  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2468  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2469  SRARI_H2_UH(tmp4, tmp5, 7);
2470  SAT_UH2_UH(tmp4, tmp5, 7);
2471  PCKEV_ST_SB(tmp4, tmp5, dst + 48);
2472 
2473  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2474  SRARI_H2_UH(tmp6, tmp7, 7);
2475  SAT_UH2_UH(tmp6, tmp7, 7);
2476  PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
2477  dst += (2 * dst_stride);
2478 
2479  src0 = src2;
2480  src3 = src5;
2481  src6 = src8;
2482  src9 = src11;
2483  }
2484 }
2485 
2486 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
2487  uint8_t *dst, int32_t dst_stride,
2488  const int8_t *filter_horiz, const int8_t *filter_vert)
2489 {
2490  v16i8 src0, src1, src2, src3, src4, mask;
2491  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
2492  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
2493 
2494  mask = LD_SB(&mc_filt_mask_arr[16]);
2495 
2496  /* rearranging filter */
2497  filt = LD_UH(filter_horiz);
2498  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2499 
2500  filt = LD_UH(filter_vert);
2501  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2502 
2503  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2504  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2505  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2506  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2507  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
2508  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
2509 
2510  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2511  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
2512  SRARI_H2_UH(tmp0, tmp1, 7);
2513  SAT_UH2_UH(tmp0, tmp1, 7);
2514  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
2515  ST_W2(res0, 0, 1, dst, dst_stride);
2516  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2517 }
2518 
2519 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
2520  uint8_t *dst, int32_t dst_stride,
2521  const int8_t *filter_horiz, const int8_t *filter_vert)
2522 {
2523  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
2524  v16i8 res0, res1, res2, res3;
2525  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2526  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2527  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
2528 
2529  mask = LD_SB(&mc_filt_mask_arr[16]);
2530 
2531  /* rearranging filter */
2532  filt = LD_UH(filter_horiz);
2533  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2534 
2535  filt = LD_UH(filter_vert);
2536  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2537 
2538  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2539  src += (8 * src_stride);
2540  src8 = LD_SB(src);
2541 
2542  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2543  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2544  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
2545  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
2546  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
2547  SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2548  hz_out3, hz_out5);
2549  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2550 
2551  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2552  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2553  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2554  vec4, vec5, vec6, vec7);
2555  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2556  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2557  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2558  res0, res1, res2, res3);
2559  ST_W2(res0, 0, 1, dst, dst_stride);
2560  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2561  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2562  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2563 }
2564 
2565 void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2566  const uint8_t *src, ptrdiff_t src_stride,
2567  int height, int mx, int my)
2568 {
2569  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2570  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2571 
2572  if (4 == height) {
2573  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2574  filter_horiz, filter_vert);
2575  } else if (8 == height) {
2576  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2577  filter_horiz, filter_vert);
2578  }
2579 }
2580 
2581 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2582  uint8_t *dst, int32_t dst_stride,
2583  const int8_t *filter_horiz, const int8_t *filter_vert)
2584 {
2585  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2586  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2587  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2588  v8i16 filt;
2589 
2590  mask = LD_SB(&mc_filt_mask_arr[0]);
2591 
2592  /* rearranging filter */
2593  filt = LD_SH(filter_horiz);
2594  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2595 
2596  filt = LD_SH(filter_vert);
2597  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2598 
2599  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2600 
2601  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2602  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2603  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2604  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2605 
2606  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2607  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2608  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2609 
2610  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2611  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2612  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2613 
2614  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2615  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2616  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2617 
2618  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2619  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2620  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2621  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2622 }
2623 
/*
 * 2-tap horizontal + 2-tap vertical filtering of an 8-pixel-wide block,
 * processed in groups of eight output rows.
 *
 * src / src_stride: source rows; one row more than the number of output
 *                   rows is read (the row loaded before the loop).
 * dst / dst_stride: destination rows, eight written per loop iteration.
 * filter_horiz / filter_vert: tap tables; element 0 (as 16-bit lanes) of
 *                   each is splatted and used for the respective pass.
 * height:           the loop runs (height >> 3) times, so rows beyond the
 *                   last full group of eight are not produced.
 *
 * hz_out0 and hz_out1 alternate between "previous row" and "current row":
 * the horizontal result of each source row is computed once and reused as
 * the upper row of the next vertical pair, including across iterations.
 */
2624 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2625  uint8_t *dst, int32_t dst_stride,
2626  const int8_t *filter_horiz, const int8_t *filter_vert,
2627  int32_t height)
2628 {
2629  uint32_t loop_cnt;
2630  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2631  v16u8 filt_hz, filt_vt, vec0;
2632  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2633  v8i16 filt;
2634 
/* first entry of the mask table (commented there as the 8-width case) */
2635  mask = LD_SB(&mc_filt_mask_arr[0]);
2636 
2637  /* rearranging filter: splat element 0 of each tap table */
2638  filt = LD_SH(filter_horiz);
2639  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2640 
2641  filt = LD_SH(filter_vert);
2642  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2643 
/* prime the pipeline with the horizontal result of the first source row */
2644  src0 = LD_SB(src);
2645  src += src_stride;
2646 
2647  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2648 
2649  for (loop_cnt = (height >> 3); loop_cnt--;) {
/* source rows for output rows 0..3 of this group */
2650  LD_SB4(src, src_stride, src1, src2, src3, src4);
2651  src += (4 * src_stride);
2652 
/* output row 0: previous row in hz_out0, current row in hz_out1 */
2653  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2654  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2655  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2656 
/* output row 1: roles swap, hz_out0 now holds the current row */
2657  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2658  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2659  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2660 
2661  SRARI_H2_UH(tmp1, tmp2, 7);
2662  SAT_UH2_UH(tmp1, tmp2, 7);
2663 
2664  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2665  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2666  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2667 
/*
 * src4 is consumed here, so src1..src4 can be reloaded with the source rows
 * for output rows 4..7 before the first four rows are finished and stored.
 */
2668  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2669  LD_SB4(src, src_stride, src1, src2, src3, src4);
2670  src += (4 * src_stride);
2671  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2672  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2673 
2674  SRARI_H2_UH(tmp3, tmp4, 7);
2675  SAT_UH2_UH(tmp3, tmp4, 7);
2676  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2677  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2678 
/* output rows 4..7, same ping-pong pattern */
2679  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2680  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2681  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2682 
2683  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2684  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2685  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2686 
2687  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2688  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2689  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2690 
/* hz_out0 ends the iteration holding the last row, ready for the next group */
2691  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2692  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2693  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2694 
2695  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2696  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2697  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2698  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2699  dst += (8 * dst_stride);
2700  }
2701 }
2702 
2703 void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2704  const uint8_t *src, ptrdiff_t src_stride,
2705  int height, int mx, int my)
2706 {
2707  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2708  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2709 
2710  if (4 == height) {
2711  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2712  filter_horiz, filter_vert);
2713  } else {
2714  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2715  filter_horiz, filter_vert, height);
2716  }
2717 }
2718 
/*
 * 2-tap horizontal + 2-tap vertical bilinear filtering of a 16-pixel-wide
 * block, four output rows per loop iteration.
 *
 * mx / my are 1-based indices into vp9_bilinear_filters_msa for the
 * horizontal and vertical taps.  The loop runs (height >> 2) times, so rows
 * beyond the last full group of four are not produced.
 *
 * Each row is handled as two vectors, loaded at column offsets 0 and 8.
 * hz_out0/hz_out2 and hz_out1/hz_out3 alternate between "previous row" and
 * "current row" (first / second vector respectively); the pair written last
 * in an iteration (hz_out0/hz_out2) is the previous row of the next one.
 */
2719 void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2720  const uint8_t *src, ptrdiff_t src_stride,
2721  int height, int mx, int my)
2722 {
2723  uint32_t loop_cnt;
2724  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2725  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2726  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2727  v16u8 filt_hz, filt_vt, vec0, vec1;
2728  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2729  v8i16 filt;
2730 
2731  mask = LD_SB(&mc_filt_mask_arr[0]);
2732 
2733  /* rearranging filter: splat element 0 of each tap table */
2734  filt = LD_SH(filter_horiz);
2735  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2736 
2737  filt = LD_SH(filter_vert);
2738  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2739 
/* prime the pipeline: both vectors (offsets 0 and 8) of the first source row */
2740  LD_SB2(src, 8, src0, src1);
2741  src += src_stride;
2742 
2743  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2744  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2745 
2746 
2747  for (loop_cnt = (height >> 2); loop_cnt--;) {
/* even-numbered srcN: vectors at offset 0; odd-numbered: vectors at offset 8 */
2748  LD_SB4(src, src_stride, src0, src2, src4, src6);
2749  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2750  src += (4 * src_stride);
2751 
/* output row 0: previous row in hz_out0/2, current row in hz_out1/3 */
2752  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2753  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2754  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2755  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2756  SRARI_H2_UH(tmp1, tmp2, 7);
2757  SAT_UH2_UH(tmp1, tmp2, 7);
2758  PCKEV_ST_SB(tmp1, tmp2, dst);
2759  dst += dst_stride;
2760 
/* output row 1: roles swap, so the interleave arguments swap as well */
2761  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2762  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2763  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2764  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2765  SRARI_H2_UH(tmp1, tmp2, 7);
2766  SAT_UH2_UH(tmp1, tmp2, 7);
2767  PCKEV_ST_SB(tmp1, tmp2, dst);
2768  dst += dst_stride;
2769 
/* output row 2 */
2770  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2771  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2772  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2773  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2774  SRARI_H2_UH(tmp1, tmp2, 7);
2775  SAT_UH2_UH(tmp1, tmp2, 7);
2776  PCKEV_ST_SB(tmp1, tmp2, dst);
2777  dst += dst_stride;
2778 
/* output row 3: leaves hz_out0/2 holding the last row for the next iteration */
2779  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2780  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2781  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2782  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2783  SRARI_H2_UH(tmp1, tmp2, 7);
2784  SAT_UH2_UH(tmp1, tmp2, 7);
2785  PCKEV_ST_SB(tmp1, tmp2, dst);
2786  dst += dst_stride;
2787  }
2788 }
2789 
/*
 * Horizontal + vertical bilinear filtering of a 32-pixel-wide block,
 * implemented as two side-by-side calls to the 16-wide routine at column
 * offsets 0 and 16.  All other arguments are forwarded unchanged.
 */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        ff_put_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride,
                              height, mx, my);
    }
}
2803 
/*
 * Horizontal + vertical bilinear filtering of a 64-pixel-wide block,
 * implemented as four side-by-side calls to the 16-wide routine at column
 * offsets 0, 16, 32 and 48.  All other arguments are forwarded unchanged.
 */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        ff_put_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride,
                              height, mx, my);
    }
}
2817 
2818 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
2819  int32_t src_stride,
2820  uint8_t *dst, int32_t dst_stride,
2821  const int8_t *filter)
2822 {
2823  uint32_t tp0, tp1, tp2, tp3;
2824  v16i8 src0, src1, src2, src3, mask;
2825  v16u8 filt0, dst0, vec0, vec1, res;
2826  v8u16 vec2, vec3, filt;
2827 
2828  mask = LD_SB(&mc_filt_mask_arr[16]);
2829 
2830  /* rearranging filter */
2831  filt = LD_UH(filter);
2832  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2833 
2834  LD_SB4(src, src_stride, src0, src1, src2, src3);
2835  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2836  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2837  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2838  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
2839  SRARI_H2_UH(vec2, vec3, 7);
2840 
2841  res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
2842  res = (v16u8) __msa_aver_u_b(res, dst0);
2843 
2844  ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
2845 }
2846 
2847 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
2848  int32_t src_stride,
2849  uint8_t *dst, int32_t dst_stride,
2850  const int8_t *filter)
2851 {
2852  uint32_t tp0, tp1, tp2, tp3;
2853  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2854  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
2855  v16u8 dst0, dst1;
2856  v8u16 vec4, vec5, vec6, vec7, filt;
2857 
2858  mask = LD_SB(&mc_filt_mask_arr[16]);
2859 
2860  /* rearranging filter */
2861  filt = LD_UH(filter);
2862  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2863 
2864  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2865  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
2866  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
2867  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
2868  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
2869  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2870  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
2871  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
2872  vec6, vec7);
2873  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2874  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
2875  res2, res3);
2876  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
2877  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
2878  ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2879 }
2880 
2881 void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2882  const uint8_t *src, ptrdiff_t src_stride,
2883  int height, int mx, int my)
2884 {
2885  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2886 
2887  if (4 == height) {
2888  common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2889  filter);
2890  } else if (8 == height) {
2891  common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2892  filter);
2893  }
2894 }
2895 
2896 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
2897  int32_t src_stride,
2898  uint8_t *dst, int32_t dst_stride,
2899  const int8_t *filter)
2900 {
2901  int64_t tp0, tp1, tp2, tp3;
2902  v16i8 src0, src1, src2, src3, mask;
2903  v16u8 filt0, dst0, dst1;
2904  v8u16 vec0, vec1, vec2, vec3, filt;
2905 
2906  mask = LD_SB(&mc_filt_mask_arr[0]);
2907 
2908  /* rearranging filter */
2909  filt = LD_UH(filter);
2910  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2911 
2912  LD_SB4(src, src_stride, src0, src1, src2, src3);
2913  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2914  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2915  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2916  vec0, vec1, vec2, vec3);
2917  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2918  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2919  INSERT_D2_UB(tp0, tp1, dst0);
2920  INSERT_D2_UB(tp2, tp3, dst1);
2921  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2922 }
2923 
2924 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
2925  int32_t src_stride,
2926  uint8_t *dst,
2927  int32_t dst_stride,
2928  const int8_t *filter,
2929  int32_t height)
2930 {
2931  int64_t tp0, tp1, tp2, tp3;
2932  v16i8 src0, src1, src2, src3, mask;
2933  v16u8 filt0, dst0, dst1;
2934  v8u16 vec0, vec1, vec2, vec3, filt;
2935 
2936  mask = LD_SB(&mc_filt_mask_arr[0]);
2937 
2938  /* rearranging filter */
2939  filt = LD_UH(filter);
2940  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2941 
2942  LD_SB4(src, src_stride, src0, src1, src2, src3);
2943  src += (4 * src_stride);
2944  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2945  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2946  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2947  vec2, vec3);
2948  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2949  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2950  INSERT_D2_UB(tp0, tp1, dst0);
2951  INSERT_D2_UB(tp2, tp3, dst1);
2952  LD_SB4(src, src_stride, src0, src1, src2, src3);
2953  src += (4 * src_stride);
2954  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2955  dst += (4 * dst_stride);
2956 
2957  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2958  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2959  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2960  vec2, vec3);
2961  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2962  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2963  INSERT_D2_UB(tp0, tp1, dst0);
2964  INSERT_D2_UB(tp2, tp3, dst1);
2965  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2966  dst += (4 * dst_stride);
2967 
2968  if (16 == height) {
2969  LD_SB4(src, src_stride, src0, src1, src2, src3);
2970  src += (4 * src_stride);
2971 
2972  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2973  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2974  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2975  vec1, vec2, vec3);
2976  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2977  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2978  INSERT_D2_UB(tp0, tp1, dst0);
2979  INSERT_D2_UB(tp2, tp3, dst1);
2980  LD_SB4(src, src_stride, src0, src1, src2, src3);
2981  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2982  dst += (4 * dst_stride);
2983 
2984  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2985  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2986  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2987  vec1, vec2, vec3);
2988  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2989  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2990  INSERT_D2_UB(tp0, tp1, dst0);
2991  INSERT_D2_UB(tp2, tp3, dst1);
2992  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
2993  }
2994 }
2995 
2996 void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2997  const uint8_t *src, ptrdiff_t src_stride,
2998  int height, int mx, int my)
2999 {
3000  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3001 
3002  if (4 == height) {
3003  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3004  filter);
3005  } else {
3006  common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3007  filter, height);
3008  }
3009 }
3010 
3011 void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3012  const uint8_t *src, ptrdiff_t src_stride,
3013  int height, int mx, int my)
3014 {
3015  uint32_t loop_cnt;
3016  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3017  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3018  v16u8 filt0, dst0, dst1, dst2, dst3;
3019  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3020  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3021 
3022  mask = LD_SB(&mc_filt_mask_arr[0]);
3023 
3024  /* rearranging filter */
3025  filt = LD_UH(filter);
3026  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3027 
3028  LD_SB4(src, src_stride, src0, src2, src4, src6);
3029  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3030  src += (4 * src_stride);
3031 
3032  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3033  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3034  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3035  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3036  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3037  res2, res3);
3038  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3039  res6, res7);
3040  SRARI_H4_UH(res0, res1, res2, res3, 7);
3041  SRARI_H4_UH(res4, res5, res6, res7, 7);
3042  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3043  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3044  dst += dst_stride;
3045  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3046  dst += dst_stride;
3047  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3048  dst += dst_stride;
3049  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3050  dst += dst_stride;
3051 
3052  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3053  LD_SB4(src, src_stride, src0, src2, src4, src6);
3054  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3055  src += (4 * src_stride);
3056 
3057  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3058  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3059  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3060  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3061  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3062  res1, res2, res3);
3063  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3064  res5, res6, res7);
3065  SRARI_H4_UH(res0, res1, res2, res3, 7);
3066  SRARI_H4_UH(res4, res5, res6, res7, 7);
3067  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3068  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3069  dst += dst_stride;
3070  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3071  dst += dst_stride;
3072  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3073  dst += dst_stride;
3074  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3075  dst += dst_stride;
3076  }
3077 }
3078 
3079 void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3080  const uint8_t *src, ptrdiff_t src_stride,
3081  int height, int mx, int my)
3082 {
3083  uint32_t loop_cnt;
3084  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3085  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3086  v16u8 filt0, dst0, dst1, dst2, dst3;
3087  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3088  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3089 
3090  mask = LD_SB(&mc_filt_mask_arr[0]);
3091 
3092  /* rearranging filter */
3093  filt = LD_UH(filter);
3094  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3095 
3096  for (loop_cnt = (height >> 1); loop_cnt--;) {
3097  src0 = LD_SB(src);
3098  src2 = LD_SB(src + 16);
3099  src3 = LD_SB(src + 24);
3100  src1 = __msa_sldi_b(src2, src0, 8);
3101  src += src_stride;
3102  src4 = LD_SB(src);
3103  src6 = LD_SB(src + 16);
3104  src7 = LD_SB(src + 24);
3105  src5 = __msa_sldi_b(src6, src4, 8);
3106  src += src_stride;
3107 
3108  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3109  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3110  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3111  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3112  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3113  res0, res1, res2, res3);
3114  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3115  res4, res5, res6, res7);
3116  SRARI_H4_UH(res0, res1, res2, res3, 7);
3117  SRARI_H4_UH(res4, res5, res6, res7, 7);
3118  LD_UB2(dst, 16, dst0, dst1);
3119  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3120  PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
3121  dst += dst_stride;
3122  LD_UB2(dst, 16, dst2, dst3);
3123  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3124  PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
3125  dst += dst_stride;
3126  }
3127 }
3128 
3129 void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3130  const uint8_t *src, ptrdiff_t src_stride,
3131  int height, int mx, int my)
3132 {
3133  uint32_t loop_cnt;
3134  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3135  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3136  v16u8 filt0, dst0, dst1, dst2, dst3;
3137  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3138  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3139 
3140  mask = LD_SB(&mc_filt_mask_arr[0]);
3141 
3142  /* rearranging filter */
3143  filt = LD_UH(filter);
3144  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3145 
3146  for (loop_cnt = height; loop_cnt--;) {
3147  LD_SB4(src, 16, src0, src2, src4, src6);
3148  src7 = LD_SB(src + 56);
3149  SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
3150  src += src_stride;
3151 
3152  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3153  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3154  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3155  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3156  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3157  out0, out1, out2, out3);
3158  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3159  out4, out5, out6, out7);
3160  SRARI_H4_UH(out0, out1, out2, out3, 7);
3161  SRARI_H4_UH(out4, out5, out6, out7, 7);
3162  LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3163  PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
3164  PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
3165  PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
3166  PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
3167  dst += dst_stride;
3168  }
3169 }
3170 
3171 static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
3172  int32_t src_stride,
3173  uint8_t *dst, int32_t dst_stride,
3174  const int8_t *filter)
3175 {
3176  uint32_t tp0, tp1, tp2, tp3;
3177  v16i8 src0, src1, src2, src3, src4;
3178  v16u8 dst0, out, filt0, src2110, src4332;
3179  v16i8 src10_r, src32_r, src21_r, src43_r;
3180  v8i16 filt;
3181  v8u16 tmp0, tmp1;
3182 
3183  filt = LD_SH(filter);
3184  filt0 = (v16u8) __msa_splati_h(filt, 0);
3185 
3186  LD_SB4(src, src_stride, src0, src1, src2, src3);
3187  src += (4 * src_stride);
3188 
3189  src4 = LD_SB(src);
3190  src += src_stride;
3191 
3192  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3193  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3194  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3195  src10_r, src21_r, src32_r, src43_r);
3196  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3197  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3198  SRARI_H2_UH(tmp0, tmp1, 7);
3199  SAT_UH2_UH(tmp0, tmp1, 7);
3200 
3201  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3202  out = __msa_aver_u_b(out, dst0);
3203 
3204  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3205 }
3206 
3207 static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
3208  int32_t src_stride,
3209  uint8_t *dst, int32_t dst_stride,
3210  const int8_t *filter)
3211 {
3212  uint32_t tp0, tp1, tp2, tp3;
3213  v16u8 dst0, dst1;
3214  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3215  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3216  v16u8 src2110, src4332, src6554, src8776, filt0;
3217  v8u16 tmp0, tmp1, tmp2, tmp3;
3218  v8i16 filt;
3219 
3220  filt = LD_SH(filter);
3221  filt0 = (v16u8) __msa_splati_h(filt, 0);
3222 
3223  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3224  src += (8 * src_stride);
3225  src8 = LD_SB(src);
3226 
3227  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3228  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3229  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3230  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3231  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3232  src32_r, src43_r);
3233  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3234  src76_r, src87_r);
3235  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3236  src87_r, src76_r, src2110, src4332, src6554, src8776);
3237  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3238  tmp0, tmp1, tmp2, tmp3);
3239  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3240  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3241  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3242  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3243  ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3244 }
3245 
3246 void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3247  const uint8_t *src, ptrdiff_t src_stride,
3248  int height, int mx, int my)
3249 {
3250  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3251 
3252  if (4 == height) {
3253  common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3254  filter);
3255  } else if (8 == height) {
3256  common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3257  filter);
3258  }
3259 }
3260 
3261 static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
3262  int32_t src_stride,
3263  uint8_t *dst,
3264  int32_t dst_stride,
3265  const int8_t *filter)
3266 {
3267  int64_t tp0, tp1, tp2, tp3;
3268  v16u8 src0, src1, src2, src3, src4;
3269  v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
3270  v8u16 tmp0, tmp1, tmp2, tmp3;
3271  v8i16 filt;
3272 
3273  /* rearranging filter_y */
3274  filt = LD_SH(filter);
3275  filt0 = (v16u8) __msa_splati_h(filt, 0);
3276 
3277  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3278  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3279  INSERT_D2_UB(tp0, tp1, dst0);
3280  INSERT_D2_UB(tp2, tp3, dst1);
3281  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3282  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3283  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3284  tmp0, tmp1, tmp2, tmp3);
3285  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3286  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3287  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3288 }
3289 
3290 static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
3291  int32_t src_stride,
3292  uint8_t *dst,
3293  int32_t dst_stride,
3294  const int8_t *filter,
3295  int32_t height)
3296 {
3297  uint32_t loop_cnt;
3298  int64_t tp0, tp1, tp2, tp3;
3299  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3300  v16u8 dst0, dst1, dst2, dst3;
3301  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3302  v8u16 tmp0, tmp1, tmp2, tmp3;
3303  v8i16 filt;
3304 
3305  /* rearranging filter_y */
3306  filt = LD_SH(filter);
3307  filt0 = (v16u8) __msa_splati_h(filt, 0);
3308 
3309  src0 = LD_UB(src);
3310  src += src_stride;
3311 
3312  for (loop_cnt = (height >> 3); loop_cnt--;) {
3313  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3314  src += (8 * src_stride);
3315 
3316  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3317  INSERT_D2_UB(tp0, tp1, dst0);
3318  INSERT_D2_UB(tp2, tp3, dst1);
3319  LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3320  INSERT_D2_UB(tp0, tp1, dst2);
3321  INSERT_D2_UB(tp2, tp3, dst3);
3322 
3323  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3324  vec0, vec1, vec2, vec3);
3325  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3326  vec4, vec5, vec6, vec7);
3327  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3328  tmp0, tmp1, tmp2, tmp3);
3329  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3330  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3331  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3332  dst += (4 * dst_stride);
3333 
3334  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3335  tmp0, tmp1, tmp2, tmp3);
3336  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3337  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3338  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
3339  dst += (4 * dst_stride);
3340 
3341  src0 = src8;
3342  }
3343 }
3344 
3345 void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3346  const uint8_t *src, ptrdiff_t src_stride,
3347  int height, int mx, int my)
3348 {
3349  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3350 
3351  if (4 == height) {
3352  common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3353  filter);
3354  } else {
3355  common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3356  filter, height);
3357  }
3358 }
3359 
3360 void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3361  const uint8_t *src, ptrdiff_t src_stride,
3362  int height, int mx, int my)
3363 {
3364  uint32_t loop_cnt;
3365  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3366  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3367  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3368  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3369 
3370  /* rearranging filter_y */
3371  filt = LD_UH(filter);
3372  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3373 
3374  src0 = LD_UB(src);
3375  src += src_stride;
3376 
3377  for (loop_cnt = (height >> 2); loop_cnt--;) {
3378  LD_UB4(src, src_stride, src1, src2, src3, src4);
3379  src += (4 * src_stride);
3380 
3381  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3382  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3383  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3384  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3385  SRARI_H2_UH(tmp0, tmp1, 7);
3386  SAT_UH2_UH(tmp0, tmp1, 7);
3387  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3388  dst += dst_stride;
3389 
3390  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3391  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3392  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3393  SRARI_H2_UH(tmp2, tmp3, 7);
3394  SAT_UH2_UH(tmp2, tmp3, 7);
3395  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
3396  dst += dst_stride;
3397 
3398  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3399  SRARI_H2_UH(tmp0, tmp1, 7);
3400  SAT_UH2_UH(tmp0, tmp1, 7);
3401  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3402  dst += dst_stride;
3403 
3404  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3405  SRARI_H2_UH(tmp2, tmp3, 7);
3406  SAT_UH2_UH(tmp2, tmp3, 7);
3407  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
3408  dst += dst_stride;
3409 
3410  src0 = src4;
3411  }
3412 }
3413 
3414 void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3415  const uint8_t *src, ptrdiff_t src_stride,
3416  int height, int mx, int my)
3417 {
3418  uint32_t loop_cnt;
3419  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3420  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3421  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3422  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3423  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3424 
3425  /* rearranging filter_y */
3426  filt = LD_UH(filter);
3427  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3428 
3429  LD_UB2(src, 16, src0, src5);
3430  src += src_stride;
3431 
3432  for (loop_cnt = (height >> 2); loop_cnt--;) {
3433  LD_UB4(src, src_stride, src1, src2, src3, src4);
3434  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3435  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3436  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3437 
3438  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3439  LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3440  src += (4 * src_stride);
3441 
3442  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3443  SRARI_H2_UH(tmp0, tmp1, 7);
3444  SAT_UH2_UH(tmp0, tmp1, 7);
3445  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3446 
3447  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3448  SRARI_H2_UH(tmp2, tmp3, 7);
3449  SAT_UH2_UH(tmp2, tmp3, 7);
3450  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3451 
3452  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3453  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3454  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3455  SRARI_H2_UH(tmp0, tmp1, 7);
3456  SAT_UH2_UH(tmp0, tmp1, 7);
3457  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
3458 
3459  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3460  SRARI_H2_UH(tmp2, tmp3, 7);
3461  SAT_UH2_UH(tmp2, tmp3, 7);
3462  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
3463 
3464  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3465  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3466  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3467  SRARI_H2_UH(tmp0, tmp1, 7);
3468  SAT_UH2_UH(tmp0, tmp1, 7);
3469  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
3470 
3471  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3472  SRARI_H2_UH(tmp2, tmp3, 7);
3473  SAT_UH2_UH(tmp2, tmp3, 7);
3474  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
3475 
3476  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3477  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3478  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3479  SRARI_H2_UH(tmp0, tmp1, 7);
3480  SAT_UH2_UH(tmp0, tmp1, 7);
3481  PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
3482 
3483  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3484  SRARI_H2_UH(tmp2, tmp3, 7);
3485  SAT_UH2_UH(tmp2, tmp3, 7);
3486  PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
3487  dst += (4 * dst_stride);
3488 
3489  src0 = src4;
3490  src5 = src9;
3491  }
3492 }
3493 
3494 void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3495  const uint8_t *src, ptrdiff_t src_stride,
3496  int height, int mx, int my)
3497 {
3498  uint32_t loop_cnt;
3499  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3500  v16u8 src0, src1, src2, src3, src4, src5;
3501  v16u8 src6, src7, src8, src9, src10, src11, filt0;
3502  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3503  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3504  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3505  v8u16 filt;
3506 
3507  /* rearranging filter_y */
3508  filt = LD_UH(filter);
3509  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3510 
3511  LD_UB4(src, 16, src0, src3, src6, src9);
3512  src += src_stride;
3513 
3514  for (loop_cnt = (height >> 1); loop_cnt--;) {
3515  LD_UB2(src, src_stride, src1, src2);
3516  LD_UB2(dst, dst_stride, dst0, dst1);
3517  LD_UB2(src + 16, src_stride, src4, src5);
3518  LD_UB2(dst + 16, dst_stride, dst2, dst3);
3519  LD_UB2(src + 32, src_stride, src7, src8);
3520  LD_UB2(dst + 32, dst_stride, dst4, dst5);
3521  LD_UB2(src + 48, src_stride, src10, src11);
3522  LD_UB2(dst + 48, dst_stride, dst6, dst7);
3523  src += (2 * src_stride);
3524 
3525  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3526  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3527  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3528  SRARI_H2_UH(tmp0, tmp1, 7);
3529  SAT_UH2_UH(tmp0, tmp1, 7);
3530  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3531 
3532  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3533  SRARI_H2_UH(tmp2, tmp3, 7);
3534  SAT_UH2_UH(tmp2, tmp3, 7);
3535  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3536 
3537  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3538  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3539  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3540  SRARI_H2_UH(tmp4, tmp5, 7);
3541  SAT_UH2_UH(tmp4, tmp5, 7);
3542  PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
3543 
3544  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3545  SRARI_H2_UH(tmp6, tmp7, 7);
3546  SAT_UH2_UH(tmp6, tmp7, 7);
3547  PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
3548 
3549  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3550  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3551  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3552  SRARI_H2_UH(tmp0, tmp1, 7);
3553  SAT_UH2_UH(tmp0, tmp1, 7);
3554  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
3555 
3556  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3557  SRARI_H2_UH(tmp2, tmp3, 7);
3558  SAT_UH2_UH(tmp2, tmp3, 7);
3559  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
3560 
3561  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3562  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3563  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3564  SRARI_H2_UH(tmp4, tmp5, 7);
3565  SAT_UH2_UH(tmp4, tmp5, 7);
3566  PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
3567 
3568  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3569  SRARI_H2_UH(tmp6, tmp7, 7);
3570  SAT_UH2_UH(tmp6, tmp7, 7);
3571  PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
3572  dst += (2 * dst_stride);
3573 
3574  src0 = src2;
3575  src3 = src5;
3576  src6 = src8;
3577  src9 = src11;
3578  }
3579 }
3580 
3581 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
3582  int32_t src_stride,
3583  uint8_t *dst,
3584  int32_t dst_stride,
3585  const int8_t *filter_horiz,
3586  const int8_t *filter_vert)
3587 {
3588  uint32_t tp0, tp1, tp2, tp3;
3589  v16i8 src0, src1, src2, src3, src4, mask;
3590  v16u8 filt_hz, filt_vt, vec0, vec1;
3591  v16u8 dst0, out;
3592  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3593 
3594  mask = LD_SB(&mc_filt_mask_arr[16]);
3595 
3596  /* rearranging filter */
3597  filt = LD_UH(filter_horiz);
3598  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3599 
3600  filt = LD_UH(filter_vert);
3601  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3602 
3603  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3604 
3605  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3606  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3607  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3608  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3609  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3610  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3611 
3612  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3613  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3614 
3615  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3616  SRARI_H2_UH(tmp0, tmp1, 7);
3617  SAT_UH2_UH(tmp0, tmp1, 7);
3618 
3619  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3620  out = __msa_aver_u_b(out, dst0);
3621 
3622  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3623 }
3624 
3625 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
3626  int32_t src_stride,
3627  uint8_t *dst,
3628  int32_t dst_stride,
3629  const int8_t *filter_horiz,
3630  const int8_t *filter_vert)
3631 {
3632  uint32_t tp0, tp1, tp2, tp3;
3633  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3634  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
3635  v16u8 dst0, dst1;
3636  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3637  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3638  v8i16 filt;
3639 
3640  mask = LD_SB(&mc_filt_mask_arr[16]);
3641 
3642  /* rearranging filter */
3643  filt = LD_SH(filter_horiz);
3644  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3645 
3646  filt = LD_SH(filter_vert);
3647  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3648 
3649  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3650  src += (8 * src_stride);
3651  src8 = LD_SB(src);
3652 
3653  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3654  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3655  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
3656  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
3657  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
3658  SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
3659  hz_out3, hz_out5);
3660  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3661 
3662  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3663  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3664  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3665  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
3666  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3667  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3668  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3669  tmp0, tmp1, tmp2, tmp3);
3670  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3671  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3672  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
3673  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
3674  ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3675 }
3676 
3677 void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3678  const uint8_t *src, ptrdiff_t src_stride,
3679  int height, int mx, int my)
3680 {
3681  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3682  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3683 
3684  if (4 == height) {
3685  common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3686  filter_horiz, filter_vert);
3687  } else if (8 == height) {
3688  common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3689  filter_horiz, filter_vert);
3690  }
3691 }
3692 
3693 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
3694  int32_t src_stride,
3695  uint8_t *dst,
3696  int32_t dst_stride,
3697  const int8_t *filter_horiz,
3698  const int8_t *filter_vert)
3699 {
3700  uint64_t tp0, tp1, tp2, tp3;
3701  v16i8 src0, src1, src2, src3, src4, mask;
3702  v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
3703  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3704  v8i16 filt;
3705 
3706  mask = LD_SB(&mc_filt_mask_arr[0]);
3707 
3708  /* rearranging filter */
3709  filt = LD_SH(filter_horiz);
3710  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3711 
3712  filt = LD_SH(filter_vert);
3713  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3714 
3715  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3716  src += (5 * src_stride);
3717 
3718  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3719  INSERT_D2_UB(tp0, tp1, dst0);
3720  INSERT_D2_UB(tp2, tp3, dst1);
3721  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3722  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3723  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3724  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3725 
3726  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3727  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3728  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3729 
3730  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3731  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3732  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3733 
3734  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3735  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3736  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3737 
3738  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3739  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3740  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3741 }
3742 
3744  int32_t src_stride,
3745  uint8_t *dst,
3746  int32_t dst_stride,
3747  const int8_t *filter_horiz,
3748  const int8_t *filter_vert,
3749  int32_t height)
3750 {
3751  uint32_t loop_cnt;
3752  uint64_t tp0, tp1, tp2, tp3;
3753  v16i8 src0, src1, src2, src3, src4, mask;
3754  v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
3755  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3756  v8i16 filt;
3757 
3758  mask = LD_SB(&mc_filt_mask_arr[0]);
3759 
3760  /* rearranging filter */
3761  filt = LD_SH(filter_horiz);
3762  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3763 
3764  filt = LD_SH(filter_vert);
3765  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3766 
3767  src0 = LD_SB(src);
3768  src += src_stride;
3769 
3770  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3771 
3772  for (loop_cnt = (height >> 2); loop_cnt--;) {
3773  LD_SB4(src, src_stride, src1, src2, src3, src4);
3774  src += (4 * src_stride);
3775 
3776  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3777  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3778  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3779 
3780  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3781  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3782  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3783 
3784  SRARI_H2_UH(tmp0, tmp1, 7);
3785  SAT_UH2_UH(tmp0, tmp1, 7);
3786 
3787  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3788  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3789  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3790 
3791  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3792  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3793  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3794 
3795  SRARI_H2_UH(tmp2, tmp3, 7);
3796  SAT_UH2_UH(tmp2, tmp3, 7);
3797  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3798  INSERT_D2_UB(tp0, tp1, dst0);
3799  INSERT_D2_UB(tp2, tp3, dst1);
3800  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
3801  dst += (4 * dst_stride);
3802  }
3803 }
3804 
3805 void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3806  const uint8_t *src, ptrdiff_t src_stride,
3807  int height, int mx, int my)
3808 {
3809  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3810  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3811 
3812  if (4 == height) {
3813  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3814  filter_horiz, filter_vert);
3815  } else {
3817  dst, dst_stride,
3818  filter_horiz, filter_vert,
3819  height);
3820  }
3821 }
3822 
3823 void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3824  const uint8_t *src, ptrdiff_t src_stride,
3825  int height, int mx, int my)
3826 {
3827  uint32_t loop_cnt;
3828  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3829  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3830  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3831  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3832  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3833  v8i16 filt;
3834 
3835  mask = LD_SB(&mc_filt_mask_arr[0]);
3836 
3837  /* rearranging filter */
3838  filt = LD_SH(filter_horiz);
3839  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3840 
3841  filt = LD_SH(filter_vert);
3842  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3843 
3844  LD_SB2(src, 8, src0, src1);
3845  src += src_stride;
3846 
3847  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3848  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3849 
3850  for (loop_cnt = (height >> 2); loop_cnt--;) {
3851  LD_SB4(src, src_stride, src0, src2, src4, src6);
3852  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3853  src += (4 * src_stride);
3854  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3855 
3856  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3857  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3858  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3859  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3860  SRARI_H2_UH(tmp0, tmp1, 7);
3861  SAT_UH2_UH(tmp0, tmp1, 7);
3862  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3863  dst += dst_stride;
3864 
3865  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3866  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3867  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3868  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3869  SRARI_H2_UH(tmp0, tmp1, 7);
3870  SAT_UH2_UH(tmp0, tmp1, 7);
3871  PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
3872  dst += dst_stride;
3873 
3874  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3875  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
3876  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3877  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3878  SRARI_H2_UH(tmp0, tmp1, 7);
3879  SAT_UH2_UH(tmp0, tmp1, 7);
3880  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3881  dst += dst_stride;
3882 
3883  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
3884  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
3885  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3886  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3887  SRARI_H2_UH(tmp0, tmp1, 7);
3888  SAT_UH2_UH(tmp0, tmp1, 7);
3889  PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
3890  dst += dst_stride;
3891  }
3892 }
3893 
/**
 * Horizontal + vertical bilinear interpolation of a 32-pixel-wide block,
 * averaged into dst.
 *
 * The block is processed as two adjacent 16-column strips, each handed to
 * ff_avg_bilin_16hv_msa() with the same strides, height and fractions.
 */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 32; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3907 
/**
 * Horizontal + vertical bilinear interpolation of a 64-pixel-wide block,
 * averaged into dst.
 *
 * The block is processed as four adjacent 16-column strips, each handed to
 * ff_avg_bilin_16hv_msa() with the same strides, height and fractions.
 */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 64; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3921 
/**
 * Copy an 8-byte-wide block from src to dst.
 *
 * Heights that are a multiple of 8 are copied eight rows per iteration,
 * other multiples of 4 four rows per iteration; any other height copies
 * nothing.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t iters;
    uint64_t row0, row1, row2, row3, row4, row5, row6, row7;

    if (0 == height % 8) {
        iters = height >> 3;
        while (iters--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);
            LD4(src, src_stride, row4, row5, row6, row7);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(row4, row5, row6, row7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
        return;
    }

    if (0 == height % 4) {
        iters = (height / 4);
        while (iters--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);

            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
3951 
3952 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
3953  uint8_t *dst, int32_t dst_stride,
3954  int32_t height)
3955 {
3956  int32_t cnt;
3957  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3958 
3959  if (8 == height) {
3960  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3961  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3962  } else if (16 == height) {
3963  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3964  src += (8 * src_stride);
3965  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3966  dst += (8 * dst_stride);
3967  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3968  src += (8 * src_stride);
3969  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3970  dst += (8 * dst_stride);
3971  } else if (32 == height) {
3972  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3973  src += (8 * src_stride);
3974  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3975  dst += (8 * dst_stride);
3976  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3977  src += (8 * src_stride);
3978  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3979  dst += (8 * dst_stride);
3980  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3981  src += (8 * src_stride);
3982  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3983  dst += (8 * dst_stride);
3984  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3985  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3986  } else if (0 == height % 4) {
3987  for (cnt = (height >> 2); cnt--;) {
3988  LD_UB4(src, src_stride, src0, src1, src2, src3);
3989  src += (4 * src_stride);
3990  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3991  dst += (4 * dst_stride);
3992  }
3993  }
3994 }
3995 
3996 static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
3997  uint8_t *dst, int32_t dst_stride,
3998  int32_t height)
3999 {
4000  int32_t cnt;
4001  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4002 
4003  if (0 == height % 8) {
4004  for (cnt = (height >> 3); cnt--;) {
4005  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4006  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4007  LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4008  src7);
4009  src += (8 * src_stride);
4010  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4011  dst_stride);
4012  dst += (8 * dst_stride);
4013  }
4014  } else if (0 == height % 4) {
4015  for (cnt = (height >> 2); cnt--;) {
4016  LD_UB4(src, src_stride, src0, src1, src2, src3);
4017  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4018  src += (4 * src_stride);
4019  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4020  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4021  dst += (4 * dst_stride);
4022  }
4023  }
4024 }
4025 
4026 static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
4027  uint8_t *dst, int32_t dst_stride,
4028  int32_t height)
4029 {
4030  int32_t cnt;
4031  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4032  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4033 
4034  for (cnt = (height >> 2); cnt--;) {
4035  LD_UB4(src, 16, src0, src1, src2, src3);
4036  src += src_stride;
4037  LD_UB4(src, 16, src4, src5, src6, src7);
4038  src += src_stride;
4039  LD_UB4(src, 16, src8, src9, src10, src11);
4040  src += src_stride;
4041  LD_UB4(src, 16, src12, src13, src14, src15);
4042  src += src_stride;
4043 
4044  ST_UB4(src0, src1, src2, src3, dst, 16);
4045  dst += dst_stride;
4046  ST_UB4(src4, src5, src6, src7, dst, 16);
4047  dst += dst_stride;
4048  ST_UB4(src8, src9, src10, src11, dst, 16);
4049  dst += dst_stride;
4050  ST_UB4(src12, src13, src14, src15, dst, 16);
4051  dst += dst_stride;
4052  }
4053 }
4054 
4055 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4056  uint8_t *dst, int32_t dst_stride,
4057  int32_t height)
4058 {
4059  uint32_t tp0, tp1, tp2, tp3;
4060  v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
4061 
4062  if (8 == height) {
4063  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4064  src += 4 * src_stride;
4065  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4066  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4067  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
4068  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4069  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4070  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4071  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
4072  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4073  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4074  } else if (4 == height) {
4075  LW4(src, src_stride, tp0, tp1, tp2, tp3);
4076  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
4077  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4078  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4079  dst0 = __msa_aver_u_b(src0, dst0);
4080  ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
4081  }
4082 }
4083 
4084 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4085  uint8_t *dst, int32_t dst_stride,
4086  int32_t height)
4087 {
4088  int32_t cnt;
4089  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4090  v16u8 src0, src1, src2, src3;
4091  v16u8 dst0, dst1, dst2, dst3;
4092 
4093  if (0 == (height % 8)) {
4094  for (cnt = (height >> 3); cnt--;) {
4095  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4096  src += 4 * src_stride;
4097  LD4(src, src_stride, tp4, tp5, tp6, tp7);
4098  src += 4 * src_stride;
4099  INSERT_D2_UB(tp0, tp1, src0);
4100  INSERT_D2_UB(tp2, tp3, src1);
4101  INSERT_D2_UB(tp4, tp5, src2);
4102  INSERT_D2_UB(tp6, tp7, src3);
4103  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4104  LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4105  INSERT_D2_UB(tp0, tp1, dst0);
4106  INSERT_D2_UB(tp2, tp3, dst1);
4107  INSERT_D2_UB(tp4, tp5, dst2);
4108  INSERT_D2_UB(tp6, tp7, dst3);
4109  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4110  dst1, dst2, dst3);
4111  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4112  dst += 8 * dst_stride;
4113  }
4114  } else if (4 == height) {
4115  LD4(src, src_stride, tp0, tp1, tp2, tp3);
4116  INSERT_D2_UB(tp0, tp1, src0);
4117  INSERT_D2_UB(tp2, tp3, src1);
4118  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4119  INSERT_D2_UB(tp0, tp1, dst0);
4120  INSERT_D2_UB(tp2, tp3, dst1);
4121  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4122  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
4123  }
4124 }
4125 
4126 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4127  uint8_t *dst, int32_t dst_stride,
4128  int32_t height)
4129 {
4130  int32_t cnt;
4131  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4132  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4133 
4134  if (0 == (height % 8)) {
4135  for (cnt = (height / 8); cnt--;) {
4136  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4137  src += (8 * src_stride);
4138  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4139 
4140  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4141  dst0, dst1, dst2, dst3);
4142  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4143  dst4, dst5, dst6, dst7);
4144  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4145  dst += (8 * dst_stride);
4146  }
4147  } else if (0 == (height % 4)) {
4148  for (cnt = (height / 4); cnt--;) {
4149  LD_UB4(src, src_stride, src0, src1, src2, src3);
4150  src += (4 * src_stride);
4151  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4152 
4153  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4154  dst0, dst1, dst2, dst3);
4155  ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4156  dst += (4 * dst_stride);
4157  }
4158  }
4159 }
4160 
4161 static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
4162  uint8_t *dst, int32_t dst_stride,
4163  int32_t height)
4164 {
4165  int32_t cnt;
4166  uint8_t *dst_dup = dst;
4167  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4168  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4169  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4170  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4171 
4172  if (0 == (height % 8)) {
4173  for (cnt = (height / 8); cnt--;) {
4174  LD_UB4(src, src_stride, src0, src2, src4, src6);
4175  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4176  src += (4 * src_stride);
4177  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4178  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4179  dst_dup += (4 * dst_stride);
4180  LD_UB4(src, src_stride, src8, src10, src12, src14);
4181  LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4182  src += (4 * src_stride);
4183  LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4184  LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4185  dst_dup += (4 * dst_stride);
4186 
4187  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4188  dst0, dst1, dst2, dst3);
4189  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4190  dst4, dst5, dst6, dst7);
4191  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4192  dst8, dst9, dst10, dst11);
4193  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4194  dst12, dst13, dst14, dst15);
4195 
4196  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4197  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4198  dst += (4 * dst_stride);
4199  ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4200  ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4201  dst += (4 * dst_stride);
4202  }
4203  } else if (0 == (height % 4)) {
4204  for (cnt = (height / 4); cnt--;) {
4205  LD_UB4(src, src_stride, src0, src2, src4, src6);
4206  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4207  src += (4 * src_stride);
4208  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4209  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4210  dst_dup += (4 * dst_stride);
4211 
4212  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4213  dst0, dst1, dst2, dst3);
4214  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4215  dst4, dst5, dst6, dst7);
4216 
4217  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4218  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4219  dst += (4 * dst_stride);
4220  }
4221  }
4222 }
4223 
4224 static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
4225  uint8_t *dst, int32_t dst_stride,
4226  int32_t height)
4227 {
4228  int32_t cnt;
4229  uint8_t *dst_dup = dst;
4230  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4231  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4232  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4233  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4234 
4235  for (cnt = (height / 4); cnt--;) {
4236  LD_UB4(src, 16, src0, src1, src2, src3);
4237  src += src_stride;
4238  LD_UB4(src, 16, src4, src5, src6, src7);
4239  src += src_stride;
4240  LD_UB4(src, 16, src8, src9, src10, src11);
4241  src += src_stride;
4242  LD_UB4(src, 16, src12, src13, src14, src15);
4243  src += src_stride;
4244 
4245  LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4246  dst_dup += dst_stride;
4247  LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4248  dst_dup += dst_stride;
4249  LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4250  dst_dup += dst_stride;
4251  LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4252  dst_dup += dst_stride;
4253 
4254  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4255  dst0, dst1, dst2, dst3);
4256  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4257  dst4, dst5, dst6, dst7);
4258  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4259  dst8, dst9, dst10, dst11);
4260  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4261  dst12, dst13, dst14, dst15);
4262 
4263  ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4264  dst += dst_stride;
4265  ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4266  dst += dst_stride;
4267  ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4268  dst += dst_stride;
4269  ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4270  dst += dst_stride;
4271  }
4272 }
4273 
/*
 * 8-tap sub-pixel interpolation coefficients, looked up by the wrappers that
 * VP9_8TAP_MIPS_MSA_FUNC generates.
 *
 * Indexing is [filter type][row][tap]:
 *  - The first index is FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP or
 *    FILTER_8TAP_SMOOTH.  Designated initializers are used, so each
 *    sub-table lands at the index given by its enum constant regardless of
 *    the textual order below.
 *  - The second index is the fractional position minus one: the wrappers
 *    read [mx - 1] / [my - 1], so the 15 rows serve positions 1..15 and
 *    there is no row for position 0.
 *  - The third index selects one of the 8 signed taps of that row.
 *
 * Every row sums to 128.
 * NOTE(review): presumably the filter kernels normalise the result with a
 * rounding shift by 7; that code is not part of this table - confirm.
 */
4274 static const int8_t vp9_subpel_filters_msa[3][15][8] = {
4275  [FILTER_8TAP_REGULAR] = {
4276  {0, 1, -5, 126, 8, -3, 1, 0},
4277  {-1, 3, -10, 122, 18, -6, 2, 0},
4278  {-1, 4, -13, 118, 27, -9, 3, -1},
4279  {-1, 4, -16, 112, 37, -11, 4, -1},
4280  {-1, 5, -18, 105, 48, -14, 4, -1},
4281  {-1, 5, -19, 97, 58, -16, 5, -1},
4282  {-1, 6, -19, 88, 68, -18, 5, -1},
4283  {-1, 6, -19, 78, 78, -19, 6, -1},
4284  {-1, 5, -18, 68, 88, -19, 6, -1},
4285  {-1, 5, -16, 58, 97, -19, 5, -1},
4286  {-1, 4, -14, 48, 105, -18, 5, -1},
4287  {-1, 4, -11, 37, 112, -16, 4, -1},
4288  {-1, 3, -9, 27, 118, -13, 4, -1},
4289  {0, 2, -6, 18, 122, -10, 3, -1},
4290  {0, 1, -3, 8, 126, -5, 1, 0},
4291  }, [FILTER_8TAP_SHARP] = {
4292  {-1, 3, -7, 127, 8, -3, 1, 0},
4293  {-2, 5, -13, 125, 17, -6, 3, -1},
4294  {-3, 7, -17, 121, 27, -10, 5, -2},
4295  {-4, 9, -20, 115, 37, -13, 6, -2},
4296  {-4, 10, -23, 108, 48, -16, 8, -3},
4297  {-4, 10, -24, 100, 59, -19, 9, -3},
4298  {-4, 11, -24, 90, 70, -21, 10, -4},
4299  {-4, 11, -23, 80, 80, -23, 11, -4},
4300  {-4, 10, -21, 70, 90, -24, 11, -4},
4301  {-3, 9, -19, 59, 100, -24, 10, -4},
4302  {-3, 8, -16, 48, 108, -23, 10, -4},
4303  {-2, 6, -13, 37, 115, -20, 9, -4},
4304  {-2, 5, -10, 27, 121, -17, 7, -3},
4305  {-1, 3, -6, 17, 125, -13, 5, -2},
4306  {0, 1, -3, 8, 127, -7, 3, -1},
4307  }, [FILTER_8TAP_SMOOTH] = {
4308  {-3, -1, 32, 64, 38, 1, -3, 0},
4309  {-2, -2, 29, 63, 41, 2, -3, 0},
4310  {-2, -2, 26, 63, 43, 4, -4, 0},
4311  {-2, -3, 24, 62, 46, 5, -4, 0},
4312  {-2, -3, 21, 60, 49, 7, -4, 0},
4313  {-1, -4, 18, 59, 51, 9, -4, 0},
4314  {-1, -4, 16, 57, 53, 12, -4, -1},
4315  {-1, -4, 14, 55, 55, 14, -4, -1},
4316  {-1, -4, 12, 53, 57, 16, -4, -1},
4317  {0, -4, 9, 51, 59, 18, -4, -1},
4318  {0, -4, 7, 49, 60, 21, -3, -2},
4319  {0, -4, 5, 46, 62, 24, -3, -2},
4320  {0, -4, 4, 43, 63, 26, -2, -2},
4321  {0, -3, 2, 41, 63, 29, -2, -2},
4322  {0, -3, 1, 38, 64, 32, -1, -3},
4323  }
4324 };
4325 
/*
 * Define the six 8-tap motion-compensation entry points for one block width
 * and one filter type:
 *
 *   ff_put_8tap_<type>_<SIZE>h_msa, ...v_msa, ...hv_msa
 *   ff_avg_8tap_<type>_<SIZE>h_msa, ...v_msa, ...hv_msa
 *
 * Macro parameters:
 *   SIZE      pasted into the public symbol and into the name of the static
 *             helper being called (for example common_hz_8t_<SIZE>w_msa).
 *   type      filter name fragment pasted into the public symbol only.
 *   type_idx  first index into vp9_subpel_filters_msa.
 *
 * Every generated function takes
 *   (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
 *    ptrdiff_t srcstride, int h, int mx, int my)
 * and does nothing but pick coefficient rows and forward to a helper.  The
 * helpers are called as (src, srcstride, dst, dststride, ...), so the
 * source and destination pairs are swapped relative to the wrapper's own
 * parameter order.
 *
 * Row selection:
 *   h  variants read row [mx - 1] and ignore my;
 *   v  variants read row [my - 1] and ignore mx;
 *   hv variants read row [mx - 1] for the horizontal filter and row
 *      [my - 1] for the vertical filter.
 * There is no range check: the value that is used must be in 1..15, since
 * anything else reads outside the 15-row sub-table.
 *
 * The put variants forward to common_hz_8t / common_vt_8t /
 * common_hv_8ht_8vt helpers; the avg variants forward to the corresponding
 * "and_aver_dst" helpers.
 * NOTE(review): presumably the avg helpers average the filtered result with
 * the pixels already in dst; their bodies are not shown here - confirm.
 * NOTE(review): the helper signatures listed for this file take int32_t
 * strides, so the ptrdiff_t strides are narrowed at the call - confirm that
 * is acceptable for all callers.
 */
4326 #define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \
4327 void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
4328  const uint8_t *src, \
4329  ptrdiff_t srcstride, \
4330  int h, int mx, int my) \
4331 { \
4332  const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4333  \
4334  common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \
4335 } \
4336  \
4337 void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
4338  const uint8_t *src, \
4339  ptrdiff_t srcstride, \
4340  int h, int mx, int my) \
4341 { \
4342  const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \
4343  \
4344  common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \
4345 } \
4346  \
4347 void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
4348  const uint8_t *src, \
4349  ptrdiff_t srcstride, \
4350  int h, int mx, int my) \
4351 { \
4352  const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4353  const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \
4354  \
4355  common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter, \
4356  vfilter, h); \
4357 } \
4358  \
4359 void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
4360  const uint8_t *src, \
4361  ptrdiff_t srcstride, \
4362  int h, int mx, int my) \
4363 { \
4364  const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4365  \
4366  common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
4367  dststride, filter, h); \
4368 } \
4369  \
4370 void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
4371  const uint8_t *src, \
4372  ptrdiff_t srcstride, \
4373  int h, int mx, int my) \
4374 { \
4375  const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \
4376  \
4377  common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride, \
4378  filter, h); \
4379 } \
4380  \
4381 void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
4382  const uint8_t *src, \
4383  ptrdiff_t srcstride, \
4384  int h, int mx, int my) \
4385 { \
4386  const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4387  const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \
4388  \
4389  common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
4390  dststride, hfilter, \
4391  vfilter, h); \
4392 }
4393 
/*
 * Define the two unfiltered entry points for one block width:
 *
 *   ff_copy<SIZE>_msa  forwards to copy_width<SIZE>_msa
 *   ff_avg<SIZE>_msa   forwards to avg_width<SIZE>_msa
 *
 * Both take the same seven parameters as the 8-tap wrappers; mx and my are
 * accepted but never read.  The helpers are called as
 * (src, srcstride, dst, dststride, h), so the source and destination pairs
 * are swapped relative to the wrapper's own parameter order.
 */
4394 #define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \
4395 void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4396  const uint8_t *src, ptrdiff_t srcstride, \
4397  int h, int mx, int my) \
4398 { \
4399  \
4400  copy_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4401 } \
4402  \
4403 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4404  const uint8_t *src, ptrdiff_t srcstride, \
4405  int h, int mx, int my) \
4406 { \
4407  \
4408  avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4409 }
4410 
/*
 * Define only ff_avg<SIZE>_msa, which forwards to avg_width<SIZE>_msa with
 * the arguments reordered to (src, srcstride, dst, dststride, h).  mx and my
 * are accepted but never read.
 *
 * This is the ff_avg half of VP9_COPY_AVG_MIPS_MSA_FUNC without the copy
 * function.
 * NOTE(review): presumably meant for widths that have an avg_width helper
 * but no copy_width helper; the invocations are not visible here - confirm.
 */
4411 #define VP9_AVG_MIPS_MSA_FUNC(SIZE) \
4412 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4413  const uint8_t *src, ptrdiff_t srcstride, \
4414  int h, int mx, int my) \
4415 { \
4416  \
4417  avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4418 }
4419 
4425 
4431 
4437 
4443 
4444 #undef VP9_8TAP_MIPS_MSA_FUNC
4445 #undef VP9_COPY_AVG_MIPS_MSA_FUNC
4446 #undef VP9_AVG_MIPS_MSA_FUNC
ff_avg_bilin_64h_msa
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3129
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
PCKEV_ST_SB
#define PCKEV_ST_SB(in0, in1, pdst)
Definition: generic_macros_msa.h:2799
ff_put_bilin_64v_msa
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2405
HORIZ_8TAP_8WID_4VECS_FILT
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3)
Definition: vp9_mc_msa.c:102
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
common_vt_2t_and_aver_dst_4x8_msa
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3207
copy_width8_msa
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3922
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
out
FILE * out
Definition: movenc.c:54
common_vt_2t_4x8_msa
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2143
ff_avg_bilin_8v_msa
void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3345
common_hz_2t_and_aver_dst_4x4_msa
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2818
PCKEV_B4_SB
#define PCKEV_B4_SB(...)
Definition: generic_macros_msa.h:1738
common_hz_2t_4x8_msa
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1818
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
common_hz_8t_and_aver_dst_4w_msa
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1066
common_vt_8t_and_aver_dst_16w_msa
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1518
src1
const pixel * src1
Definition: h264pred_template.c:421
common_hv_2ht_2vt_and_aver_dst_4x8_msa
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3625
HORIZ_2TAP_FILT_UH
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
Definition: generic_macros_msa.h:2809
SAT_SH4_SH
#define SAT_SH4_SH(...)
Definition: generic_macros_msa.h:1615
common_hz_8t_8x8mult_msa
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:267
VSHF_B2_UB
#define VSHF_B2_UB(...)
Definition: generic_macros_msa.h:661
common_hz_2t_and_aver_dst_8x8mult_msa
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2924
common_hv_8ht_8vt_and_aver_dst_4w_msa
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1548
VP9_COPY_AVG_MIPS_MSA_FUNC
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4394
AVER_UB2_UB
#define AVER_UB2_UB(...)
Definition: generic_macros_msa.h:595
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
DOTP_UB2_UH
#define DOTP_UB2_UH(...)
Definition: generic_macros_msa.h:740
ff_put_bilin_32hv_msa
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2790
common_vt_8t_and_aver_dst_4w_msa
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1301
SAT_SH2_SH
#define SAT_SH2_SH(...)
Definition: generic_macros_msa.h:1601
common_hz_8t_4w_msa
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:225
common_hz_8t_and_aver_dst_8w_msa
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1081
PCKEV_AVG_ST8x4_UB
#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride)
Definition: vp9_mc_msa.c:148
common_vt_8t_and_aver_dst_64w_msa
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1538
ADDS_SH4_SH
#define ADDS_SH4_SH(...)
Definition: generic_macros_msa.h:1906
ff_avg_bilin_4v_msa
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3246
ff_put_bilin_64h_msa
void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2074
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
common_vt_8t_and_aver_dst_8w_msa
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1360
common_hv_8ht_8vt_4w_msa
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:761
common_hv_8ht_8vt_64w_msa
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:969
common_hv_8ht_8vt_and_aver_dst_32w_msa
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1755
PCKEV_B4_UB
#define PCKEV_B4_UB(...)
Definition: generic_macros_msa.h:1739
common_hz_2t_and_aver_dst_8x4_msa
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2896
LD_UB5
#define LD_UB5(...)
Definition: generic_macros_msa.h:307
filter
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one per link. The additional frames can be left buffered in the filter.
Definition: filter_design.txt:228
common_hz_2t_4x4_msa
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1795
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
AVER_UB4_UB
#define AVER_UB4_UB(...)
Definition: generic_macros_msa.h:603
VP9_AVG_MIPS_MSA_FUNC
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4411
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
HORIZ_8TAP_FILT
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, filt_h1, filt_h2, filt_h3)
Definition: vp9_mc_msa.c:66
SRARI_H4_UH
#define SRARI_H4_UH(...)
Definition: generic_macros_msa.h:2066
common_hv_2ht_2vt_and_aver_dst_8x8mult_msa
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:3743
ff_avg_bilin_4hv_msa
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3677
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
vp9_subpel_filters_msa
static const int8_t vp9_subpel_filters_msa[3][15][8]
Definition: vp9_mc_msa.c:4274
common_vt_2t_8x4_msa
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2190
common_vt_8t_64w_msa
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:753
ff_put_bilin_64hv_msa
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2804
generic_macros_msa.h
ff_put_bilin_8hv_msa
void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2703
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
ff_put_bilin_8v_msa
void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2259
VP9_8TAP_MIPS_MSA_FUNC
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)
Definition: vp9_mc_msa.c:4326
common_hz_2t_8x8mult_msa
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1884
common_hv_8ht_8vt_32w_msa
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:952
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
FILTER_8TAP_SHARP
@ FILTER_8TAP_SHARP
Definition: vp9.h:67
SLDI_B3_SB
#define SLDI_B3_SB(...)
Definition: generic_macros_msa.h:634
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
ff_put_bilin_4hv_msa
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2565
ILVEV_B2_SH
#define ILVEV_B2_SH(...)
Definition: generic_macros_msa.h:1190
ILVEV_B2_UB
#define ILVEV_B2_UB(...)
Definition: generic_macros_msa.h:1188
ff_put_bilin_4h_msa
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1847
common_hv_8ht_8vt_and_aver_dst_8w_msa
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1631
mask
static const uint16_t mask[17]
Definition: lzw.c:38
common_vt_8t_4w_msa
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:470
ILVR_B4_UB
#define ILVR_B4_UB(...)
Definition: generic_macros_msa.h:1359
width
#define width
ILVL_B2_UB
#define ILVL_B2_UB(...)
Definition: generic_macros_msa.h:1262
SAT_UH2_UH
#define SAT_UH2_UH(...)
Definition: generic_macros_msa.h:1567
DOTP_UB4_UH
#define DOTP_UB4_UH(...)
Definition: generic_macros_msa.h:749
common_vt_2t_and_aver_dst_4x4_msa
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3171
VSHF_B2_UH
#define VSHF_B2_UH(...)
Definition: generic_macros_msa.h:663
ff_avg_bilin_4h_msa
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2881
ff_put_bilin_32h_msa
void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2027
common_hz_8t_and_aver_dst_16w_msa
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1122
common_vt_2t_4x4_msa
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2117
avg_width16_msa
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4126
ff_avg_bilin_32v_msa
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3414
vp9dsp_mips.h
ST_D8
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:511
ff_avg_bilin_16v_msa
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3360
XORI_B2_128_UB
#define XORI_B2_128_UB(...)
Definition: generic_macros_msa.h:1834
LD_UH
#define LD_UH(...)
Definition: generic_macros_msa.h:34
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
common_hv_2ht_2vt_8x8mult_msa
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:2624
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
ff_avg_bilin_8hv_msa
void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3805
common_vt_2t_and_aver_dst_8x8mult_msa
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:3290
ff_avg_bilin_16hv_msa
void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3823
common_hv_2ht_2vt_4x4_msa
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2486
ILVR_D4_UB
#define ILVR_D4_UB(...)
Definition: generic_macros_msa.h:1461
ff_avg_bilin_16h_msa
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3011
common_hz_8t_and_aver_dst_4x8_msa
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1021
common_hz_8t_4x8_msa
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:188
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
ILVR_D4_SB
#define ILVR_D4_SB(...)
Definition: generic_macros_msa.h:1460
vp9_bilinear_filters_msa
static const int8_t vp9_bilinear_filters_msa[15][2]
Definition: vp9_mc_msa.c:34
ff_avg_bilin_32h_msa
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3079
vp9dsp.h
common_hz_8t_64w_msa
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:413
PCKEV_XORI128_UB
#define PCKEV_XORI128_UB(in0, in1)
Definition: generic_macros_msa.h:2751
common_hz_8t_4x4_msa
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:159
ff_put_bilin_32v_msa
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2327
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
FILTER_8TAP_REGULAR
@ FILTER_8TAP_REGULAR
Definition: vp9.h:66
mc_filt_mask_arr
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp9_mc_msa.c:25
ff_put_bilin_8h_msa
void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1946
ILVR_D2_UB
#define ILVR_D2_UB(...)
Definition: generic_macros_msa.h:1443
common_hv_8ht_8vt_and_aver_dst_64w_msa
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1775
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
ff_put_bilin_4v_msa
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2177
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
SLDI_B2_SH
#define SLDI_B2_SH(...)
Definition: generic_macros_msa.h:624
common_hv_8ht_8vt_8w_msa
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:837
copy_width64_msa
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4026
height
#define height
common_hz_8t_32w_msa
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:354
common_hz_8t_16w_msa
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:315
common_hz_2t_8x4_msa
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1860
common_vt_8t_and_aver_dst_16w_mult_msa
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:1421
ff_avg_bilin_64v_msa
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3494
ff_avg_bilin_32hv_msa
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3894
LW4
#define LW4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:202
common_vt_8t_and_aver_dst_32w_msa
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1528
common_hz_8t_and_aver_dst_32w_msa
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1180
SRARI_H2_UH
#define SRARI_H2_UH(...)
Definition: generic_macros_msa.h:2058
CONVERT_UB_AVG_ST8x4_UB
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride)
Definition: generic_macros_msa.h:2763
common_hv_2ht_2vt_8x4_msa
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2581
XORI_B4_128_UB
#define XORI_B4_128_UB(...)
Definition: generic_macros_msa.h:1850
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_avg_bilin_8h_msa
void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2996
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
common_hv_8ht_8vt_16w_msa
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:935
FILTER_8TAP_SMOOTH
@ FILTER_8TAP_SMOOTH
Definition: vp9.h:65
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
common_vt_8t_32w_msa
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:745
copy_width32_msa
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3996
filt
static const int8_t filt[NUMTAPS *2]
Definition: af_earwax.c:39
SPLATI_H4_SB
#define SPLATI_H4_SB(...)
Definition: generic_macros_msa.h:1673
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
INSERT_W4_UB
#define INSERT_W4_UB(...)
Definition: generic_macros_msa.h:1153
avg_width32_msa
static void avg_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4161
common_hz_8t_and_aver_dst_64w_msa
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1239
common_hv_2ht_2vt_4x8_msa
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2519
ILVR_B2_UB
#define ILVR_B2_UB(...)
Definition: generic_macros_msa.h:1337
ff_put_bilin_16h_msa
void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1960
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:277
common_hv_2ht_2vt_and_aver_dst_4x4_msa
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3581
copy_width16_msa
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3952
ff_put_bilin_16v_msa
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2273
common_vt_2t_8x8mult_msa
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2214
common_hz_8t_8x4_msa
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:236
ff_put_bilin_16hv_msa
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2719
PCKEV_XORI128_AVG_ST_UB
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:130
common_vt_8t_16w_msa
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:577
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
src0
const pixel *const src0
Definition: h264pred_template.c:420
PCKEV_AVG_ST_UB
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:139
HORIZ_8TAP_4WID_4VECS_FILT
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1)
Definition: vp9_mc_msa.c:83
common_vt_8t_16w_mult_msa
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:656
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
smooth
static float smooth(DeshakeOpenCLContext *deshake_ctx, float *gauss_kernel, int length, float max_val, AVFifo *values)
Definition: vf_deshake_opencl.c:888
avg_width64_msa
static void avg_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4224
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
common_hz_2t_and_aver_dst_4x8_msa
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2847
common_hv_2ht_2vt_and_aver_dst_8x4_msa
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3693
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
int32_t
int32_t
Definition: audioconvert.c:56
common_hv_8ht_8vt_and_aver_dst_16w_msa
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1735
avg_width8_msa
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4084
SLDI_B3_UH
#define SLDI_B3_UH(...)
Definition: generic_macros_msa.h:635
VSHF_B4_SH
#define VSHF_B4_SH(...)
Definition: generic_macros_msa.h:681
avg_width4_msa
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4055
INSERT_D2_UB
#define INSERT_D2_UB(...)
Definition: generic_macros_msa.h:1169
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
common_vt_2t_and_aver_dst_8x4_msa
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3261
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
ff_avg_bilin_64hv_msa
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3908
common_vt_8t_8w_msa
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:522
common_hz_8t_and_aver_dst_4x4_msa
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:986
FILT_8TAP_DPADD_S_H
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3)
Definition: vp9_mc_msa.c:52
common_hz_8t_8w_msa
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:303
SAT_UH4_UH
#define SAT_UH4_UH(...)
Definition: generic_macros_msa.h:1575