FFmpeg
vp8_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp8dsp.h"
23 #include "vp8dsp_mips.h"
24 
/*
 * Byte-shuffle index patterns used by the horizontal filters in this file.
 * The array holds three consecutive 16-entry rows; callers load one row
 * (offset 0 or 16 in the code visible here) and derive the patterns for the
 * following tap pairs by adding 2 and 4 to every entry.
 * NOTE(review): indices >= 16 presumably select bytes from the second
 * shuffle operand -- confirm against the VSHF macro definitions.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* offset 0: 8-pixel-wide blocks */
     0,  1,  1,  2,  2,  3,  3,  4,
     4,  5,  5,  6,  6,  7,  7,  8,

    /* offset 16: 4-pixel-wide blocks */
     0,  1,  1,  2,  2,  3,  3,  4,
    16, 17, 17, 18, 18, 19, 19, 20,

    /* offset 32: 4-pixel-wide blocks */
     8,  9,  9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28
};
33 
/*
 * Sub-pixel interpolation taps, indexed by (fractional position - 1).
 * Even rows hold 4 taps, odd rows hold 6 taps; each row is zero-padded to
 * 8 bytes and every row sums to 128 (results are later shifted right by 7).
 *
 * The users in this file fetch a row with a full vector load into a v8i16
 * (16 bytes) even though only the first 4 or 6 bytes are consumed. With
 * exactly seven 8-byte rows, the load of the last row would run 8 bytes
 * past the end of the array. The array is therefore declared with one
 * extra, implicitly zero-initialised row so that every 16-byte row load
 * stays inside the object. Rows 0..6 are unchanged.
 */
static const int8_t subpel_filters_msa[7 + 1][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
    /* row 7: padding only (all zero), never selected as a filter */
};
43 
/*
 * Two-tap (bilinear) weights. Each pair sums to 128 and the first weight
 * drops by 16 per row while the second rises by 16.
 * NOTE(review): the code that reads this table is outside the visible part
 * of the file; if it uses full-vector loads on a 2-byte row, the same
 * over-read concern as for the sub-pel table applies -- confirm.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    [0] = { 112,  16 },
    [1] = {  96,  32 },
    [2] = {  80,  48 },
    [3] = {  64,  64 },
    [4] = {  48,  80 },
    [5] = {  32,  96 },
    [6] = {  16, 112 }
};
53 
/*
 * Horizontal 6-tap filter over the vector pair (src0, src1).
 *
 * The pair is shuffled three times (once per mask) to line up the samples
 * for each pair of taps, the three shuffled vectors are multiplied by
 * filt_h0..filt_h2 and accumulated into 16-bit lanes, and the sum is then
 * rounded/shifted right by 7 and saturated with __msa_sat_s_h(.., 7).
 * Evaluates to the resulting v8i16.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                      \
                        filt_h0, filt_h1, filt_h2)                            \
( {                                                                           \
    v16i8 h6t_tap01_m, h6t_tap23_m, h6t_tap45_m;                              \
    v8i16 h6t_sum_m;                                                          \
                                                                              \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,       \
               h6t_tap01_m, h6t_tap23_m, h6t_tap45_m);                        \
    h6t_sum_m = DPADD_SH3_SH(h6t_tap01_m, h6t_tap23_m, h6t_tap45_m,           \
                             filt_h0, filt_h1, filt_h2);                      \
                                                                              \
    h6t_sum_m = __msa_sat_s_h(__msa_srari_h(h6t_sum_m, 7), 7);                \
                                                                              \
    h6t_sum_m;                                                                \
} )
70 
/*
 * Horizontal 6-tap multiply-accumulate for four 4-pixel-wide rows.
 *
 * Rows are processed as the pairs (src0, src1) and (src2, src3); each pair
 * is shuffled with mask0/mask1/mask2 and the results are multiplied by
 * filt0/filt1/filt2. out0 and out1 receive the raw 16-bit sums; rounding
 * and saturation are left to the caller.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2,                       \
                                   filt0, filt1, filt2,                       \
                                   out0, out1)                                \
{                                                                             \
    v16i8 h6w4_a01_m, h6w4_a23_m;   /* first tap pair  */                     \
    v16i8 h6w4_b01_m, h6w4_b23_m;   /* second tap pair */                     \
    v16i8 h6w4_c01_m, h6w4_c23_m;   /* third tap pair  */                     \
                                                                              \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0,                          \
               h6w4_a01_m, h6w4_a23_m);                                       \
    DOTP_SB2_SH(h6w4_a01_m, h6w4_a23_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1,                          \
               h6w4_b01_m, h6w4_b23_m);                                       \
    DPADD_SB2_SH(h6w4_b01_m, h6w4_b23_m, filt1, filt1, out0, out1);           \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2,                          \
               h6w4_c01_m, h6w4_c23_m);                                       \
    DPADD_SB2_SH(h6w4_c01_m, h6w4_c23_m, filt2, filt2, out0, out1);           \
}
85 
/*
 * Horizontal 6-tap multiply-accumulate for four 8-pixel-wide vectors.
 *
 * Each of src0..src3 is shuffled against itself with mask0, mask1 and
 * mask2; the shuffled vectors are multiplied by filt0, filt1 and filt2
 * respectively and summed into out0..out3 (one output per input). The sums
 * are left unrounded for the caller.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2,                       \
                                   filt0, filt1, filt2,                       \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 h6w8_a0_m, h6w8_a1_m, h6w8_a2_m, h6w8_a3_m;                         \
    v16i8 h6w8_b0_m, h6w8_b1_m, h6w8_b2_m, h6w8_b3_m;                         \
    v16i8 h6w8_c0_m, h6w8_c1_m, h6w8_c2_m, h6w8_c3_m;                         \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, h6w8_a0_m, h6w8_a1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, h6w8_a2_m, h6w8_a3_m);   \
    DOTP_SB4_SH(h6w8_a0_m, h6w8_a1_m, h6w8_a2_m, h6w8_a3_m,                   \
                filt0, filt0, filt0, filt0,                                   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, h6w8_b0_m, h6w8_b1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, h6w8_b2_m, h6w8_b3_m);   \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, h6w8_c0_m, h6w8_c1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, h6w8_c2_m, h6w8_c3_m);   \
    DPADD_SB4_SH(h6w8_b0_m, h6w8_b1_m, h6w8_b2_m, h6w8_b3_m,                  \
                 filt1, filt1, filt1, filt1,                                  \
                 out0, out1, out2, out3);                                     \
    DPADD_SB4_SH(h6w8_c0_m, h6w8_c1_m, h6w8_c2_m, h6w8_c3_m,                  \
                 filt2, filt2, filt2, filt2,                                  \
                 out0, out1, out2, out3);                                     \
}
106 
/*
 * 4-tap multiply-accumulate: evaluates to
 *     dotp_s_h(vec0, filt0) + dpadd_s_h(vec1, filt1)
 * as a v8i16, without rounding or saturation.
 *
 * The internal temporary carries the `_m` suffix used by the other macros
 * in this file, so that a caller-side variable called `tmp0` (a name that
 * functions in this file do use) can be passed as an argument without
 * being captured by the macro's own declaration. Arguments are
 * parenthesised before casting so that expression arguments are cast as a
 * whole.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                         \
( {                                                                           \
    v8i16 dp4_acc_m;                                                          \
                                                                              \
    dp4_acc_m = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0));              \
    dp4_acc_m = __msa_dpadd_s_h(dp4_acc_m, (v16i8) (vec1), (v16i8) (filt1));  \
                                                                              \
    dp4_acc_m;                                                                \
} )
116 
/*
 * Horizontal 4-tap filter over the vector pair (src0, src1).
 *
 * The pair is shuffled with mask0 and mask1, fed through
 * FILT_4TAP_DPADD_S_H with filt_h0/filt_h1, then rounded/shifted right by 7
 * and saturated with __msa_sat_s_h(.., 7). Evaluates to the v8i16 result.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)           \
( {                                                                           \
    v16i8 h4t_tap01_m, h4t_tap23_m;                                           \
    v8i16 h4t_sum_m;                                                          \
                                                                              \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1,                          \
               h4t_tap01_m, h4t_tap23_m);                                     \
    h4t_sum_m = FILT_4TAP_DPADD_S_H(h4t_tap01_m, h4t_tap23_m,                 \
                                    filt_h0, filt_h1);                        \
                                                                              \
    h4t_sum_m = __msa_sat_s_h(__msa_srari_h(h4t_sum_m, 7), 7);                \
                                                                              \
    h4t_sum_m;                                                                \
} )
130 
/*
 * Horizontal 4-tap multiply-accumulate for four 4-pixel-wide rows.
 *
 * Rows are handled as the pairs (src0, src1) and (src2, src3): each pair is
 * shuffled with mask0 and mask1 and multiplied by filt0 and filt1. out0 and
 * out1 receive the raw 16-bit sums (no rounding or saturation).
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1)                                \
{                                                                             \
    v16i8 h4w4_a01_m, h4w4_a23_m;   /* first tap pair  */                     \
    v16i8 h4w4_b01_m, h4w4_b23_m;   /* second tap pair */                     \
                                                                              \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0,                          \
               h4w4_a01_m, h4w4_a23_m);                                       \
    DOTP_SB2_SH(h4w4_a01_m, h4w4_a23_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1,                          \
               h4w4_b01_m, h4w4_b23_m);                                       \
    DPADD_SB2_SH(h4w4_b01_m, h4w4_b23_m, filt1, filt1, out0, out1);           \
}
142 
/*
 * Horizontal 4-tap multiply-accumulate for four 8-pixel-wide vectors.
 *
 * Each of src0..src3 is shuffled against itself with mask0 and mask1; the
 * shuffled vectors are multiplied by filt0 and filt1 and summed into
 * out0..out3 (one output per input). Sums are left unrounded.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 h4w8_a0_m, h4w8_a1_m, h4w8_a2_m, h4w8_a3_m;                         \
    v16i8 h4w8_b0_m, h4w8_b1_m, h4w8_b2_m, h4w8_b3_m;                         \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, h4w8_a0_m, h4w8_a1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, h4w8_a2_m, h4w8_a3_m);   \
    DOTP_SB4_SH(h4w8_a0_m, h4w8_a1_m, h4w8_a2_m, h4w8_a3_m,                   \
                filt0, filt0, filt0, filt0,                                   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, h4w8_b0_m, h4w8_b1_m);   \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, h4w8_b2_m, h4w8_b3_m);   \
    DPADD_SB4_SH(h4w8_b0_m, h4w8_b1_m, h4w8_b2_m, h4w8_b3_m,                  \
                 filt1, filt1, filt1, filt1,                                  \
                 out0, out1, out2, out3);                                     \
}
158 
159 static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164  v16u8 mask0, mask1, mask2, out;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 2;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176 
177  LD_SB4(src, src_stride, src0, src1, src2, src3);
178  XORI_B4_128_SB(src0, src1, src2, src3);
179  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180  filt0, filt1, filt2, out0, out1);
181  SRARI_H2_SH(out0, out1, 7);
182  SAT_SH2_SH(out0, out1, 7);
183  out = PCKEV_XORI128_UB(out0, out1);
184  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185 }
186 
187 static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188  uint8_t *dst, int32_t dst_stride,
189  const int8_t *filter)
190 {
191  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192  v16u8 mask0, mask1, mask2, out;
193  v8i16 filt, out0, out1, out2, out3;
194 
195  mask0 = LD_UB(&mc_filt_mask_arr[16]);
196  src -= 2;
197 
198  /* rearranging filter */
199  filt = LD_SH(filter);
200  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201 
202  mask1 = mask0 + 2;
203  mask2 = mask0 + 4;
204 
205  LD_SB4(src, src_stride, src0, src1, src2, src3);
206  XORI_B4_128_SB(src0, src1, src2, src3);
207  src += (4 * src_stride);
208  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209  filt0, filt1, filt2, out0, out1);
210  LD_SB4(src, src_stride, src0, src1, src2, src3);
211  XORI_B4_128_SB(src0, src1, src2, src3);
212  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213  filt0, filt1, filt2, out2, out3);
214  SRARI_H4_SH(out0, out1, out2, out3, 7);
215  SAT_SH4_SH(out0, out1, out2, out3, 7);
216  out = PCKEV_XORI128_UB(out0, out1);
217  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218  out = PCKEV_XORI128_UB(out2, out3);
219  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220 }
221 
222 void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223  uint8_t *src, ptrdiff_t src_stride,
224  int height, int mx, int my)
225 {
226  const int8_t *filter = subpel_filters_msa[mx - 1];
227 
228  if (4 == height) {
229  common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230  } else if (8 == height) {
231  common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232  }
233 }
234 
235 void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236  uint8_t *src, ptrdiff_t src_stride,
237  int height, int mx, int my)
238 {
239  uint32_t loop_cnt;
240  const int8_t *filter = subpel_filters_msa[mx - 1];
241  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242  v16u8 mask0, mask1, mask2, tmp0, tmp1;
243  v8i16 filt, out0, out1, out2, out3;
244 
245  mask0 = LD_UB(&mc_filt_mask_arr[0]);
246 
247  src -= 2;
248 
249  /* rearranging filter */
250  filt = LD_SH(filter);
251  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252 
253  mask1 = mask0 + 2;
254  mask2 = mask0 + 4;
255 
256  LD_SB4(src, src_stride, src0, src1, src2, src3);
257  XORI_B4_128_SB(src0, src1, src2, src3);
258  src += (4 * src_stride);
259  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260  filt0, filt1, filt2, out0, out1, out2, out3);
261  SRARI_H4_SH(out0, out1, out2, out3, 7);
262  SAT_SH4_SH(out0, out1, out2, out3, 7);
263  tmp0 = PCKEV_XORI128_UB(out0, out1);
264  tmp1 = PCKEV_XORI128_UB(out2, out3);
265  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266  dst += (4 * dst_stride);
267 
268  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269  LD_SB4(src, src_stride, src0, src1, src2, src3);
270  XORI_B4_128_SB(src0, src1, src2, src3);
271  src += (4 * src_stride);
272  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273  filt0, filt1, filt2, out0, out1, out2, out3);
274  SRARI_H4_SH(out0, out1, out2, out3, 7);
275  SAT_SH4_SH(out0, out1, out2, out3, 7);
276  tmp0 = PCKEV_XORI128_UB(out0, out1);
277  tmp1 = PCKEV_XORI128_UB(out2, out3);
278  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279  dst += (4 * dst_stride);
280  }
281 }
282 
283 void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284  uint8_t *src, ptrdiff_t src_stride,
285  int height, int mx, int my)
286 {
287  uint32_t loop_cnt;
288  const int8_t *filter = subpel_filters_msa[mx - 1];
289  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290  v16u8 mask0, mask1, mask2, out;
291  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292 
293  mask0 = LD_UB(&mc_filt_mask_arr[0]);
294  src -= 2;
295 
296  /* rearranging filter */
297  filt = LD_SH(filter);
298  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299 
300  mask1 = mask0 + 2;
301  mask2 = mask0 + 4;
302 
303  for (loop_cnt = (height >> 2); loop_cnt--;) {
304  LD_SB4(src, src_stride, src0, src2, src4, src6);
305  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307  src += (4 * src_stride);
308 
309  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310  filt0, filt1, filt2, out0, out1, out2, out3);
311  HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312  filt0, filt1, filt2, out4, out5, out6, out7);
313  SRARI_H4_SH(out0, out1, out2, out3, 7);
314  SRARI_H4_SH(out4, out5, out6, out7, 7);
315  SAT_SH4_SH(out0, out1, out2, out3, 7);
316  SAT_SH4_SH(out4, out5, out6, out7, 7);
317  out = PCKEV_XORI128_UB(out0, out1);
318  ST_UB(out, dst);
319  dst += dst_stride;
320  out = PCKEV_XORI128_UB(out2, out3);
321  ST_UB(out, dst);
322  dst += dst_stride;
323  out = PCKEV_XORI128_UB(out4, out5);
324  ST_UB(out, dst);
325  dst += dst_stride;
326  out = PCKEV_XORI128_UB(out6, out7);
327  ST_UB(out, dst);
328  dst += dst_stride;
329  }
330 }
331 
332 void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333  uint8_t *src, ptrdiff_t src_stride,
334  int height, int mx, int my)
335 {
336  uint32_t loop_cnt;
337  const int8_t *filter = subpel_filters_msa[my - 1];
338  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341  v16u8 out;
342  v8i16 filt, out10, out32;
343 
344  src -= (2 * src_stride);
345 
346  filt = LD_SH(filter);
347  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348 
349  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350  src += (5 * src_stride);
351 
352  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353  src32_r, src43_r);
354  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355  XORI_B2_128_SB(src2110, src4332);
356 
357  for (loop_cnt = (height >> 2); loop_cnt--;) {
358  LD_SB4(src, src_stride, src5, src6, src7, src8);
359  src += (4 * src_stride);
360 
361  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362  src65_r, src76_r, src87_r);
363  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364  XORI_B2_128_SB(src6554, src8776);
365  out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366  out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367  SRARI_H2_SH(out10, out32, 7);
368  SAT_SH2_SH(out10, out32, 7);
369  out = PCKEV_XORI128_UB(out10, out32);
370  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371  dst += (4 * dst_stride);
372 
373  src2110 = src6554;
374  src4332 = src8776;
375  src4 = src8;
376  }
377 }
378 
379 void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380  uint8_t *src, ptrdiff_t src_stride,
381  int height, int mx, int my)
382 {
383  uint32_t loop_cnt;
384  const int8_t *filter = subpel_filters_msa[my - 1];
385  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387  v16i8 src109_r, filt0, filt1, filt2;
388  v16u8 tmp0, tmp1;
389  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390 
391  src -= (2 * src_stride);
392 
393  filt = LD_SH(filter);
394  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395 
396  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397  src += (5 * src_stride);
398 
399  XORI_B5_128_SB(src0, src1, src2, src3, src4);
400  ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401  src10_r, src32_r, src21_r, src43_r);
402 
403  for (loop_cnt = (height >> 2); loop_cnt--;) {
404  LD_SB4(src, src_stride, src7, src8, src9, src10);
405  XORI_B4_128_SB(src7, src8, src9, src10);
406  src += (4 * src_stride);
407 
408  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409  src87_r, src98_r, src109_r);
410  out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411  out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412  out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413  out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419  dst += (4 * dst_stride);
420 
421  src10_r = src76_r;
422  src32_r = src98_r;
423  src21_r = src87_r;
424  src43_r = src109_r;
425  src4 = src10;
426  }
427 }
428 
429 void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430  uint8_t *src, ptrdiff_t src_stride,
431  int height, int mx, int my)
432 {
433  uint32_t loop_cnt;
434  const int8_t *filter = subpel_filters_msa[my - 1];
435  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438  v16i8 src65_l, src87_l, filt0, filt1, filt2;
439  v16u8 tmp0, tmp1, tmp2, tmp3;
440  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441 
442  src -= (2 * src_stride);
443 
444  filt = LD_SH(filter);
445  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446 
447  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448  src += (5 * src_stride);
449 
450  XORI_B5_128_SB(src0, src1, src2, src3, src4);
451  ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452  src32_r, src43_r, src21_r);
453  ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454  src32_l, src43_l, src21_l);
455 
456  for (loop_cnt = (height >> 2); loop_cnt--;) {
457  LD_SB4(src, src_stride, src5, src6, src7, src8);
458  src += (4 * src_stride);
459 
460  XORI_B4_128_SB(src5, src6, src7, src8);
461  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462  src65_r, src76_r, src87_r);
463  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464  src65_l, src76_l, src87_l);
465  out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466  filt2);
467  out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468  filt2);
469  out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470  filt2);
471  out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472  filt2);
473  out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474  filt2);
475  out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476  filt2);
477  out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478  filt2);
479  out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480  filt2);
481  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486  out3_r, tmp0, tmp1, tmp2, tmp3);
487  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489  dst += (4 * dst_stride);
490 
491  src10_r = src54_r;
492  src32_r = src76_r;
493  src21_r = src65_r;
494  src43_r = src87_r;
495  src10_l = src54_l;
496  src32_l = src76_l;
497  src21_l = src65_l;
498  src43_l = src87_l;
499  src4 = src8;
500  }
501 }
502 
503 void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504  uint8_t *src, ptrdiff_t src_stride,
505  int height, int mx, int my)
506 {
507  uint32_t loop_cnt;
508  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509  const int8_t *filter_vert = subpel_filters_msa[my - 1];
510  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511  v16i8 filt_hz0, filt_hz1, filt_hz2;
512  v16u8 mask0, mask1, mask2, out;
513  v8i16 tmp0, tmp1;
514  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516 
517  mask0 = LD_UB(&mc_filt_mask_arr[16]);
518  src -= (2 + 2 * src_stride);
519 
520  /* rearranging filter */
521  filt = LD_SH(filter_horiz);
522  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523 
524  filt = LD_SH(filter_vert);
525  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526 
527  mask1 = mask0 + 2;
528  mask2 = mask0 + 4;
529 
530  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531  src += (5 * src_stride);
532 
533  XORI_B5_128_SB(src0, src1, src2, src3, src4);
534  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535  filt_hz1, filt_hz2);
536  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537  filt_hz1, filt_hz2);
538  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540  filt_hz1, filt_hz2);
541  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542 
543  for (loop_cnt = (height >> 2); loop_cnt--;) {
544  LD_SB2(src, src_stride, src5, src6);
545  src += (2 * src_stride);
546 
547  XORI_B2_128_SB(src5, src6);
548  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549  filt_hz1, filt_hz2);
550  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551 
552  LD_SB2(src, src_stride, src7, src8);
553  src += (2 * src_stride);
554 
555  XORI_B2_128_SB(src7, src8);
556  hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557  filt_hz1, filt_hz2);
558  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559 
560  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562 
563  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565 
566  SRARI_H2_SH(tmp0, tmp1, 7);
567  SAT_SH2_SH(tmp0, tmp1, 7);
568  out = PCKEV_XORI128_UB(tmp0, tmp1);
569  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570  dst += (4 * dst_stride);
571 
572  hz_out3 = hz_out7;
573  out0 = out2;
574  out1 = out3;
575  }
576 }
577 
578 void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
579  uint8_t *src, ptrdiff_t src_stride,
580  int height, int mx, int my)
581 {
582  uint32_t loop_cnt;
583  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
584  const int8_t *filter_vert = subpel_filters_msa[my - 1];
585  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
586  v16i8 filt_hz0, filt_hz1, filt_hz2;
587  v16u8 mask0, mask1, mask2, vec0, vec1;
588  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
589  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591  v8i16 tmp0, tmp1, tmp2, tmp3;
592 
593  mask0 = LD_UB(&mc_filt_mask_arr[0]);
594  src -= (2 + 2 * src_stride);
595 
596  /* rearranging filter */
597  filt = LD_SH(filter_horiz);
598  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
599 
600  mask1 = mask0 + 2;
601  mask2 = mask0 + 4;
602 
603  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604  src += (5 * src_stride);
605 
606  XORI_B5_128_SB(src0, src1, src2, src3, src4);
607  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
608  filt_hz1, filt_hz2);
609  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
610  filt_hz1, filt_hz2);
611  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
612  filt_hz1, filt_hz2);
613  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
614  filt_hz1, filt_hz2);
615  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
616  filt_hz1, filt_hz2);
617 
618  filt = LD_SH(filter_vert);
619  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
620 
621  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
623 
624  for (loop_cnt = (height >> 2); loop_cnt--;) {
625  LD_SB4(src, src_stride, src5, src6, src7, src8);
626  src += (4 * src_stride);
627 
628  XORI_B4_128_SB(src5, src6, src7, src8);
629  hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
630  filt_hz1, filt_hz2);
631  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
633 
634  hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
635  filt_hz1, filt_hz2);
636  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
638 
639  hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
640  filt_hz1, filt_hz2);
641  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642  tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
643 
644  hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
645  filt_hz1, filt_hz2);
646  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647  tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
648 
649  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
650  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
652  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
653  ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654  dst += (4 * dst_stride);
655 
656  hz_out4 = hz_out8;
657  out0 = out2;
658  out1 = out7;
659  out3 = out5;
660  out4 = out6;
661  }
662 }
663 
664 
/*
 * 16-pixel-wide 2-D 6-tap prediction, done as two independent 8-pixel-wide
 * passes: columns 0..7 first, then columns 8..15. All other arguments are
 * forwarded unchanged to ff_put_vp8_epel8_h6v6_msa.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left half */
    ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride,
                              height, mx, my);

    /* right half */
    ff_put_vp8_epel8_h6v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
679 
680 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
681  uint8_t *dst, int32_t dst_stride,
682  const int8_t *filter)
683 {
684  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685  v8i16 filt, out0, out1;
686  v16u8 out;
687 
688  mask0 = LD_SB(&mc_filt_mask_arr[16]);
689  src -= 1;
690 
691  /* rearranging filter */
692  filt = LD_SH(filter);
693  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694 
695  mask1 = mask0 + 2;
696 
697  LD_SB4(src, src_stride, src0, src1, src2, src3);
698  XORI_B4_128_SB(src0, src1, src2, src3);
699  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700  filt0, filt1, out0, out1);
701  SRARI_H2_SH(out0, out1, 7);
702  SAT_SH2_SH(out0, out1, 7);
703  out = PCKEV_XORI128_UB(out0, out1);
704  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705 }
706 
707 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
708  uint8_t *dst, int32_t dst_stride,
709  const int8_t *filter)
710 {
711  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712  v16u8 out;
713  v8i16 filt, out0, out1, out2, out3;
714 
715  mask0 = LD_SB(&mc_filt_mask_arr[16]);
716  src -= 1;
717 
718  /* rearranging filter */
719  filt = LD_SH(filter);
720  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721 
722  mask1 = mask0 + 2;
723 
724  LD_SB4(src, src_stride, src0, src1, src2, src3);
725  src += (4 * src_stride);
726 
727  XORI_B4_128_SB(src0, src1, src2, src3);
728  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729  filt0, filt1, out0, out1);
730  LD_SB4(src, src_stride, src0, src1, src2, src3);
731  XORI_B4_128_SB(src0, src1, src2, src3);
732  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733  filt0, filt1, out2, out3);
734  SRARI_H4_SH(out0, out1, out2, out3, 7);
735  SAT_SH4_SH(out0, out1, out2, out3, 7);
736  out = PCKEV_XORI128_UB(out0, out1);
737  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738  out = PCKEV_XORI128_UB(out2, out3);
739  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740 }
741 
742 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
743  uint8_t *dst, int32_t dst_stride,
744  const int8_t *filter)
745 {
746  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747  v16i8 filt0, filt1, mask0, mask1;
748  v16u8 out;
749  v8i16 filt, out0, out1, out2, out3;
750 
751  mask0 = LD_SB(&mc_filt_mask_arr[16]);
752  src -= 1;
753 
754  /* rearranging filter */
755  filt = LD_SH(filter);
756  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757 
758  mask1 = mask0 + 2;
759 
760  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761  src += (8 * src_stride);
762  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764  filt0, filt1, out0, out1);
765  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766  filt0, filt1, out2, out3);
767  SRARI_H4_SH(out0, out1, out2, out3, 7);
768  SAT_SH4_SH(out0, out1, out2, out3, 7);
769  out = PCKEV_XORI128_UB(out0, out1);
770  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771  dst += (4 * dst_stride);
772  out = PCKEV_XORI128_UB(out2, out3);
773  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774  dst += (4 * dst_stride);
775 
776  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777  src += (8 * src_stride);
778  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780  filt0, filt1, out0, out1);
781  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782  filt0, filt1, out2, out3);
783  SRARI_H4_SH(out0, out1, out2, out3, 7);
784  SAT_SH4_SH(out0, out1, out2, out3, 7);
785  out = PCKEV_XORI128_UB(out0, out1);
786  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787  dst += (4 * dst_stride);
788  out = PCKEV_XORI128_UB(out2, out3);
789  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790 }
791 
792 void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793  uint8_t *src, ptrdiff_t src_stride,
794  int height, int mx, int my)
795 {
796  const int8_t *filter = subpel_filters_msa[mx - 1];
797 
798  if (4 == height) {
799  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800  } else if (8 == height) {
801  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802  } else if (16 == height) {
803  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804  }
805 }
806 
807 void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808  uint8_t *src, ptrdiff_t src_stride,
809  int height, int mx, int my)
810 {
811  uint32_t loop_cnt;
812  const int8_t *filter = subpel_filters_msa[mx - 1];
813  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814  v16u8 tmp0, tmp1;
815  v8i16 filt, out0, out1, out2, out3;
816 
817  mask0 = LD_SB(&mc_filt_mask_arr[0]);
818  src -= 1;
819 
820  /* rearranging filter */
821  filt = LD_SH(filter);
822  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823 
824  mask1 = mask0 + 2;
825 
826  for (loop_cnt = (height >> 2); loop_cnt--;) {
827  LD_SB4(src, src_stride, src0, src1, src2, src3);
828  src += (4 * src_stride);
829 
830  XORI_B4_128_SB(src0, src1, src2, src3);
831  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832  filt1, out0, out1, out2, out3);
833  SRARI_H4_SH(out0, out1, out2, out3, 7);
834  SAT_SH4_SH(out0, out1, out2, out3, 7);
835  tmp0 = PCKEV_XORI128_UB(out0, out1);
836  tmp1 = PCKEV_XORI128_UB(out2, out3);
837  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838  dst += (4 * dst_stride);
839  }
840 }
841 
/**
 * VP8 "put" sub-pixel prediction for a 16-pixel-wide block using a 4-tap
 * horizontal filter.
 *
 * The taps come from subpel_filters_msa[mx - 1]; my is not read. Each loop
 * iteration consumes four source rows and writes four destination rows, and
 * the loop runs height >> 2 times.
 *
 * @param dst        destination; one v16u8 vector is written per row
 * @param dst_stride step between destination rows
 * @param src        source; reading starts one byte to the left of src
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         unused
 */
842 void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843  uint8_t *src, ptrdiff_t src_stride,
844  int height, int mx, int my)
845 {
846  uint32_t loop_cnt;
847  const int8_t *filter = subpel_filters_msa[mx - 1];
848  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849  v16i8 filt0, filt1, mask0, mask1;
850  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851  v16u8 out;
852 
    /* first row of mc_filt_mask_arr (the "8 width cases" shuffle pattern) */
853  mask0 = LD_SB(&mc_filt_mask_arr[0]);
854  src -= 1;
855 
    /* rearranging filter: viewed as halfwords, element 0 is taps 0-1 and
     * element 1 is taps 2-3 of the selected row.
     * NOTE(review): if LD_SH is a full vector-width load it reads more than
     * the 8-byte table row - confirm against the macro definition. */
856  /* rearranging filter */
857  filt = LD_SH(filter);
858  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859 
    /* same pattern with every byte index advanced by two */
860  mask1 = mask0 + 2;
861 
862  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* every row is loaded twice: at src (even-numbered regs) and at
     * src + 8 (odd-numbered regs) */
863  LD_SB4(src, src_stride, src0, src2, src4, src6);
864  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865  src += (4 * src_stride);
866 
    /* presumably XORs every byte with 128 (per macro name), moving the
     * unsigned pixels into signed range for the signed taps */
867  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869  filt1, out0, out1, out2, out3);
870  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871  filt1, out4, out5, out6, out7);
    /* shift amount 7 matches the tap sum of 128 in the 4-tap rows of
     * subpel_filters_msa */
872  SRARI_H4_SH(out0, out1, out2, out3, 7);
873  SRARI_H4_SH(out4, out5, out6, out7, 7);
874  SAT_SH4_SH(out0, out1, out2, out3, 7);
875  SAT_SH4_SH(out4, out5, out6, out7, 7);
    /* one (left, right) result pair per output row */
876  out = PCKEV_XORI128_UB(out0, out1);
877  ST_UB(out, dst);
878  dst += dst_stride;
879  out = PCKEV_XORI128_UB(out2, out3);
880  ST_UB(out, dst);
881  dst += dst_stride;
882  out = PCKEV_XORI128_UB(out4, out5);
883  ST_UB(out, dst);
884  dst += dst_stride;
885  out = PCKEV_XORI128_UB(out6, out7);
886  ST_UB(out, dst);
887  dst += dst_stride;
888  }
889 }
890 
/**
 * VP8 "put" sub-pixel prediction for a 4-pixel-wide block using a 4-tap
 * vertical filter.
 *
 * The taps come from subpel_filters_msa[my - 1]; mx is not read. Three rows
 * are loaded before the loop (starting one row above src), then every
 * iteration loads four more rows and writes four destination rows. The loop
 * runs height >> 2 times.
 *
 * @param dst        destination, written through ST_W4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source; reading starts one row above src
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         unused
 * @param my         vertical sub-pel position, used as table index my - 1
 */
891 void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
892  uint8_t *src, ptrdiff_t src_stride,
893  int height, int mx, int my)
894 {
895  uint32_t loop_cnt;
896  const int8_t *filter = subpel_filters_msa[my - 1];
897  v16i8 src0, src1, src2, src3, src4, src5;
898  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899  v16i8 src2110, src4332, filt0, filt1;
900  v8i16 filt, out10, out32;
901  v16u8 out;
902 
903  src -= src_stride;
904 
    /* NOTE(review): if LD_SH is a full vector-width load it reads more than
     * the 8-byte table row - confirm against the macro definition. */
905  filt = LD_SH(filter);
906  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
907 
908  LD_SB3(src, src_stride, src0, src1, src2);
909  src += (3 * src_stride);
910 
    /* interleave vertically adjacent rows: (row1,row0) and (row2,row1) */
911  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
912 
    /* pack both row pairs into one vector, then flip bit 7 of every byte */
913  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
915 
916  for (loop_cnt = (height >> 2); loop_cnt--;) {
917  LD_SB3(src, src_stride, src3, src4, src5);
918  src += (3 * src_stride);
919  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
920  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
922  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
923 
    /* the fourth new row is loaded into src2, so it is also the row
     * carried into the next iteration */
924  src2 = LD_SB(src);
925  src += (src_stride);
926  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
    /* src2110 is reused for the newest row pairs; it feeds this
     * iteration's second filter call and the next iteration's first */
927  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
929  out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
930  SRARI_H2_SH(out10, out32, 7);
931  SAT_SH2_SH(out10, out32, 7);
932  out = PCKEV_XORI128_UB(out10, out32);
933  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
934  dst += (4 * dst_stride);
935  }
936 }
937 
/**
 * VP8 "put" sub-pixel prediction for an 8-pixel-wide block using a 4-tap
 * vertical filter.
 *
 * The taps come from subpel_filters_msa[my - 1]; mx is not read. Three rows
 * are loaded before the loop (starting one row above src), then every
 * iteration loads four more rows and writes four destination rows. The loop
 * runs height >> 2 times.
 *
 * @param dst        destination, written through ST_D4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source; reading starts one row above src
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         unused
 * @param my         vertical sub-pel position, used as table index my - 1
 */
938 void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
939  uint8_t *src, ptrdiff_t src_stride,
940  int height, int mx, int my)
941 {
942  uint32_t loop_cnt;
943  const int8_t *filter = subpel_filters_msa[my - 1];
944  v16i8 src0, src1, src2, src7, src8, src9, src10;
945  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
946  v16u8 tmp0, tmp1;
947  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
948 
949  src -= src_stride;
950 
    /* NOTE(review): if LD_SH is a full vector-width load it reads more than
     * the 8-byte table row - confirm against the macro definition. */
951  filt = LD_SH(filter);
952  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
953 
954  LD_SB3(src, src_stride, src0, src1, src2);
955  src += (3 * src_stride);
956 
957  XORI_B3_128_SB(src0, src1, src2);
    /* interleave vertically adjacent rows: (row1,row0) and (row2,row1) */
958  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
959 
960  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* the four new rows are named src7..src10; src2 is the last row of
     * the previous group */
961  LD_SB4(src, src_stride, src7, src8, src9, src10);
962  src += (4 * src_stride);
963 
964  XORI_B4_128_SB(src7, src8, src9, src10);
965  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966  src72_r, src87_r, src98_r, src109_r);
    /* each output row combines two interleaved row pairs */
967  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
968  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
969  out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
970  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
971  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
972  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
973  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
974  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
975  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976  dst += (4 * dst_stride);
977 
    /* carry the two newest row pairs and the last row forward */
978  src10_r = src98_r;
979  src21_r = src109_r;
980  src2 = src10;
981  }
982 }
983 
/**
 * VP8 "put" sub-pixel prediction for a 16-pixel-wide block using a 4-tap
 * vertical filter.
 *
 * The taps come from subpel_filters_msa[my - 1]; mx is not read. The right
 * (_r) and left (_l) interleaves of each row pair are filtered separately
 * and packed back into one 16-byte row. Three rows are loaded before the
 * loop (starting one row above src); every iteration loads four more rows
 * and writes four destination rows. The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_UB4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source; reading starts one row above src
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         unused
 * @param my         vertical sub-pel position, used as table index my - 1
 */
984 void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
985  uint8_t *src, ptrdiff_t src_stride,
986  int height, int mx, int my)
987 {
988  uint32_t loop_cnt;
989  const int8_t *filter = subpel_filters_msa[my - 1];
990  v16i8 src0, src1, src2, src3, src4, src5, src6;
991  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993  v16u8 tmp0, tmp1, tmp2, tmp3;
994  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
995 
996  src -= src_stride;
997 
    /* NOTE(review): if LD_SH is a full vector-width load it reads more than
     * the 8-byte table row - confirm against the macro definition. */
998  filt = LD_SH(filter);
999  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1000 
1001  LD_SB3(src, src_stride, src0, src1, src2);
1002  src += (3 * src_stride);
1003 
1004  XORI_B3_128_SB(src0, src1, src2);
    /* interleave adjacent rows, once for each half of the vector */
1005  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1007 
1008  for (loop_cnt = (height >> 2); loop_cnt--;) {
1009  LD_SB4(src, src_stride, src3, src4, src5, src6);
1010  src += (4 * src_stride);
1011 
1012  XORI_B4_128_SB(src3, src4, src5, src6);
1013  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014  src32_r, src43_r, src54_r, src65_r);
1015  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016  src32_l, src43_l, src54_l, src65_l);
1017  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
1018  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
1019  out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
1020  out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
1021  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
1022  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
1023  out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
1024  out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
1025  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1026  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1027  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    /* merge the _l and _r results of each row into one vector */
1029  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030  out3_r, tmp0, tmp1, tmp2, tmp3);
1031  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1032  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033  dst += (4 * dst_stride);
1034 
    /* carry the two newest row pairs and the last row forward */
1035  src10_r = src54_r;
1036  src21_r = src65_r;
1037  src10_l = src54_l;
1038  src21_l = src65_l;
1039  src2 = src6;
1040  }
1041 }
1042 
/**
 * VP8 "put" sub-pixel prediction for a 4-pixel-wide block: 4-tap horizontal
 * filter followed by a 4-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts one column to the left of and
 * one row above src. Three rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_W4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1043 void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1044  uint8_t *src, ptrdiff_t src_stride,
1045  int height, int mx, int my)
1046 {
1047  uint32_t loop_cnt;
1048  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1049  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1050  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051  v16u8 mask0, mask1, out;
1052  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1054 
    /* second row of mc_filt_mask_arr (a "4 width cases" pattern); its
     * upper half holds indices >= 16, presumably selecting bytes of the
     * second source vector so that two rows are filtered per call */
1055  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1056  src -= (1 + 1 * src_stride);
1057 
1058  /* rearranging filter */
1059  filt = LD_SH(filter_horiz);
1060  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1061 
1062  mask1 = mask0 + 2;
1063 
1064  LD_SB3(src, src_stride, src0, src1, src2);
1065  src += (3 * src_stride);
1066 
1067  XORI_B3_128_SB(src0, src1, src2);
    /* horizontal pass over the row pairs (0,1) and (1,2) */
1068  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1069  hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1070  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1071 
1072  filt = LD_SH(filter_vert);
1073  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1074 
1075  for (loop_cnt = (height >> 2); loop_cnt--;) {
1076  LD_SB4(src, src_stride, src3, src4, src5, src6);
1077  src += (4 * src_stride);
1078 
    /* first two output rows: new rows 3 and 4 */
1079  XORI_B2_128_SB(src3, src4);
1080  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* 8-byte slide across the hz_out1/hz_out3 pair to build the
     * intermediate row pair that straddles the two results */
1081  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1083  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1084 
    /* last two output rows: new rows 5 and 6 */
1085  XORI_B2_128_SB(src5, src6);
1086  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1089  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1090 
1091  SRARI_H2_SH(tmp0, tmp1, 7);
1092  SAT_SH2_SH(tmp0, tmp1, 7);
1093  out = PCKEV_XORI128_UB(tmp0, tmp1);
1094  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1095  dst += (4 * dst_stride);
1096 
    /* state carried into the next group of four rows */
1097  hz_out1 = hz_out5;
1098  vec0 = vec2;
1099  }
1100 }
1101 
/**
 * VP8 "put" sub-pixel prediction for an 8-pixel-wide block: 4-tap horizontal
 * filter followed by a 4-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts one column to the left of and
 * one row above src. Three rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_D4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1102 void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1103  uint8_t *src, ptrdiff_t src_stride,
1104  int height, int mx, int my)
1105 {
1106  uint32_t loop_cnt;
1107  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1108  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1109  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110  v16u8 mask0, mask1, out0, out1;
1111  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113  v8i16 vec0, vec1, vec2, vec3, vec4;
1114 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1115  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1116  src -= (1 + 1 * src_stride);
1117 
1118  /* rearranging filter */
1119  filt = LD_SH(filter_horiz);
1120  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1121 
1122  mask1 = mask0 + 2;
1123 
1124  LD_SB3(src, src_stride, src0, src1, src2);
1125  src += (3 * src_stride);
1126 
1127  XORI_B3_128_SB(src0, src1, src2);
    /* one row per call here: the same vector is passed as both sources */
1128  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1129  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1130  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1131  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1132 
1133  filt = LD_SH(filter_vert);
1134  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1135 
1136  for (loop_cnt = (height >> 2); loop_cnt--;) {
1137  LD_SB4(src, src_stride, src3, src4, src5, src6);
1138  src += (4 * src_stride);
1139 
1140  XORI_B4_128_SB(src3, src4, src5, src6);
1141  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1143  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1144 
    /* hz_out0..hz_out2 are recycled for the newly loaded rows */
1145  hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1147  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1148 
1149  hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1151  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
1152 
1153  hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
    /* vec0 and vec1 are overwritten here; vec1 then holds the newest
     * row pair, which is carried forward below */
1154  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1155  tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1156 
1157  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1158  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1159  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1160  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1161  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162  dst += (4 * dst_stride);
1163 
    /* row pairs carried into the next group of four rows */
1164  vec0 = vec4;
1165  vec2 = vec1;
1166  }
1167 }
1168 
/**
 * VP8 "put" sub-pixel prediction for a 16-pixel-wide block: 4-tap horizontal
 * and 4-tap vertical filter.
 *
 * Implemented as two calls to the 8-pixel-wide routine, one for the columns
 * at offset 0 and one for the columns at offset 8. All other arguments are
 * forwarded unchanged.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col_off;

    /* left half first, then right half */
    for (col_off = 0; col_off < 16; col_off += 8) {
        ff_put_vp8_epel8_h4v4_msa(dst + col_off, dst_stride,
                                  src + col_off, src_stride,
                                  height, mx, my);
    }
}
1183 
/**
 * VP8 "put" sub-pixel prediction for a 4-pixel-wide block: 6-tap horizontal
 * filter followed by a 4-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts two columns to the left of and
 * one row above src. Three rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through two ST_W2 calls per group
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1184 void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1185  uint8_t *src, ptrdiff_t src_stride,
1186  int height, int mx, int my)
1187 {
1188  uint32_t loop_cnt;
1189  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1190  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1191  v16i8 src0, src1, src2, src3, src4, src5, src6;
1192  v16i8 filt_hz0, filt_hz1, filt_hz2;
1193  v16u8 res0, res1, mask0, mask1, mask2;
1194  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1196 
    /* second row of mc_filt_mask_arr (a "4 width cases" pattern) */
1197  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1198  src -= (2 + 1 * src_stride);
1199 
    /* rearranging filter: three halfwords, i.e. three tap pairs */
1200  /* rearranging filter */
1201  filt = LD_SH(filter_horiz);
1202  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1203 
    /* one shuffle pattern per tap pair, each advanced by two bytes */
1204  mask1 = mask0 + 2;
1205  mask2 = mask0 + 4;
1206 
1207  LD_SB3(src, src_stride, src0, src1, src2);
1208  src += (3 * src_stride);
1209 
1210  XORI_B3_128_SB(src0, src1, src2);
    /* horizontal pass over the row pairs (0,1) and (1,2) */
1211  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
1212  filt_hz1, filt_hz2);
1213  hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
1214  filt_hz1, filt_hz2);
1215  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1216 
1217  filt = LD_SH(filter_vert);
1218  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1219 
1220  for (loop_cnt = (height >> 2); loop_cnt--;) {
1221  LD_SB4(src, src_stride, src3, src4, src5, src6);
1222  src += (4 * src_stride);
1223 
1224  XORI_B4_128_SB(src3, src4, src5, src6);
1225  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
1226  filt_hz1, filt_hz2);
    /* 8-byte slide across the hz_out1/hz_out3 pair to build the
     * intermediate row pair that straddles the two results */
1227  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1229  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1230 
1231  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
1232  filt_hz1, filt_hz2);
1233  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1235  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1236 
1237  SRARI_H2_SH(tmp0, tmp1, 7);
1238  SAT_SH2_SH(tmp0, tmp1, 7);
    /* each result vector is packed with itself; rows 0-1 come from res0
     * and rows 2-3 from res1 */
1239  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1240  XORI_B2_128_UB(res0, res1);
1241  ST_W2(res0, 0, 1, dst, dst_stride);
1242  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243  dst += (4 * dst_stride);
1244 
    /* state carried into the next group of four rows */
1245  hz_out1 = hz_out5;
1246  vec0 = vec2;
1247  }
1248 }
1249 
/**
 * VP8 "put" sub-pixel prediction for an 8-pixel-wide block: 6-tap horizontal
 * filter followed by a 4-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts two columns to the left of and
 * one row above src. Three rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_D4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1250 void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1251  uint8_t *src, ptrdiff_t src_stride,
1252  int height, int mx, int my)
1253 {
1254  uint32_t loop_cnt;
1255  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1256  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1257  v16i8 src0, src1, src2, src3, src4, src5, src6;
1258  v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259  v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260  v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1261  v16u8 out0, out1;
1262 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1263  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1264  src -= (2 + src_stride);
1265 
1266  /* rearranging filter */
1267  filt = LD_SH(filter_horiz);
1268  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1269 
1270  mask1 = mask0 + 2;
1271  mask2 = mask0 + 4;
1272 
1273  LD_SB3(src, src_stride, src0, src1, src2);
1274  src += (3 * src_stride);
1275 
1276  XORI_B3_128_SB(src0, src1, src2);
    /* one row per call: the same vector is passed as both sources */
1277  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
1278  filt_hz1, filt_hz2);
1279  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
1280  filt_hz1, filt_hz2);
1281  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
1282  filt_hz1, filt_hz2);
1283  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1284 
1285  filt = LD_SH(filter_vert);
1286  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1287 
1288  for (loop_cnt = (height >> 2); loop_cnt--;) {
1289  LD_SB4(src, src_stride, src3, src4, src5, src6);
1290  src += (4 * src_stride);
1291 
1292  XORI_B4_128_SB(src3, src4, src5, src6);
1293 
1294  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
1295  filt_hz1, filt_hz2);
1296  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1297  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1298 
1299  hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
1300  filt_hz1, filt_hz2);
1301  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1302  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1303 
    /* vec0 is overwritten in place with the (row4,row5) pair; it is
     * used for tmp2 and is also the value carried into the next
     * iteration, so no explicit copy is needed at the end of the loop */
1304  hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
1305  filt_hz1, filt_hz2);
1306  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1307  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
1308 
    /* likewise vec2 ends up holding the newest row pair for the next
     * iteration */
1309  hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
1310  filt_hz1, filt_hz2);
1311  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1312  tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1313 
1314  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1315  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1316  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1317  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1318  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319  dst += (4 * dst_stride);
1320  }
1321 }
1322 
/**
 * VP8 "put" sub-pixel prediction for a 16-pixel-wide block: 6-tap horizontal
 * and 4-tap vertical filter.
 *
 * Implemented as two calls to the 8-pixel-wide routine, one for the columns
 * at offset 0 and one for the columns at offset 8. All other arguments are
 * forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col_off;

    /* left half first, then right half */
    for (col_off = 0; col_off < 16; col_off += 8) {
        ff_put_vp8_epel8_h6v4_msa(dst + col_off, dst_stride,
                                  src + col_off, src_stride,
                                  height, mx, my);
    }
}
1337 
/**
 * VP8 "put" sub-pixel prediction for a 4-pixel-wide block: 4-tap horizontal
 * filter followed by a 6-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts one column to the left of and
 * two rows above src. Five rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_W4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1338 void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1339  uint8_t *src, ptrdiff_t src_stride,
1340  int height, int mx, int my)
1341 {
1342  uint32_t loop_cnt;
1343  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1344  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1347  v16u8 out;
1348  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349  v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
1351 
    /* second row of mc_filt_mask_arr (a "4 width cases" pattern) */
1352  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1353 
1354  src -= (1 + 2 * src_stride);
1355 
1356  /* rearranging filter */
1357  filt = LD_SH(filter_horiz);
1358  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1359 
1360  mask1 = mask0 + 2;
1361 
1362  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1363  src += (5 * src_stride);
1364 
1365  XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* horizontal pass over row pairs (0,1), (2,3) and (3,4); the (1,2)
     * pair is assembled from the neighbouring results by the slide below */
1366  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1367  hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1368  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1371 
    /* three halfwords, i.e. three tap pairs, for the vertical filter */
1372  filt = LD_SH(filter_vert);
1373  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1374 
1375  for (loop_cnt = (height >> 2); loop_cnt--;) {
1376  LD_SB4(src, src_stride, src5, src6, src7, src8);
1377  XORI_B4_128_SB(src5, src6, src7, src8);
1378  src += (4 * src_stride);
1379 
1380  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1384 
1385  hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1389 
1390  SRARI_H2_SH(tmp0, tmp1, 7);
1391  SAT_SH2_SH(tmp0, tmp1, 7);
1392  out = PCKEV_XORI128_UB(tmp0, tmp1);
1393  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1394  dst += (4 * dst_stride);
1395 
    /* state carried into the next group of four rows */
1396  hz_out3 = hz_out7;
1397  out0 = out2;
1398  out1 = out3;
1399  }
1400 }
1401 
/**
 * VP8 "put" sub-pixel prediction for an 8-pixel-wide block: 4-tap horizontal
 * filter followed by a 6-tap vertical filter.
 *
 * Horizontal taps come from subpel_filters_msa[mx - 1], vertical taps from
 * subpel_filters_msa[my - 1]. Reading starts one column to the left of and
 * two rows above src. Five rows are filtered horizontally before the loop;
 * every iteration loads four more rows and writes four destination rows.
 * The loop runs height >> 2 times.
 *
 * @param dst        destination, written through ST_D4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param src        source block position
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         vertical sub-pel position, used as table index my - 1
 */
1402 void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1403  uint8_t *src, ptrdiff_t src_stride,
1404  int height, int mx, int my)
1405 {
1406  uint32_t loop_cnt;
1407  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1408  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1409  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1410  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1414  v16u8 vec0, vec1;
1415 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1416  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1417  src -= (1 + 2 * src_stride);
1418 
1419  /* rearranging filter */
1420  filt = LD_SH(filter_horiz);
1421  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1422 
1423  mask1 = mask0 + 2;
1424 
1425  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426  src += (5 * src_stride);
1427 
1428  XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* one row per call: the same vector is passed as both sources */
1429  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433  hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* out0/out1 hold the row pairs starting on even rows, out3/out4 those
     * starting on odd rows */
1434  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1436 
1437  filt = LD_SH(filter_vert);
1438  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1439 
1440  for (loop_cnt = (height >> 2); loop_cnt--;) {
1441  LD_SB4(src, src_stride, src5, src6, src7, src8);
1442  src += (4 * src_stride);
1443 
1444  XORI_B4_128_SB(src5, src6, src7, src8);
1445 
1446  hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1449 
1450  hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1453 
1454  hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456  tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1457 
1458  hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460  tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1461 
1462  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1463  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1464  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
1465  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
1466  ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467  dst += (4 * dst_stride);
1468 
    /* shift the filter window down four rows for the next iteration */
1469  hz_out4 = hz_out8;
1470  out0 = out2;
1471  out1 = out6;
1472  out3 = out5;
1473  out4 = out7;
1474  }
1475 }
1476 
/**
 * VP8 "put" sub-pixel prediction for a 16-pixel-wide block: 4-tap horizontal
 * and 6-tap vertical filter.
 *
 * Implemented as two calls to the 8-pixel-wide routine, one for the columns
 * at offset 0 and one for the columns at offset 8. All other arguments are
 * forwarded unchanged.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int32_t col_off;

    /* left half first, then right half */
    for (col_off = 0; col_off < 16; col_off += 8) {
        ff_put_vp8_epel8_h4v6_msa(dst + col_off, dst_stride,
                                  src + col_off, src_stride,
                                  height, mx, my);
    }
}
1491 
/**
 * 2-tap horizontal filter for a 4x4 block.
 *
 * Loads four source rows, filters them with the tap pair at filter[0..1]
 * and writes four destination rows. No loop; the block size is fixed.
 *
 * @param src        source rows
 * @param src_stride step between source rows
 * @param dst        destination, written through two ST_W2 calls
 * @param dst_stride step between destination rows
 * @param filter     pointer to the two taps (a bilinear_filters_msa row in
 *                   the callers visible in this file)
 */
1492 static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1493  uint8_t *dst, int32_t dst_stride,
1494  const int8_t *filter)
1495 {
1496  v16i8 src0, src1, src2, src3, mask;
1497  v16u8 filt0, vec0, vec1, res0, res1;
1498  v8u16 vec2, vec3, filt;
1499 
    /* second row of mc_filt_mask_arr (a "4 width cases" pattern) */
1500  mask = LD_SB(&mc_filt_mask_arr[16]);
1501 
    /* rearranging filter: halfword 0 is the tap pair, replicated across
     * the vector.
     * NOTE(review): if LD_UH is a full vector-width load it reads well past
     * the 2-byte bilinear_filters_msa row - confirm against the macro. */
1502  /* rearranging filter */
1503  filt = LD_UH(filter);
1504  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505 
1506  LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* two rows are shuffled into each vector */
1507  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    /* shift amount 7 matches the tap sum of 128 in bilinear_filters_msa */
1509  SRARI_H2_UH(vec2, vec3, 7);
1510  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511  ST_W2(res0, 0, 1, dst, dst_stride);
1512  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513 }
1514 
/**
 * 2-tap horizontal filter for a 4x8 block.
 *
 * Loads eight source rows, filters them with the tap pair at filter[0..1]
 * and writes eight destination rows. No loop; the block size is fixed.
 *
 * @param src        source rows
 * @param src_stride step between source rows
 * @param dst        destination, written through four ST_W2 calls
 * @param dst_stride step between destination rows
 * @param filter     pointer to the two taps (a bilinear_filters_msa row in
 *                   the callers visible in this file)
 */
1515 static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516  uint8_t *dst, int32_t dst_stride,
1517  const int8_t *filter)
1518 {
1519  v16u8 vec0, vec1, vec2, vec3, filt0;
1520  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521  v16i8 res0, res1, res2, res3;
1522  v8u16 vec4, vec5, vec6, vec7, filt;
1523 
    /* second row of mc_filt_mask_arr (a "4 width cases" pattern) */
1524  mask = LD_SB(&mc_filt_mask_arr[16]);
1525 
    /* NOTE(review): if LD_UH is a full vector-width load it reads well past
     * the 2-byte bilinear_filters_msa row - confirm against the macro. */
1526  /* rearranging filter */
1527  filt = LD_UH(filter);
1528  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529 
1530  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* two rows are shuffled into each vector */
1531  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534  vec4, vec5, vec6, vec7);
    /* shift amount 7 matches the tap sum of 128 in bilinear_filters_msa */
1535  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537  res0, res1, res2, res3);
    /* two destination rows per result vector */
1538  ST_W2(res0, 0, 1, dst, dst_stride);
1539  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542 }
1543 
1544 void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545  uint8_t *src, ptrdiff_t src_stride,
1546  int height, int mx, int my)
1547 {
1548  const int8_t *filter = bilinear_filters_msa[mx - 1];
1549 
1550  if (4 == height) {
1551  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552  } else if (8 == height) {
1553  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554  }
1555 }
1556 
/**
 * 2-tap horizontal filter for an 8x4 block.
 *
 * Loads four source rows, filters them with the tap pair at filter[0..1]
 * and writes four destination rows. No loop; the block size is fixed.
 *
 * @param src        source rows
 * @param src_stride step between source rows
 * @param dst        destination, written through ST_D4
 * @param dst_stride step between destination rows
 * @param filter     pointer to the two taps (a bilinear_filters_msa row in
 *                   the callers visible in this file)
 */
1557 static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1558  uint8_t *dst, int32_t dst_stride,
1559  const int8_t *filter)
1560 {
1561  v16u8 filt0;
1562  v16i8 src0, src1, src2, src3, mask;
1563  v8u16 vec0, vec1, vec2, vec3, filt;
1564 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1565  mask = LD_SB(&mc_filt_mask_arr[0]);
1566 
    /* NOTE(review): if LD_UH is a full vector-width load it reads well past
     * the 2-byte bilinear_filters_msa row - confirm against the macro. */
1567  /* rearranging filter */
1568  filt = LD_UH(filter);
1569  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570 
1571  LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* one row per vector: each row is shuffled against itself */
1572  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    /* results overwrite the shuffled inputs in vec0..vec3 */
1574  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575  vec0, vec1, vec2, vec3);
1576  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
    /* src0/src1 are reused as output registers for the packed rows */
1577  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578  ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579 }
1580 
/**
 * 2-tap horizontal filter for an 8-pixel-wide block of 8 or 16 rows.
 *
 * Eight rows are always processed (as two groups of four). A further eight
 * rows are processed only when height is exactly 16; for any other height
 * just the first eight rows are written.
 *
 * @param src        source rows
 * @param src_stride step between source rows
 * @param dst        destination, written through ST_D4 (four rows per call)
 * @param dst_stride step between destination rows
 * @param filter     pointer to the two taps (a bilinear_filters_msa row in
 *                   the caller visible in this file)
 * @param height     block height; only the value 16 changes behaviour
 */
1581 static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1582  uint8_t *dst, int32_t dst_stride,
1583  const int8_t *filter, int32_t height)
1584 {
1585  v16u8 filt0;
1586  v16i8 src0, src1, src2, src3, mask, out0, out1;
1587  v8u16 vec0, vec1, vec2, vec3, filt;
1588 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1589  mask = LD_SB(&mc_filt_mask_arr[0]);
1590 
    /* NOTE(review): if LD_UH is a full vector-width load it reads well past
     * the 2-byte bilinear_filters_msa row - confirm against the macro. */
1591  /* rearranging filter */
1592  filt = LD_UH(filter);
1593  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594 
    /* rows 0-3 */
1595  LD_SB4(src, src_stride, src0, src1, src2, src3);
1596  src += (4 * src_stride);
1597 
1598  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601  vec0, vec1, vec2, vec3);
1602  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603 
    /* rows 4-7 are loaded before the results of rows 0-3 are packed and
     * stored */
1604  LD_SB4(src, src_stride, src0, src1, src2, src3);
1605  src += (4 * src_stride);
1606 
1607  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609 
1610  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613  vec0, vec1, vec2, vec3);
1614  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617  dst += (8 * dst_stride);
1618 
    /* second batch of eight rows, same sequence as above */
1619  if (16 == height) {
1620  LD_SB4(src, src_stride, src0, src1, src2, src3);
1621  src += (4 * src_stride);
1622 
1623  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626  vec0, vec1, vec2, vec3);
1627  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628  LD_SB4(src, src_stride, src0, src1, src2, src3);
1629  src += (4 * src_stride);
1630 
1631  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633 
1634  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637  vec0, vec1, vec2, vec3);
1638  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641  }
1642 }
1643 
1644 void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645  uint8_t *src, ptrdiff_t src_stride,
1646  int height, int mx, int my)
1647 {
1648  const int8_t *filter = bilinear_filters_msa[mx - 1];
1649 
1650  if (4 == height) {
1651  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652  } else {
1653  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654  height);
1655  }
1656 }
1657 
/**
 * VP8 "put" bilinear prediction, horizontal only, for a 16-pixel-wide block.
 *
 * The tap pair comes from bilinear_filters_msa[mx - 1]; my is not read.
 * The first four rows are processed ahead of the loop, and the loop then
 * runs (height >> 2) - 1 more times with the same four-row body.
 *
 * NOTE(review): loop_cnt is uint32_t, so a height below 4 would make
 * (height >> 2) - 1 wrap to a huge count; callers presumably always pass
 * height >= 4 - confirm.
 *
 * @param dst        destination, one PCKEV_ST_SB store per row
 * @param dst_stride step between destination rows
 * @param src        source rows
 * @param src_stride step between source rows
 * @param height     number of rows (processed in groups of four)
 * @param mx         horizontal sub-pel position, used as table index mx - 1
 * @param my         unused
 */
1658 void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659  uint8_t *src, ptrdiff_t src_stride,
1660  int height, int mx, int my)
1661 {
1662  uint32_t loop_cnt;
1663  const int8_t *filter = bilinear_filters_msa[mx - 1];
1664  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667 
    /* first row of mc_filt_mask_arr (the "8 width cases" pattern) */
1668  mask = LD_SB(&mc_filt_mask_arr[0]);
1669 
    /* one group of four rows is handled before the loop */
1670  loop_cnt = (height >> 2) - 1;
1671 
    /* NOTE(review): if LD_UH is a full vector-width load it reads well past
     * the 2-byte bilinear_filters_msa row - confirm against the macro. */
1672  /* rearranging filter */
1673  filt = LD_UH(filter);
1674  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675 
    /* every row is loaded twice: at src (even-numbered regs) and at
     * src + 8 (odd-numbered regs) */
1676  LD_SB4(src, src_stride, src0, src2, src4, src6);
1677  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678  src += (4 * src_stride);
1679 
1680  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685  out0, out1, out2, out3);
1686  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687  out4, out5, out6, out7);
    /* shift amount 7 matches the tap sum of 128 in bilinear_filters_msa */
1688  SRARI_H4_UH(out0, out1, out2, out3, 7);
1689  SRARI_H4_UH(out4, out5, out6, out7, 7);
    /* one (left, right) result pair per output row */
1690  PCKEV_ST_SB(out0, out1, dst);
1691  dst += dst_stride;
1692  PCKEV_ST_SB(out2, out3, dst);
1693  dst += dst_stride;
1694  PCKEV_ST_SB(out4, out5, dst);
1695  dst += dst_stride;
1696  PCKEV_ST_SB(out6, out7, dst);
1697  dst += dst_stride;
1698 
    /* remaining groups of four rows; the body repeats the sequence above */
1699  for (; loop_cnt--;) {
1700  LD_SB4(src, src_stride, src0, src2, src4, src6);
1701  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702  src += (4 * src_stride);
1703 
1704  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709  out0, out1, out2, out3);
1710  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711  out4, out5, out6, out7);
1712  SRARI_H4_UH(out0, out1, out2, out3, 7);
1713  SRARI_H4_UH(out4, out5, out6, out7, 7);
1714  PCKEV_ST_SB(out0, out1, dst);
1715  dst += dst_stride;
1716  PCKEV_ST_SB(out2, out3, dst);
1717  dst += dst_stride;
1718  PCKEV_ST_SB(out4, out5, dst);
1719  dst += dst_stride;
1720  PCKEV_ST_SB(out6, out7, dst);
1721  dst += dst_stride;
1722  }
1723 }
1724 
1725 static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726  uint8_t *dst, int32_t dst_stride,
1727  const int8_t *filter)
1728 {
1729  v16i8 src0, src1, src2, src3, src4;
1730  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731  v16u8 filt0;
1732  v8i16 filt;
1733  v8u16 tmp0, tmp1;
1734 
1735  filt = LD_SH(filter);
1736  filt0 = (v16u8) __msa_splati_h(filt, 0);
1737 
1738  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739  src += (5 * src_stride);
1740 
1741  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742  src10_r, src21_r, src32_r, src43_r);
1743  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745  SRARI_H2_UH(tmp0, tmp1, 7);
1746  SAT_UH2_UH(tmp0, tmp1, 7);
1747  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748  ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749 }
1750 
1751 static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752  uint8_t *dst, int32_t dst_stride,
1753  const int8_t *filter)
1754 {
1755  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758  v8u16 tmp0, tmp1, tmp2, tmp3;
1759  v16u8 filt0;
1760  v8i16 filt;
1761 
1762  filt = LD_SH(filter);
1763  filt0 = (v16u8) __msa_splati_h(filt, 0);
1764 
1765  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766  src += (8 * src_stride);
1767 
1768  src8 = LD_SB(src);
1769  src += src_stride;
1770 
1771  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772  src32_r, src43_r);
1773  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774  src76_r, src87_r);
1775  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776  src87_r, src76_r, src2110, src4332, src6554, src8776);
1777  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778  tmp0, tmp1, tmp2, tmp3);
1779  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782  ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783 }
1784 
1785 void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786  uint8_t *src, ptrdiff_t src_stride,
1787  int height, int mx, int my)
1788 {
1789  const int8_t *filter = bilinear_filters_msa[my - 1];
1790 
1791  if (4 == height) {
1792  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793  } else if (8 == height) {
1794  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795  }
1796 }
1797 
1798 static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1799  uint8_t *dst, int32_t dst_stride,
1800  const int8_t *filter)
1801 {
1802  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803  v16i8 out0, out1;
1804  v8u16 tmp0, tmp1, tmp2, tmp3;
1805  v8i16 filt;
1806 
1807  /* rearranging filter_y */
1808  filt = LD_SH(filter);
1809  filt0 = (v16u8) __msa_splati_h(filt, 0);
1810 
1811  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815  tmp0, tmp1, tmp2, tmp3);
1816  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820 }
1821 
1822 static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1823  uint8_t *dst, int32_t dst_stride,
1824  const int8_t *filter, int32_t height)
1825 {
1826  uint32_t loop_cnt;
1827  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829  v16i8 out0, out1;
1830  v8u16 tmp0, tmp1, tmp2, tmp3;
1831  v8i16 filt;
1832 
1833  /* rearranging filter_y */
1834  filt = LD_SH(filter);
1835  filt0 = (v16u8) __msa_splati_h(filt, 0);
1836 
1837  src0 = LD_UB(src);
1838  src += src_stride;
1839 
1840  for (loop_cnt = (height >> 3); loop_cnt--;) {
1841  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842  src += (8 * src_stride);
1843 
1844  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845  vec0, vec1, vec2, vec3);
1846  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847  vec4, vec5, vec6, vec7);
1848  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849  tmp0, tmp1, tmp2, tmp3);
1850  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854 
1855  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856  tmp0, tmp1, tmp2, tmp3);
1857  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861  dst += (8 * dst_stride);
1862 
1863  src0 = src8;
1864  }
1865 }
1866 
1867 void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868  uint8_t *src, ptrdiff_t src_stride,
1869  int height, int mx, int my)
1870 {
1871  const int8_t *filter = bilinear_filters_msa[my - 1];
1872 
1873  if (4 == height) {
1874  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875  } else {
1876  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877  height);
1878  }
1879 }
1880 
1881 void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882  uint8_t *src, ptrdiff_t src_stride,
1883  int height, int mx, int my)
1884 {
1885  uint32_t loop_cnt;
1886  const int8_t *filter = bilinear_filters_msa[my - 1];
1887  v16u8 src0, src1, src2, src3, src4;
1888  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889  v8u16 tmp0, tmp1, tmp2, tmp3;
1890  v8i16 filt;
1891 
1892  /* rearranging filter_y */
1893  filt = LD_SH(filter);
1894  filt0 = (v16u8) __msa_splati_h(filt, 0);
1895 
1896  src0 = LD_UB(src);
1897  src += src_stride;
1898 
1899  for (loop_cnt = (height >> 2); loop_cnt--;) {
1900  LD_UB4(src, src_stride, src1, src2, src3, src4);
1901  src += (4 * src_stride);
1902 
1903  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906  SRARI_H2_UH(tmp0, tmp1, 7);
1907  SAT_UH2_UH(tmp0, tmp1, 7);
1908  PCKEV_ST_SB(tmp0, tmp1, dst);
1909  dst += dst_stride;
1910 
1911  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914  SRARI_H2_UH(tmp2, tmp3, 7);
1915  SAT_UH2_UH(tmp2, tmp3, 7);
1916  PCKEV_ST_SB(tmp2, tmp3, dst);
1917  dst += dst_stride;
1918 
1919  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920  SRARI_H2_UH(tmp0, tmp1, 7);
1921  SAT_UH2_UH(tmp0, tmp1, 7);
1922  PCKEV_ST_SB(tmp0, tmp1, dst);
1923  dst += dst_stride;
1924 
1925  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926  SRARI_H2_UH(tmp2, tmp3, 7);
1927  SAT_UH2_UH(tmp2, tmp3, 7);
1928  PCKEV_ST_SB(tmp2, tmp3, dst);
1929  dst += dst_stride;
1930 
1931  src0 = src4;
1932  }
1933 }
1934 
1936  uint8_t *dst, int32_t dst_stride,
1937  const int8_t *filter_horiz,
1938  const int8_t *filter_vert)
1939 {
1940  v16i8 src0, src1, src2, src3, src4, mask;
1941  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943 
1944  mask = LD_SB(&mc_filt_mask_arr[16]);
1945 
1946  /* rearranging filter */
1947  filt = LD_UH(filter_horiz);
1948  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949 
1950  filt = LD_UH(filter_vert);
1951  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952 
1953  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959 
1960  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962  SRARI_H2_UH(tmp0, tmp1, 7);
1963  SAT_UH2_UH(tmp0, tmp1, 7);
1964  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965  ST_W2(res0, 0, 1, dst, dst_stride);
1966  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967 }
1968 
1970  uint8_t *dst, int32_t dst_stride,
1971  const int8_t *filter_horiz,
1972  const int8_t *filter_vert)
1973 {
1974  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975  v16i8 res0, res1, res2, res3;
1976  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979 
1980  mask = LD_SB(&mc_filt_mask_arr[16]);
1981 
1982  /* rearranging filter */
1983  filt = LD_UH(filter_horiz);
1984  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985 
1986  filt = LD_UH(filter_vert);
1987  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988 
1989  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990  src += (8 * src_stride);
1991  src8 = LD_SB(src);
1992 
1993  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
1999  hz_out3, hz_out5, 8);
2000  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001 
2002  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005  vec4, vec5, vec6, vec7);
2006  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009  res0, res1, res2, res3);
2010  ST_W2(res0, 0, 1, dst, dst_stride);
2011  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014 }
2015 
2016 void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017  uint8_t *src, ptrdiff_t src_stride,
2018  int height, int mx, int my)
2019 {
2020  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022 
2023  if (4 == height) {
2024  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025  filter_horiz, filter_vert);
2026  } else if (8 == height) {
2027  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028  filter_horiz, filter_vert);
2029  }
2030 }
2031 
2033  uint8_t *dst, int32_t dst_stride,
2034  const int8_t *filter_horiz,
2035  const int8_t *filter_vert)
2036 {
2037  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040  v8i16 filt;
2041 
2042  mask = LD_SB(&mc_filt_mask_arr[0]);
2043 
2044  /* rearranging filter */
2045  filt = LD_SH(filter_horiz);
2046  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047 
2048  filt = LD_SH(filter_vert);
2049  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050 
2051  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052 
2053  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057 
2058  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061 
2062  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065 
2066  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069 
2070  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074 }
2075 
2077  uint8_t *dst, int32_t dst_stride,
2078  const int8_t *filter_horiz,
2079  const int8_t *filter_vert,
2080  int32_t height)
2081 {
2082  uint32_t loop_cnt;
2083  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084  v16u8 filt_hz, filt_vt, vec0;
2085  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086  v8i16 filt;
2087 
2088  mask = LD_SB(&mc_filt_mask_arr[0]);
2089 
2090  /* rearranging filter */
2091  filt = LD_SH(filter_horiz);
2092  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093 
2094  filt = LD_SH(filter_vert);
2095  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096 
2097  src0 = LD_SB(src);
2098  src += src_stride;
2099 
2100  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101 
2102  for (loop_cnt = (height >> 3); loop_cnt--;) {
2103  LD_SB4(src, src_stride, src1, src2, src3, src4);
2104  src += (4 * src_stride);
2105 
2106  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109 
2110  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113 
2114  SRARI_H2_UH(tmp1, tmp2, 7);
2115  SAT_UH2_UH(tmp1, tmp2, 7);
2116 
2117  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120 
2121  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122  LD_SB4(src, src_stride, src1, src2, src3, src4);
2123  src += (4 * src_stride);
2124  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126 
2127  SRARI_H2_UH(tmp3, tmp4, 7);
2128  SAT_UH2_UH(tmp3, tmp4, 7);
2129  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131 
2132  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135 
2136  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139 
2140  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143 
2144  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147 
2148  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152  dst += (8 * dst_stride);
2153  }
2154 }
2155 
2156 void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157  uint8_t *src, ptrdiff_t src_stride,
2158  int height, int mx, int my)
2159 {
2160  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162 
2163  if (4 == height) {
2164  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165  filter_horiz, filter_vert);
2166  } else {
2167  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168  filter_horiz, filter_vert, height);
2169  }
2170 }
2171 
2172 void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173  uint8_t *src, ptrdiff_t src_stride,
2174  int height, int mx, int my)
2175 {
2176  uint32_t loop_cnt;
2177  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180  v16u8 filt_hz, filt_vt, vec0, vec1;
2181  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182  v8i16 filt;
2183 
2184  mask = LD_SB(&mc_filt_mask_arr[0]);
2185 
2186  /* rearranging filter */
2187  filt = LD_SH(filter_horiz);
2188  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189 
2190  filt = LD_SH(filter_vert);
2191  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192 
2193  LD_SB2(src, 8, src0, src1);
2194  src += src_stride;
2195 
2196  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198 
2199 
2200  for (loop_cnt = (height >> 2); loop_cnt--;) {
2201  LD_SB4(src, src_stride, src0, src2, src4, src6);
2202  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203  src += (4 * src_stride);
2204 
2205  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209  SRARI_H2_UH(tmp1, tmp2, 7);
2210  SAT_UH2_UH(tmp1, tmp2, 7);
2211  PCKEV_ST_SB(tmp1, tmp2, dst);
2212  dst += dst_stride;
2213 
2214  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218  SRARI_H2_UH(tmp1, tmp2, 7);
2219  SAT_UH2_UH(tmp1, tmp2, 7);
2220  PCKEV_ST_SB(tmp1, tmp2, dst);
2221  dst += dst_stride;
2222 
2223  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227  SRARI_H2_UH(tmp1, tmp2, 7);
2228  SAT_UH2_UH(tmp1, tmp2, 7);
2229  PCKEV_ST_SB(tmp1, tmp2, dst);
2230  dst += dst_stride;
2231 
2232  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236  SRARI_H2_UH(tmp1, tmp2, 7);
2237  SAT_UH2_UH(tmp1, tmp2, 7);
2238  PCKEV_ST_SB(tmp1, tmp2, dst);
2239  dst += dst_stride;
2240  }
2241 }
2242 
2243 void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244  uint8_t *src, ptrdiff_t src_stride,
2245  int height, int mx, int my)
2246 {
2247  int32_t cnt;
2248  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250 
2251  if (0 == height % 8) {
2252  for (cnt = height >> 3; cnt--;) {
2253  LD_UB8(src, src_stride,
2254  src0, src1, src2, src3, src4, src5, src6, src7);
2255  src += (8 * src_stride);
2256 
2257  out0 = __msa_copy_u_d((v2i64) src0, 0);
2258  out1 = __msa_copy_u_d((v2i64) src1, 0);
2259  out2 = __msa_copy_u_d((v2i64) src2, 0);
2260  out3 = __msa_copy_u_d((v2i64) src3, 0);
2261  out4 = __msa_copy_u_d((v2i64) src4, 0);
2262  out5 = __msa_copy_u_d((v2i64) src5, 0);
2263  out6 = __msa_copy_u_d((v2i64) src6, 0);
2264  out7 = __msa_copy_u_d((v2i64) src7, 0);
2265 
2266  SD4(out0, out1, out2, out3, dst, dst_stride);
2267  dst += (4 * dst_stride);
2268  SD4(out4, out5, out6, out7, dst, dst_stride);
2269  dst += (4 * dst_stride);
2270  }
2271  } else if (0 == height % 4) {
2272  for (cnt = (height / 4); cnt--;) {
2273  LD_UB4(src, src_stride, src0, src1, src2, src3);
2274  src += (4 * src_stride);
2275  out0 = __msa_copy_u_d((v2i64) src0, 0);
2276  out1 = __msa_copy_u_d((v2i64) src1, 0);
2277  out2 = __msa_copy_u_d((v2i64) src2, 0);
2278  out3 = __msa_copy_u_d((v2i64) src3, 0);
2279 
2280  SD4(out0, out1, out2, out3, dst, dst_stride);
2281  dst += (4 * dst_stride);
2282  }
2283  }
2284 }
2285 
2286 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2287  uint8_t *dst, int32_t dst_stride,
2289 {
2290  int32_t cnt, loop_cnt;
2291  uint8_t *src_tmp, *dst_tmp;
2292  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293 
2294  for (cnt = (width >> 4); cnt--;) {
2295  src_tmp = src;
2296  dst_tmp = dst;
2297 
2298  for (loop_cnt = (height >> 3); loop_cnt--;) {
2299  LD_UB8(src_tmp, src_stride,
2300  src0, src1, src2, src3, src4, src5, src6, src7);
2301  src_tmp += (8 * src_stride);
2302 
2303  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304  dst_tmp, dst_stride);
2305  dst_tmp += (8 * dst_stride);
2306  }
2307 
2308  src += 16;
2309  dst += 16;
2310  }
2311 }
2312 
2313 void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314  uint8_t *src, ptrdiff_t src_stride,
2315  int height, int mx, int my)
2316 {
2317  int32_t cnt;
2318  v16u8 src0, src1, src2, src3;
2319 
2320  if (0 == height % 8) {
2321  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322  } else if (0 == height % 4) {
2323  for (cnt = (height >> 2); cnt--;) {
2324  LD_UB4(src, src_stride, src0, src1, src2, src3);
2325  src += (4 * src_stride);
2326 
2327  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328  dst += (4 * dst_stride);
2329  }
2330  }
2331 }
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1184
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:742
void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1644
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
Definition: vp8_mc_msa.c:143
#define SPLATI_H3_SH(...)
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1477
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1515
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:707
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1751
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
#define LD_SB(...)
#define XORI_B3_128_SB(...)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
Definition: vp8_mc_msa.c:107
void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2243
#define SLDI_B3_UH(...)
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1043
void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:807
#define LD_UB4(...)
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2,filt0, filt1, filt2,out0, out1)
Definition: vp8_mc_msa.c:71
#define ILVR_B2_SB(...)
#define SPLATI_H2_SH(...)
#define src
Definition: vp8dsp.c:254
#define LD_SB2(...)
static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1798
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2016
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
#define XORI_B4_128_UB(...)
static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:2032
void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:379
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:984
#define PCKEV_ST_SB(in0, in1, pdst)
#define ILVR_D2_SB(...)
uint8_t
void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:842
void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:332
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
VP8 compatible video decoder.
static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1725
#define SRARI_H4_SH(...)
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1822
#define XORI_B2_128_UB(...)
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:665
#define ILVL_B2_SB(...)
#define height
#define LD_SH(...)
static const int8_t bilinear_filters_msa[7][2]
Definition: vp8_mc_msa.c:44
#define LD_UB5(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2172
static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1492
#define PCKEV_B2_SB(...)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
Definition: vp8_mc_msa.c:2286
void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1867
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp8_mc_msa.c:2076
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1169
static const int8_t subpel_filters_msa[7][8]
Definition: vp8_mc_msa.c:34
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:938
static const uint16_t mask[17]
Definition: lzw.c:38
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:680
#define SPLATI_H2_SB(...)
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:578
#define XORI_B4_128_SB(...)
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,filt_h0, filt_h1, filt_h2)
Definition: vp8_mc_msa.c:54
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
Definition: vp8_mc_msa.c:131
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1544
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1785
void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2313
#define SRARI_H2_SH(...)
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1402
#define ILVR_B4_UB(...)
void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2156
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define LD_UB8(...)
#define width
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp8_mc_msa.c:25
#define SRARI_H2_UH(...)
void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:503
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1658
#define VSHF_B2_UH(...)
int32_t
#define PCKEV_B4_SB(...)
#define LD_SB3(...)
void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:429
#define ST_UB(...)
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
Definition: vp8_mc_msa.c:117
#define SAT_SH4_SH(...)
#define LD_SB4(...)
#define PCKEV_B4_UB(...)
static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:159
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:792
#define ST_UB8(...)
#define ST_UB4(...)
void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:283
#define src1
Definition: h264pred.c:139
#define ILVL_B4_SB(...)
#define SAT_SH2_SH(...)
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2,filt0, filt1, filt2,out0, out1, out2, out3)
Definition: vp8_mc_msa.c:86
void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:235
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1102
#define DOTP_UB2_UH(...)
#define SRARI_H4_UH(...)
#define src0
Definition: h264pred.c:138
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1581
#define SD4(in0, in1, in2, in3, pdst, stride)
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1323
#define LD_SB5(...)
#define ILVEV_B2_SH(...)
#define ILVEV_B2_UB(...)
#define ILVL_B2_UB(...)
#define SAT_UH2_UH(...)
#define SAT_UH4_UH(...)
#define LD_UB(...)
#define SPLATI_H3_SB(...)
#define DOTP_UB4_UH(...)
#define VSHF_B2_UB(...)
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:187
#define ILVR_B4_SB(...)
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1250
FILE *out
Definition: movenc.c:54
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1969
void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1881
static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1557
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:891
#define ST_W2(in, idx0, idx1, pdst, stride)
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:222
#define LD_UH(...)
#define PCKEV_B2_UB(...)
#define ILVR_B2_UB(...)
static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1935
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1338