FFmpeg
vp8_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp8dsp.h"
23 #include "vp8dsp_mips.h"
24 
/*
 * Byte-shuffle index tables, three rows of 16 entries each.  A row is loaded
 * as one vector and lists overlapping byte pairs (i, i + 1).  The horizontal
 * filters derive their further masks by adding 2 and 4 to every entry.
 */
static const uint8_t mc_filt_mask_arr[3 * 16] = {
    /* row 0 (offset 0): loaded by the 8- and 16-pixel-wide paths */
    0, 1,  1, 2,  2, 3,  3, 4,  4, 5,  5, 6,  6, 7,  7, 8,

    /* row 1 (offset 16): loaded by the 4-pixel-wide paths; entries >= 16
     * presumably address the second shuffle source -- confirm against
     * the VSHF macro definition */
    0, 1,  1, 2,  2, 3,  3, 4,  16, 17,  17, 18,  18, 19,  19, 20,

    /* row 2 (offset 32): further 4-pixel-wide mask */
    8, 9,  9, 10,  10, 11,  11, 12,  24, 25,  25, 26,  26, 27,  27, 28
};
33 
/*
 * VP8 sub-pixel filter taps, indexed by (mx - 1) or (my - 1).
 *
 * Rows with four non-zero taps are consumed by the 4-tap code paths (only the
 * first two 16-bit lanes are splatted), rows with six taps by the 6-tap paths
 * (first three 16-bit lanes).
 *
 * Every user fetches a row with LD_SH() into a v8i16, which presumably reads
 * a full 16-byte vector although a row holds only 8 bytes.  For the last real
 * row that load would run 8 bytes past the end of the table, so one all-zero
 * padding row is appended to keep the load inside the object.  The padding
 * row is never selected as a filter.
 */
static const int8_t subpel_filters_msa[7 + 1][8] = {
    { -6, 123,  12,  -1,   0, 0, 0, 0 },
    {  2, -11, 108,  36,  -8, 1, 0, 0 },  /* New 1/4 pel 6 tap filter */
    { -9,  93,  50,  -6,   0, 0, 0, 0 },
    {  3, -16,  77,  77, -16, 3, 0, 0 },  /* New 1/2 pel 6 tap filter */
    { -6,  50,  93,  -9,   0, 0, 0, 0 },
    {  1,  -8,  36, 108, -11, 2, 0, 0 },  /* New 1/4 pel 6 tap filter */
    { -1,  12, 123,  -6,   0, 0, 0, 0 },
    {  0,   0,   0,   0,   0, 0, 0, 0 },  /* padding for 16-byte row loads */
};
43 
/*
 * Two-tap bilinear weights.  Row k (0-based) is
 * { 128 - 16 * (k + 1), 16 * (k + 1) }, so every row sums to 128.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 },
};
53 
/*
 * Horizontal 6-tap filter over one source pair, yielding a v8i16 value.
 *
 * The bytes of (src0, src1) are gathered three times, once per shuffle mask.
 * The three gathered vectors are combined with filt_h0..filt_h2 through
 * DPADD_SH3_SH, then the sum is rounded by a right shift of 7 and saturated.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                    \
                        filt_h0, filt_h1, filt_h2)                          \
( {                                                                         \
    v16i8 tap01_m, tap23_m, tap45_m;                                        \
    v8i16 acc_m;                                                            \
                                                                            \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,     \
               tap01_m, tap23_m, tap45_m);                                  \
    acc_m = DPADD_SH3_SH(tap01_m, tap23_m, tap45_m,                         \
                         filt_h0, filt_h1, filt_h2);                        \
                                                                            \
    __msa_sat_s_h(__msa_srari_h(acc_m, 7), 7);                              \
} )
70 
/*
 * Horizontal 6-tap multiply-accumulate for four source vectors, 4-wide case.
 *
 * Sources are shuffled in pairs (src0 with src1, src2 with src3), once per
 * mask.  out0/out1 receive the unrounded sums: a dot product with filt0,
 * then accumulations with filt1 and filt2.  Rounding and saturation are left
 * to the caller.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2,                     \
                                   filt0, filt1, filt2,                     \
                                   out0, out1)                              \
{                                                                           \
    v16i8 a01_m, a23_m;   /* gathered with mask0 */                         \
    v16i8 b01_m, b23_m;   /* gathered with mask1 */                         \
    v16i8 c01_m, c23_m;   /* gathered with mask2 */                         \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, a01_m, a23_m);         \
    DOTP_SB2_SH(a01_m, a23_m, filt0, filt0, out0, out1);                    \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, b01_m, b23_m);         \
    DPADD_SB2_SH(b01_m, b23_m, filt1, filt1, out0, out1);                   \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, c01_m, c23_m);         \
    DPADD_SB2_SH(c01_m, c23_m, filt2, filt2, out0, out1);                   \
}
85 
/*
 * Horizontal 6-tap multiply-accumulate for four source vectors, 8-wide case.
 *
 * Each source is shuffled against itself, once per mask.  out0..out3 receive
 * the unrounded sums: a dot product with filt0, then accumulations with filt1
 * and filt2.  Rounding and saturation are left to the caller.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2,                     \
                                   filt0, filt1, filt2,                     \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 a0_m, a1_m, a2_m, a3_m;   /* gathered with mask0 */               \
    v16i8 b0_m, b1_m, b2_m, b3_m;   /* gathered with mask1 */               \
    v16i8 c0_m, c1_m, c2_m, c3_m;   /* gathered with mask2 */               \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, a0_m, a1_m);           \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, a2_m, a3_m);           \
    DOTP_SB4_SH(a0_m, a1_m, a2_m, a3_m, filt0, filt0, filt0, filt0,         \
                out0, out1, out2, out3);                                    \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, b0_m, b1_m);           \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, b2_m, b3_m);           \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, c0_m, c1_m);           \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, c2_m, c3_m);           \
                                                                            \
    DPADD_SB4_SH(b0_m, b1_m, b2_m, b3_m, filt1, filt1, filt1, filt1,        \
                 out0, out1, out2, out3);                                   \
    DPADD_SB4_SH(c0_m, c1_m, c2_m, c3_m, filt2, filt2, filt2, filt2,        \
                 out0, out1, out2, out3);                                   \
}
106 
/*
 * 4-tap multiply-accumulate: signed byte dot product of vec0 with filt0,
 * followed by a dot-product accumulation of vec1 with filt1.  Evaluates to
 * the unrounded result of __msa_dpadd_s_h.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                       \
    (__msa_dpadd_s_h(__msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0)),       \
                     (v16i8) (vec1), (v16i8) (filt1)))
116 
/*
 * Horizontal 4-tap filter over one source pair, yielding a v8i16 value.
 *
 * (src0, src1) is gathered once with mask0 and once with mask1.  The two
 * gathered vectors are combined with filt_h0/filt_h1 through
 * FILT_4TAP_DPADD_S_H, then the sum is rounded by a right shift of 7 and
 * saturated.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)         \
( {                                                                         \
    v16i8 tap01_m, tap23_m;                                                 \
    v8i16 acc_m;                                                            \
                                                                            \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, tap01_m, tap23_m);     \
    acc_m = FILT_4TAP_DPADD_S_H(tap01_m, tap23_m, filt_h0, filt_h1);        \
                                                                            \
    __msa_sat_s_h(__msa_srari_h(acc_m, 7), 7);                              \
} )
130 
/*
 * Horizontal 4-tap multiply-accumulate for four source vectors, 4-wide case.
 *
 * Sources are shuffled in pairs (src0 with src1, src2 with src3), once per
 * mask.  out0/out1 receive the unrounded sums: a dot product with filt0 plus
 * an accumulation with filt1.  Rounding and saturation are left to the caller.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 a01_m, a23_m;   /* gathered with mask0 */                         \
    v16i8 b01_m, b23_m;   /* gathered with mask1 */                         \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, a01_m, a23_m);         \
    DOTP_SB2_SH(a01_m, a23_m, filt0, filt0, out0, out1);                    \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, b01_m, b23_m);         \
    DPADD_SB2_SH(b01_m, b23_m, filt1, filt1, out0, out1);                   \
}
142 
/*
 * Horizontal 4-tap multiply-accumulate for four source vectors, 8-wide case.
 *
 * Each source is shuffled against itself, once per mask.  out0..out3 receive
 * the unrounded sums: a dot product with filt0 plus an accumulation with
 * filt1.  Rounding and saturation are left to the caller.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 a0_m, a1_m, a2_m, a3_m;   /* gathered with mask0 */               \
    v16i8 b0_m, b1_m, b2_m, b3_m;   /* gathered with mask1 */               \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, a0_m, a1_m);           \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, a2_m, a3_m);           \
    DOTP_SB4_SH(a0_m, a1_m, a2_m, a3_m, filt0, filt0, filt0, filt0,         \
                out0, out1, out2, out3);                                    \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, b0_m, b1_m);           \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, b2_m, b3_m);           \
    DPADD_SB4_SH(b0_m, b1_m, b2_m, b3_m, filt1, filt1, filt1, filt1,        \
                 out0, out1, out2, out3);                                   \
}
158 
159 static void common_hz_6t_4x4_msa(const uint8_t *src, int32_t src_stride,
160  uint8_t *dst, int32_t dst_stride,
161  const int8_t *filter)
162 {
163  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164  v16u8 mask0, mask1, mask2, out;
165  v8i16 filt, out0, out1;
166 
167  mask0 = LD_UB(&mc_filt_mask_arr[16]);
168  src -= 2;
169 
170  /* rearranging filter */
171  filt = LD_SH(filter);
172  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173 
174  mask1 = mask0 + 2;
175  mask2 = mask0 + 4;
176 
177  LD_SB4(src, src_stride, src0, src1, src2, src3);
178  XORI_B4_128_SB(src0, src1, src2, src3);
179  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180  filt0, filt1, filt2, out0, out1);
181  SRARI_H2_SH(out0, out1, 7);
182  SAT_SH2_SH(out0, out1, 7);
183  out = PCKEV_XORI128_UB(out0, out1);
184  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185 }
186 
187 static void common_hz_6t_4x8_msa(const uint8_t *src, int32_t src_stride,
188  uint8_t *dst, int32_t dst_stride,
189  const int8_t *filter)
190 {
191  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192  v16u8 mask0, mask1, mask2, out;
193  v8i16 filt, out0, out1, out2, out3;
194 
195  mask0 = LD_UB(&mc_filt_mask_arr[16]);
196  src -= 2;
197 
198  /* rearranging filter */
199  filt = LD_SH(filter);
200  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201 
202  mask1 = mask0 + 2;
203  mask2 = mask0 + 4;
204 
205  LD_SB4(src, src_stride, src0, src1, src2, src3);
206  XORI_B4_128_SB(src0, src1, src2, src3);
207  src += (4 * src_stride);
208  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209  filt0, filt1, filt2, out0, out1);
210  LD_SB4(src, src_stride, src0, src1, src2, src3);
211  XORI_B4_128_SB(src0, src1, src2, src3);
212  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213  filt0, filt1, filt2, out2, out3);
214  SRARI_H4_SH(out0, out1, out2, out3, 7);
215  SAT_SH4_SH(out0, out1, out2, out3, 7);
216  out = PCKEV_XORI128_UB(out0, out1);
217  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218  out = PCKEV_XORI128_UB(out2, out3);
219  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220 }
221 
222 void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223  const uint8_t *src, ptrdiff_t src_stride,
224  int height, int mx, int my)
225 {
226  const int8_t *filter = subpel_filters_msa[mx - 1];
227 
228  if (4 == height) {
229  common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230  } else if (8 == height) {
231  common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232  }
233 }
234 
235 void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236  const uint8_t *src, ptrdiff_t src_stride,
237  int height, int mx, int my)
238 {
239  uint32_t loop_cnt;
240  const int8_t *filter = subpel_filters_msa[mx - 1];
241  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242  v16u8 mask0, mask1, mask2, tmp0, tmp1;
243  v8i16 filt, out0, out1, out2, out3;
244 
245  mask0 = LD_UB(&mc_filt_mask_arr[0]);
246 
247  src -= 2;
248 
249  /* rearranging filter */
250  filt = LD_SH(filter);
251  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252 
253  mask1 = mask0 + 2;
254  mask2 = mask0 + 4;
255 
256  LD_SB4(src, src_stride, src0, src1, src2, src3);
257  XORI_B4_128_SB(src0, src1, src2, src3);
258  src += (4 * src_stride);
259  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260  filt0, filt1, filt2, out0, out1, out2, out3);
261  SRARI_H4_SH(out0, out1, out2, out3, 7);
262  SAT_SH4_SH(out0, out1, out2, out3, 7);
263  tmp0 = PCKEV_XORI128_UB(out0, out1);
264  tmp1 = PCKEV_XORI128_UB(out2, out3);
265  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266  dst += (4 * dst_stride);
267 
268  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269  LD_SB4(src, src_stride, src0, src1, src2, src3);
270  XORI_B4_128_SB(src0, src1, src2, src3);
271  src += (4 * src_stride);
272  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273  filt0, filt1, filt2, out0, out1, out2, out3);
274  SRARI_H4_SH(out0, out1, out2, out3, 7);
275  SAT_SH4_SH(out0, out1, out2, out3, 7);
276  tmp0 = PCKEV_XORI128_UB(out0, out1);
277  tmp1 = PCKEV_XORI128_UB(out2, out3);
278  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279  dst += (4 * dst_stride);
280  }
281 }
282 
283 void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284  const uint8_t *src, ptrdiff_t src_stride,
285  int height, int mx, int my)
286 {
287  uint32_t loop_cnt;
288  const int8_t *filter = subpel_filters_msa[mx - 1];
289  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290  v16u8 mask0, mask1, mask2, out;
291  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292 
293  mask0 = LD_UB(&mc_filt_mask_arr[0]);
294  src -= 2;
295 
296  /* rearranging filter */
297  filt = LD_SH(filter);
298  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299 
300  mask1 = mask0 + 2;
301  mask2 = mask0 + 4;
302 
303  for (loop_cnt = (height >> 2); loop_cnt--;) {
304  LD_SB4(src, src_stride, src0, src2, src4, src6);
305  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307  src += (4 * src_stride);
308 
309  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310  filt0, filt1, filt2, out0, out1, out2, out3);
311  HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312  filt0, filt1, filt2, out4, out5, out6, out7);
313  SRARI_H4_SH(out0, out1, out2, out3, 7);
314  SRARI_H4_SH(out4, out5, out6, out7, 7);
315  SAT_SH4_SH(out0, out1, out2, out3, 7);
316  SAT_SH4_SH(out4, out5, out6, out7, 7);
317  out = PCKEV_XORI128_UB(out0, out1);
318  ST_UB(out, dst);
319  dst += dst_stride;
320  out = PCKEV_XORI128_UB(out2, out3);
321  ST_UB(out, dst);
322  dst += dst_stride;
323  out = PCKEV_XORI128_UB(out4, out5);
324  ST_UB(out, dst);
325  dst += dst_stride;
326  out = PCKEV_XORI128_UB(out6, out7);
327  ST_UB(out, dst);
328  dst += dst_stride;
329  }
330 }
331 
332 void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333  const uint8_t *src, ptrdiff_t src_stride,
334  int height, int mx, int my)
335 {
336  uint32_t loop_cnt;
337  const int8_t *filter = subpel_filters_msa[my - 1];
338  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341  v16u8 out;
342  v8i16 filt, out10, out32;
343 
344  src -= (2 * src_stride);
345 
346  filt = LD_SH(filter);
347  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348 
349  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350  src += (5 * src_stride);
351 
352  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353  src32_r, src43_r);
354  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355  XORI_B2_128_SB(src2110, src4332);
356 
357  for (loop_cnt = (height >> 2); loop_cnt--;) {
358  LD_SB4(src, src_stride, src5, src6, src7, src8);
359  src += (4 * src_stride);
360 
361  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362  src65_r, src76_r, src87_r);
363  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364  XORI_B2_128_SB(src6554, src8776);
365  out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366  out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367  SRARI_H2_SH(out10, out32, 7);
368  SAT_SH2_SH(out10, out32, 7);
369  out = PCKEV_XORI128_UB(out10, out32);
370  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371  dst += (4 * dst_stride);
372 
373  src2110 = src6554;
374  src4332 = src8776;
375  src4 = src8;
376  }
377 }
378 
379 void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380  const uint8_t *src, ptrdiff_t src_stride,
381  int height, int mx, int my)
382 {
383  uint32_t loop_cnt;
384  const int8_t *filter = subpel_filters_msa[my - 1];
385  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387  v16i8 src109_r, filt0, filt1, filt2;
388  v16u8 tmp0, tmp1;
389  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390 
391  src -= (2 * src_stride);
392 
393  filt = LD_SH(filter);
394  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395 
396  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397  src += (5 * src_stride);
398 
399  XORI_B5_128_SB(src0, src1, src2, src3, src4);
400  ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401  src10_r, src32_r, src21_r, src43_r);
402 
403  for (loop_cnt = (height >> 2); loop_cnt--;) {
404  LD_SB4(src, src_stride, src7, src8, src9, src10);
405  XORI_B4_128_SB(src7, src8, src9, src10);
406  src += (4 * src_stride);
407 
408  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409  src87_r, src98_r, src109_r);
410  out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411  out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412  out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413  out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419  dst += (4 * dst_stride);
420 
421  src10_r = src76_r;
422  src32_r = src98_r;
423  src21_r = src87_r;
424  src43_r = src109_r;
425  src4 = src10;
426  }
427 }
428 
429 void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430  const uint8_t *src, ptrdiff_t src_stride,
431  int height, int mx, int my)
432 {
433  uint32_t loop_cnt;
434  const int8_t *filter = subpel_filters_msa[my - 1];
435  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438  v16i8 src65_l, src87_l, filt0, filt1, filt2;
439  v16u8 tmp0, tmp1, tmp2, tmp3;
440  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441 
442  src -= (2 * src_stride);
443 
444  filt = LD_SH(filter);
445  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446 
447  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448  src += (5 * src_stride);
449 
450  XORI_B5_128_SB(src0, src1, src2, src3, src4);
451  ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452  src32_r, src43_r, src21_r);
453  ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454  src32_l, src43_l, src21_l);
455 
456  for (loop_cnt = (height >> 2); loop_cnt--;) {
457  LD_SB4(src, src_stride, src5, src6, src7, src8);
458  src += (4 * src_stride);
459 
460  XORI_B4_128_SB(src5, src6, src7, src8);
461  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462  src65_r, src76_r, src87_r);
463  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464  src65_l, src76_l, src87_l);
465  out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466  filt2);
467  out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468  filt2);
469  out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470  filt2);
471  out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472  filt2);
473  out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474  filt2);
475  out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476  filt2);
477  out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478  filt2);
479  out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480  filt2);
481  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486  out3_r, tmp0, tmp1, tmp2, tmp3);
487  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489  dst += (4 * dst_stride);
490 
491  src10_r = src54_r;
492  src32_r = src76_r;
493  src21_r = src65_r;
494  src43_r = src87_r;
495  src10_l = src54_l;
496  src32_l = src76_l;
497  src21_l = src65_l;
498  src43_l = src87_l;
499  src4 = src8;
500  }
501 }
502 
503 void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504  const uint8_t *src, ptrdiff_t src_stride,
505  int height, int mx, int my)
506 {
507  uint32_t loop_cnt;
508  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509  const int8_t *filter_vert = subpel_filters_msa[my - 1];
510  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511  v16i8 filt_hz0, filt_hz1, filt_hz2;
512  v16u8 mask0, mask1, mask2, out;
513  v8i16 tmp0, tmp1;
514  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516 
517  mask0 = LD_UB(&mc_filt_mask_arr[16]);
518  src -= (2 + 2 * src_stride);
519 
520  /* rearranging filter */
521  filt = LD_SH(filter_horiz);
522  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523 
524  filt = LD_SH(filter_vert);
525  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526 
527  mask1 = mask0 + 2;
528  mask2 = mask0 + 4;
529 
530  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531  src += (5 * src_stride);
532 
533  XORI_B5_128_SB(src0, src1, src2, src3, src4);
534  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535  filt_hz1, filt_hz2);
536  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537  filt_hz1, filt_hz2);
538  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540  filt_hz1, filt_hz2);
541  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542 
543  for (loop_cnt = (height >> 2); loop_cnt--;) {
544  LD_SB2(src, src_stride, src5, src6);
545  src += (2 * src_stride);
546 
547  XORI_B2_128_SB(src5, src6);
548  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549  filt_hz1, filt_hz2);
550  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551 
552  LD_SB2(src, src_stride, src7, src8);
553  src += (2 * src_stride);
554 
555  XORI_B2_128_SB(src7, src8);
556  hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557  filt_hz1, filt_hz2);
558  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559 
560  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562 
563  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565 
566  SRARI_H2_SH(tmp0, tmp1, 7);
567  SAT_SH2_SH(tmp0, tmp1, 7);
568  out = PCKEV_XORI128_UB(tmp0, tmp1);
569  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570  dst += (4 * dst_stride);
571 
572  hz_out3 = hz_out7;
573  out0 = out2;
574  out1 = out3;
575  }
576 }
577 
578 void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
579  const uint8_t *src, ptrdiff_t src_stride,
580  int height, int mx, int my)
581 {
582  uint32_t loop_cnt;
583  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
584  const int8_t *filter_vert = subpel_filters_msa[my - 1];
585  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
586  v16i8 filt_hz0, filt_hz1, filt_hz2;
587  v16u8 mask0, mask1, mask2, vec0, vec1;
588  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
589  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591  v8i16 tmp0, tmp1, tmp2, tmp3;
592 
593  mask0 = LD_UB(&mc_filt_mask_arr[0]);
594  src -= (2 + 2 * src_stride);
595 
596  /* rearranging filter */
597  filt = LD_SH(filter_horiz);
598  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
599 
600  mask1 = mask0 + 2;
601  mask2 = mask0 + 4;
602 
603  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604  src += (5 * src_stride);
605 
606  XORI_B5_128_SB(src0, src1, src2, src3, src4);
607  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
608  filt_hz1, filt_hz2);
609  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
610  filt_hz1, filt_hz2);
611  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
612  filt_hz1, filt_hz2);
613  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
614  filt_hz1, filt_hz2);
615  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
616  filt_hz1, filt_hz2);
617 
618  filt = LD_SH(filter_vert);
619  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
620 
621  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
623 
624  for (loop_cnt = (height >> 2); loop_cnt--;) {
625  LD_SB4(src, src_stride, src5, src6, src7, src8);
626  src += (4 * src_stride);
627 
628  XORI_B4_128_SB(src5, src6, src7, src8);
629  hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
630  filt_hz1, filt_hz2);
631  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
633 
634  hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
635  filt_hz1, filt_hz2);
636  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
638 
639  hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
640  filt_hz1, filt_hz2);
641  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642  tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
643 
644  hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
645  filt_hz1, filt_hz2);
646  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647  tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
648 
649  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
650  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
651  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
652  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
653  ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654  dst += (4 * dst_stride);
655 
656  hz_out4 = hz_out8;
657  out0 = out2;
658  out1 = out7;
659  out3 = out5;
660  out4 = out6;
661  }
662 }
663 
664 
/*
 * 16-pixel-wide 6-tap horizontal + vertical filter, built from two calls of
 * the 8-pixel-wide variant: one at column offset 0 and one at offset 8.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* columns 0..7 */
    ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* columns 8..15 */
    ff_put_vp8_epel8_h6v6_msa(dst + 8, dst_stride, src + 8, src_stride, height,
                              mx, my);
}
679 
680 static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride,
681  uint8_t *dst, int32_t dst_stride,
682  const int8_t *filter)
683 {
684  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685  v8i16 filt, out0, out1;
686  v16u8 out;
687 
688  mask0 = LD_SB(&mc_filt_mask_arr[16]);
689  src -= 1;
690 
691  /* rearranging filter */
692  filt = LD_SH(filter);
693  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694 
695  mask1 = mask0 + 2;
696 
697  LD_SB4(src, src_stride, src0, src1, src2, src3);
698  XORI_B4_128_SB(src0, src1, src2, src3);
699  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700  filt0, filt1, out0, out1);
701  SRARI_H2_SH(out0, out1, 7);
702  SAT_SH2_SH(out0, out1, 7);
703  out = PCKEV_XORI128_UB(out0, out1);
704  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705 }
706 
707 static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride,
708  uint8_t *dst, int32_t dst_stride,
709  const int8_t *filter)
710 {
711  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712  v16u8 out;
713  v8i16 filt, out0, out1, out2, out3;
714 
715  mask0 = LD_SB(&mc_filt_mask_arr[16]);
716  src -= 1;
717 
718  /* rearranging filter */
719  filt = LD_SH(filter);
720  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721 
722  mask1 = mask0 + 2;
723 
724  LD_SB4(src, src_stride, src0, src1, src2, src3);
725  src += (4 * src_stride);
726 
727  XORI_B4_128_SB(src0, src1, src2, src3);
728  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729  filt0, filt1, out0, out1);
730  LD_SB4(src, src_stride, src0, src1, src2, src3);
731  XORI_B4_128_SB(src0, src1, src2, src3);
732  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733  filt0, filt1, out2, out3);
734  SRARI_H4_SH(out0, out1, out2, out3, 7);
735  SAT_SH4_SH(out0, out1, out2, out3, 7);
736  out = PCKEV_XORI128_UB(out0, out1);
737  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738  out = PCKEV_XORI128_UB(out2, out3);
739  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740 }
741 
742 static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride,
743  uint8_t *dst, int32_t dst_stride,
744  const int8_t *filter)
745 {
746  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747  v16i8 filt0, filt1, mask0, mask1;
748  v16u8 out;
749  v8i16 filt, out0, out1, out2, out3;
750 
751  mask0 = LD_SB(&mc_filt_mask_arr[16]);
752  src -= 1;
753 
754  /* rearranging filter */
755  filt = LD_SH(filter);
756  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757 
758  mask1 = mask0 + 2;
759 
760  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761  src += (8 * src_stride);
762  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764  filt0, filt1, out0, out1);
765  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766  filt0, filt1, out2, out3);
767  SRARI_H4_SH(out0, out1, out2, out3, 7);
768  SAT_SH4_SH(out0, out1, out2, out3, 7);
769  out = PCKEV_XORI128_UB(out0, out1);
770  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771  dst += (4 * dst_stride);
772  out = PCKEV_XORI128_UB(out2, out3);
773  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774  dst += (4 * dst_stride);
775 
776  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777  src += (8 * src_stride);
778  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780  filt0, filt1, out0, out1);
781  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782  filt0, filt1, out2, out3);
783  SRARI_H4_SH(out0, out1, out2, out3, 7);
784  SAT_SH4_SH(out0, out1, out2, out3, 7);
785  out = PCKEV_XORI128_UB(out0, out1);
786  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787  dst += (4 * dst_stride);
788  out = PCKEV_XORI128_UB(out2, out3);
789  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790 }
791 
792 void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793  const uint8_t *src, ptrdiff_t src_stride,
794  int height, int mx, int my)
795 {
796  const int8_t *filter = subpel_filters_msa[mx - 1];
797 
798  if (4 == height) {
799  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800  } else if (8 == height) {
801  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802  } else if (16 == height) {
803  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804  }
805 }
806 
807 void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808  const uint8_t *src, ptrdiff_t src_stride,
809  int height, int mx, int my)
810 {
811  uint32_t loop_cnt;
812  const int8_t *filter = subpel_filters_msa[mx - 1];
813  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814  v16u8 tmp0, tmp1;
815  v8i16 filt, out0, out1, out2, out3;
816 
817  mask0 = LD_SB(&mc_filt_mask_arr[0]);
818  src -= 1;
819 
820  /* rearranging filter */
821  filt = LD_SH(filter);
822  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823 
824  mask1 = mask0 + 2;
825 
826  for (loop_cnt = (height >> 2); loop_cnt--;) {
827  LD_SB4(src, src_stride, src0, src1, src2, src3);
828  src += (4 * src_stride);
829 
830  XORI_B4_128_SB(src0, src1, src2, src3);
831  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832  filt1, out0, out1, out2, out3);
833  SRARI_H4_SH(out0, out1, out2, out3, 7);
834  SAT_SH4_SH(out0, out1, out2, out3, 7);
835  tmp0 = PCKEV_XORI128_UB(out0, out1);
836  tmp1 = PCKEV_XORI128_UB(out2, out3);
837  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838  dst += (4 * dst_stride);
839  }
840 }
841 
842 void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843  const uint8_t *src, ptrdiff_t src_stride,
844  int height, int mx, int my)
845 {
846  uint32_t loop_cnt;
847  const int8_t *filter = subpel_filters_msa[mx - 1];
848  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849  v16i8 filt0, filt1, mask0, mask1;
850  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851  v16u8 out;
852 
853  mask0 = LD_SB(&mc_filt_mask_arr[0]);
854  src -= 1;
855 
856  /* rearranging filter */
857  filt = LD_SH(filter);
858  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859 
860  mask1 = mask0 + 2;
861 
862  for (loop_cnt = (height >> 2); loop_cnt--;) {
863  LD_SB4(src, src_stride, src0, src2, src4, src6);
864  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865  src += (4 * src_stride);
866 
867  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869  filt1, out0, out1, out2, out3);
870  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871  filt1, out4, out5, out6, out7);
872  SRARI_H4_SH(out0, out1, out2, out3, 7);
873  SRARI_H4_SH(out4, out5, out6, out7, 7);
874  SAT_SH4_SH(out0, out1, out2, out3, 7);
875  SAT_SH4_SH(out4, out5, out6, out7, 7);
876  out = PCKEV_XORI128_UB(out0, out1);
877  ST_UB(out, dst);
878  dst += dst_stride;
879  out = PCKEV_XORI128_UB(out2, out3);
880  ST_UB(out, dst);
881  dst += dst_stride;
882  out = PCKEV_XORI128_UB(out4, out5);
883  ST_UB(out, dst);
884  dst += dst_stride;
885  out = PCKEV_XORI128_UB(out6, out7);
886  ST_UB(out, dst);
887  dst += dst_stride;
888  }
889 }
890 
891 void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
892  const uint8_t *src, ptrdiff_t src_stride,
893  int height, int mx, int my)
894 {
895  uint32_t loop_cnt;
896  const int8_t *filter = subpel_filters_msa[my - 1];
897  v16i8 src0, src1, src2, src3, src4, src5;
898  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899  v16i8 src2110, src4332, filt0, filt1;
900  v8i16 filt, out10, out32;
901  v16u8 out;
902 
903  src -= src_stride;
904 
905  filt = LD_SH(filter);
906  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
907 
908  LD_SB3(src, src_stride, src0, src1, src2);
909  src += (3 * src_stride);
910 
911  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
912 
913  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
915 
916  for (loop_cnt = (height >> 2); loop_cnt--;) {
917  LD_SB3(src, src_stride, src3, src4, src5);
918  src += (3 * src_stride);
919  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
920  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
922  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
923 
924  src2 = LD_SB(src);
925  src += (src_stride);
926  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
927  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
929  out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
930  SRARI_H2_SH(out10, out32, 7);
931  SAT_SH2_SH(out10, out32, 7);
932  out = PCKEV_XORI128_UB(out10, out32);
933  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
934  dst += (4 * dst_stride);
935  }
936 }
937 
938 void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
939  const uint8_t *src, ptrdiff_t src_stride,
940  int height, int mx, int my)
941 {
942  uint32_t loop_cnt;
943  const int8_t *filter = subpel_filters_msa[my - 1];
944  v16i8 src0, src1, src2, src7, src8, src9, src10;
945  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
946  v16u8 tmp0, tmp1;
947  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
948 
949  src -= src_stride;
950 
951  filt = LD_SH(filter);
952  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
953 
954  LD_SB3(src, src_stride, src0, src1, src2);
955  src += (3 * src_stride);
956 
958  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
959 
960  for (loop_cnt = (height >> 2); loop_cnt--;) {
961  LD_SB4(src, src_stride, src7, src8, src9, src10);
962  src += (4 * src_stride);
963 
964  XORI_B4_128_SB(src7, src8, src9, src10);
965  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966  src72_r, src87_r, src98_r, src109_r);
967  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
968  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
969  out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
970  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
971  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
972  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
973  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
974  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
975  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976  dst += (4 * dst_stride);
977 
978  src10_r = src98_r;
979  src21_r = src109_r;
980  src2 = src10;
981  }
982 }
983 
984 void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
985  const uint8_t *src, ptrdiff_t src_stride,
986  int height, int mx, int my)
987 {
988  uint32_t loop_cnt;
989  const int8_t *filter = subpel_filters_msa[my - 1];
990  v16i8 src0, src1, src2, src3, src4, src5, src6;
991  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993  v16u8 tmp0, tmp1, tmp2, tmp3;
994  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
995 
996  src -= src_stride;
997 
998  filt = LD_SH(filter);
999  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1000 
1001  LD_SB3(src, src_stride, src0, src1, src2);
1002  src += (3 * src_stride);
1003 
1005  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1007 
1008  for (loop_cnt = (height >> 2); loop_cnt--;) {
1009  LD_SB4(src, src_stride, src3, src4, src5, src6);
1010  src += (4 * src_stride);
1011 
1012  XORI_B4_128_SB(src3, src4, src5, src6);
1013  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014  src32_r, src43_r, src54_r, src65_r);
1015  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016  src32_l, src43_l, src54_l, src65_l);
1017  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
1018  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
1019  out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
1020  out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
1021  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
1022  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
1023  out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
1024  out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
1025  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1026  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1027  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030  out3_r, tmp0, tmp1, tmp2, tmp3);
1031  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1032  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033  dst += (4 * dst_stride);
1034 
1035  src10_r = src54_r;
1036  src21_r = src65_r;
1037  src10_l = src54_l;
1038  src21_l = src65_l;
1039  src2 = src6;
1040  }
1041 }
1042 
1043 void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1044  const uint8_t *src, ptrdiff_t src_stride,
1045  int height, int mx, int my)
1046 {
1047  uint32_t loop_cnt;
1048  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1049  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1050  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051  v16u8 mask0, mask1, out;
1052  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1054 
1055  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1056  src -= (1 + 1 * src_stride);
1057 
1058  /* rearranging filter */
1059  filt = LD_SH(filter_horiz);
1060  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1061 
1062  mask1 = mask0 + 2;
1063 
1064  LD_SB3(src, src_stride, src0, src1, src2);
1065  src += (3 * src_stride);
1066 
1068  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1069  hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1070  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1071 
1072  filt = LD_SH(filter_vert);
1073  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1074 
1075  for (loop_cnt = (height >> 2); loop_cnt--;) {
1076  LD_SB4(src, src_stride, src3, src4, src5, src6);
1077  src += (4 * src_stride);
1078 
1079  XORI_B2_128_SB(src3, src4);
1080  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1081  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1083  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1084 
1085  XORI_B2_128_SB(src5, src6);
1086  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1089  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1090 
1091  SRARI_H2_SH(tmp0, tmp1, 7);
1092  SAT_SH2_SH(tmp0, tmp1, 7);
1093  out = PCKEV_XORI128_UB(tmp0, tmp1);
1094  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1095  dst += (4 * dst_stride);
1096 
1097  hz_out1 = hz_out5;
1098  vec0 = vec2;
1099  }
1100 }
1101 
1102 void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1103  const uint8_t *src, ptrdiff_t src_stride,
1104  int height, int mx, int my)
1105 {
1106  uint32_t loop_cnt;
1107  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1108  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1109  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110  v16u8 mask0, mask1, out0, out1;
1111  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113  v8i16 vec0, vec1, vec2, vec3, vec4;
1114 
1115  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1116  src -= (1 + 1 * src_stride);
1117 
1118  /* rearranging filter */
1119  filt = LD_SH(filter_horiz);
1120  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1121 
1122  mask1 = mask0 + 2;
1123 
1124  LD_SB3(src, src_stride, src0, src1, src2);
1125  src += (3 * src_stride);
1126 
1128  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1129  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1130  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1131  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1132 
1133  filt = LD_SH(filter_vert);
1134  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1135 
1136  for (loop_cnt = (height >> 2); loop_cnt--;) {
1137  LD_SB4(src, src_stride, src3, src4, src5, src6);
1138  src += (4 * src_stride);
1139 
1140  XORI_B4_128_SB(src3, src4, src5, src6);
1141  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1143  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1144 
1145  hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1147  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1148 
1149  hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1151  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
1152 
1153  hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1154  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1155  tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1156 
1157  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1158  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1159  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1160  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1161  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162  dst += (4 * dst_stride);
1163 
1164  vec0 = vec4;
1165  vec2 = vec1;
1166  }
1167 }
1168 
/*
 * 16-pixel-wide 4-tap horizontal + 4-tap vertical prediction, done as two
 * independent 8-pixel-wide halves.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* columns 0..7 */
    ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);

    /* columns 8..15 */
    ff_put_vp8_epel8_h4v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1183 
1184 void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1185  const uint8_t *src, ptrdiff_t src_stride,
1186  int height, int mx, int my)
1187 {
1188  uint32_t loop_cnt;
1189  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1190  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1191  v16i8 src0, src1, src2, src3, src4, src5, src6;
1192  v16i8 filt_hz0, filt_hz1, filt_hz2;
1193  v16u8 res0, res1, mask0, mask1, mask2;
1194  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1196 
1197  mask0 = LD_UB(&mc_filt_mask_arr[16]);
1198  src -= (2 + 1 * src_stride);
1199 
1200  /* rearranging filter */
1201  filt = LD_SH(filter_horiz);
1202  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1203 
1204  mask1 = mask0 + 2;
1205  mask2 = mask0 + 4;
1206 
1207  LD_SB3(src, src_stride, src0, src1, src2);
1208  src += (3 * src_stride);
1209 
1211  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
1212  filt_hz1, filt_hz2);
1213  hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
1214  filt_hz1, filt_hz2);
1215  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1216 
1217  filt = LD_SH(filter_vert);
1218  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1219 
1220  for (loop_cnt = (height >> 2); loop_cnt--;) {
1221  LD_SB4(src, src_stride, src3, src4, src5, src6);
1222  src += (4 * src_stride);
1223 
1224  XORI_B4_128_SB(src3, src4, src5, src6);
1225  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
1226  filt_hz1, filt_hz2);
1227  hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1229  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1230 
1231  hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
1232  filt_hz1, filt_hz2);
1233  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1235  tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1236 
1237  SRARI_H2_SH(tmp0, tmp1, 7);
1238  SAT_SH2_SH(tmp0, tmp1, 7);
1239  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1240  XORI_B2_128_UB(res0, res1);
1241  ST_W2(res0, 0, 1, dst, dst_stride);
1242  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243  dst += (4 * dst_stride);
1244 
1245  hz_out1 = hz_out5;
1246  vec0 = vec2;
1247  }
1248 }
1249 
1250 void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
1251  const uint8_t *src, ptrdiff_t src_stride,
1252  int height, int mx, int my)
1253 {
1254  uint32_t loop_cnt;
1255  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1256  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1257  v16i8 src0, src1, src2, src3, src4, src5, src6;
1258  v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259  v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260  v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1261  v16u8 out0, out1;
1262 
1263  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1264  src -= (2 + src_stride);
1265 
1266  /* rearranging filter */
1267  filt = LD_SH(filter_horiz);
1268  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1269 
1270  mask1 = mask0 + 2;
1271  mask2 = mask0 + 4;
1272 
1273  LD_SB3(src, src_stride, src0, src1, src2);
1274  src += (3 * src_stride);
1275 
1277  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
1278  filt_hz1, filt_hz2);
1279  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
1280  filt_hz1, filt_hz2);
1281  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
1282  filt_hz1, filt_hz2);
1283  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1284 
1285  filt = LD_SH(filter_vert);
1286  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
1287 
1288  for (loop_cnt = (height >> 2); loop_cnt--;) {
1289  LD_SB4(src, src_stride, src3, src4, src5, src6);
1290  src += (4 * src_stride);
1291 
1292  XORI_B4_128_SB(src3, src4, src5, src6);
1293 
1294  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
1295  filt_hz1, filt_hz2);
1296  vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1297  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
1298 
1299  hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
1300  filt_hz1, filt_hz2);
1301  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1302  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
1303 
1304  hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
1305  filt_hz1, filt_hz2);
1306  vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1307  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
1308 
1309  hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
1310  filt_hz1, filt_hz2);
1311  ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1312  tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
1313 
1314  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1315  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1316  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
1317  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
1318  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319  dst += (4 * dst_stride);
1320  }
1321 }
1322 
/*
 * 16-pixel-wide 6-tap horizontal + 4-tap vertical prediction, done as two
 * independent 8-pixel-wide halves.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* columns 0..7 */
    ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);

    /* columns 8..15 */
    ff_put_vp8_epel8_h6v4_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1337 
1338 void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1339  const uint8_t *src, ptrdiff_t src_stride,
1340  int height, int mx, int my)
1341 {
1342  uint32_t loop_cnt;
1343  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1344  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1347  v16u8 out;
1348  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349  v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
1351 
1352  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1353 
1354  src -= (1 + 2 * src_stride);
1355 
1356  /* rearranging filter */
1357  filt = LD_SH(filter_horiz);
1358  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1359 
1360  mask1 = mask0 + 2;
1361 
1362  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1363  src += (5 * src_stride);
1364 
1365  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1366  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1367  hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1368  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369  hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1371 
1372  filt = LD_SH(filter_vert);
1373  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1374 
1375  for (loop_cnt = (height >> 2); loop_cnt--;) {
1376  LD_SB4(src, src_stride, src5, src6, src7, src8);
1377  XORI_B4_128_SB(src5, src6, src7, src8);
1378  src += (4 * src_stride);
1379 
1380  hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381  hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1384 
1385  hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388  tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1389 
1390  SRARI_H2_SH(tmp0, tmp1, 7);
1391  SAT_SH2_SH(tmp0, tmp1, 7);
1392  out = PCKEV_XORI128_UB(tmp0, tmp1);
1393  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1394  dst += (4 * dst_stride);
1395 
1396  hz_out3 = hz_out7;
1397  out0 = out2;
1398  out1 = out3;
1399  }
1400 }
1401 
1402 void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1403  const uint8_t *src, ptrdiff_t src_stride,
1404  int height, int mx, int my)
1405 {
1406  uint32_t loop_cnt;
1407  const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1408  const int8_t *filter_vert = subpel_filters_msa[my - 1];
1409  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1410  v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1414  v16u8 vec0, vec1;
1415 
1416  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1417  src -= (1 + 2 * src_stride);
1418 
1419  /* rearranging filter */
1420  filt = LD_SH(filter_horiz);
1421  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1422 
1423  mask1 = mask0 + 2;
1424 
1425  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426  src += (5 * src_stride);
1427 
1428  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1429  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433  hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1436 
1437  filt = LD_SH(filter_vert);
1438  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1439 
1440  for (loop_cnt = (height >> 2); loop_cnt--;) {
1441  LD_SB4(src, src_stride, src5, src6, src7, src8);
1442  src += (4 * src_stride);
1443 
1444  XORI_B4_128_SB(src5, src6, src7, src8);
1445 
1446  hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1449 
1450  hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451  out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1453 
1454  hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455  out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456  tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1457 
1458  hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460  tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1461 
1462  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1463  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1464  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
1465  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
1466  ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467  dst += (4 * dst_stride);
1468 
1469  hz_out4 = hz_out8;
1470  out0 = out2;
1471  out1 = out6;
1472  out3 = out5;
1473  out4 = out7;
1474  }
1475 }
1476 
/*
 * 16-pixel-wide 4-tap horizontal + 6-tap vertical prediction, done as two
 * independent 8-pixel-wide halves.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* columns 0..7 */
    ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);

    /* columns 8..15 */
    ff_put_vp8_epel8_h4v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
1491 
1492 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1493  uint8_t *dst, int32_t dst_stride,
1494  const int8_t *filter)
1495 {
1496  v16i8 src0, src1, src2, src3, mask;
1497  v16u8 filt0, vec0, vec1, res0, res1;
1498  v8u16 vec2, vec3, filt;
1499 
1500  mask = LD_SB(&mc_filt_mask_arr[16]);
1501 
1502  /* rearranging filter */
1503  filt = LD_UH(filter);
1504  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505 
1506  LD_SB4(src, src_stride, src0, src1, src2, src3);
1507  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1509  SRARI_H2_UH(vec2, vec3, 7);
1510  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511  ST_W2(res0, 0, 1, dst, dst_stride);
1512  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513 }
1514 
1515 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1516  uint8_t *dst, int32_t dst_stride,
1517  const int8_t *filter)
1518 {
1519  v16u8 vec0, vec1, vec2, vec3, filt0;
1520  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521  v16i8 res0, res1, res2, res3;
1522  v8u16 vec4, vec5, vec6, vec7, filt;
1523 
1524  mask = LD_SB(&mc_filt_mask_arr[16]);
1525 
1526  /* rearranging filter */
1527  filt = LD_UH(filter);
1528  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529 
1530  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534  vec4, vec5, vec6, vec7);
1535  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537  res0, res1, res2, res3);
1538  ST_W2(res0, 0, 1, dst, dst_stride);
1539  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542 }
1543 
1544 void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545  const uint8_t *src, ptrdiff_t src_stride,
1546  int height, int mx, int my)
1547 {
1548  const int8_t *filter = bilinear_filters_msa[mx - 1];
1549 
1550  if (4 == height) {
1551  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552  } else if (8 == height) {
1553  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554  }
1555 }
1556 
1557 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1558  uint8_t *dst, int32_t dst_stride,
1559  const int8_t *filter)
1560 {
1561  v16u8 filt0;
1562  v16i8 src0, src1, src2, src3, mask;
1563  v8u16 vec0, vec1, vec2, vec3, filt;
1564 
1565  mask = LD_SB(&mc_filt_mask_arr[0]);
1566 
1567  /* rearranging filter */
1568  filt = LD_UH(filter);
1569  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570 
1571  LD_SB4(src, src_stride, src0, src1, src2, src3);
1572  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575  vec0, vec1, vec2, vec3);
1576  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1577  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578  ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579 }
1580 
1581 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1582  uint8_t *dst, int32_t dst_stride,
1583  const int8_t *filter, int32_t height)
1584 {
1585  v16u8 filt0;
1586  v16i8 src0, src1, src2, src3, mask, out0, out1;
1587  v8u16 vec0, vec1, vec2, vec3, filt;
1588 
1589  mask = LD_SB(&mc_filt_mask_arr[0]);
1590 
1591  /* rearranging filter */
1592  filt = LD_UH(filter);
1593  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594 
1595  LD_SB4(src, src_stride, src0, src1, src2, src3);
1596  src += (4 * src_stride);
1597 
1598  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601  vec0, vec1, vec2, vec3);
1602  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603 
1604  LD_SB4(src, src_stride, src0, src1, src2, src3);
1605  src += (4 * src_stride);
1606 
1607  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609 
1610  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613  vec0, vec1, vec2, vec3);
1614  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617  dst += (8 * dst_stride);
1618 
1619  if (16 == height) {
1620  LD_SB4(src, src_stride, src0, src1, src2, src3);
1621  src += (4 * src_stride);
1622 
1623  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626  vec0, vec1, vec2, vec3);
1627  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628  LD_SB4(src, src_stride, src0, src1, src2, src3);
1629  src += (4 * src_stride);
1630 
1631  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633 
1634  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637  vec0, vec1, vec2, vec3);
1638  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641  }
1642 }
1643 
1644 void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645  const uint8_t *src, ptrdiff_t src_stride,
1646  int height, int mx, int my)
1647 {
1648  const int8_t *filter = bilinear_filters_msa[mx - 1];
1649 
1650  if (4 == height) {
1651  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652  } else {
1653  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654  height);
1655  }
1656 }
1657 
1658 void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659  const uint8_t *src, ptrdiff_t src_stride,
1660  int height, int mx, int my)
1661 {
1662  uint32_t loop_cnt;
1663  const int8_t *filter = bilinear_filters_msa[mx - 1];
1664  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667 
1668  mask = LD_SB(&mc_filt_mask_arr[0]);
1669 
1670  loop_cnt = (height >> 2) - 1;
1671 
1672  /* rearranging filter */
1673  filt = LD_UH(filter);
1674  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675 
1676  LD_SB4(src, src_stride, src0, src2, src4, src6);
1677  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678  src += (4 * src_stride);
1679 
1680  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685  out0, out1, out2, out3);
1686  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687  out4, out5, out6, out7);
1688  SRARI_H4_UH(out0, out1, out2, out3, 7);
1689  SRARI_H4_UH(out4, out5, out6, out7, 7);
1690  PCKEV_ST_SB(out0, out1, dst);
1691  dst += dst_stride;
1692  PCKEV_ST_SB(out2, out3, dst);
1693  dst += dst_stride;
1694  PCKEV_ST_SB(out4, out5, dst);
1695  dst += dst_stride;
1696  PCKEV_ST_SB(out6, out7, dst);
1697  dst += dst_stride;
1698 
1699  for (; loop_cnt--;) {
1700  LD_SB4(src, src_stride, src0, src2, src4, src6);
1701  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702  src += (4 * src_stride);
1703 
1704  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709  out0, out1, out2, out3);
1710  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711  out4, out5, out6, out7);
1712  SRARI_H4_UH(out0, out1, out2, out3, 7);
1713  SRARI_H4_UH(out4, out5, out6, out7, 7);
1714  PCKEV_ST_SB(out0, out1, dst);
1715  dst += dst_stride;
1716  PCKEV_ST_SB(out2, out3, dst);
1717  dst += dst_stride;
1718  PCKEV_ST_SB(out4, out5, dst);
1719  dst += dst_stride;
1720  PCKEV_ST_SB(out6, out7, dst);
1721  dst += dst_stride;
1722  }
1723 }
1724 
1725 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1726  uint8_t *dst, int32_t dst_stride,
1727  const int8_t *filter)
1728 {
1729  v16i8 src0, src1, src2, src3, src4;
1730  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731  v16u8 filt0;
1732  v8i16 filt;
1733  v8u16 tmp0, tmp1;
1734 
1735  filt = LD_SH(filter);
1736  filt0 = (v16u8) __msa_splati_h(filt, 0);
1737 
1738  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739  src += (5 * src_stride);
1740 
1741  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742  src10_r, src21_r, src32_r, src43_r);
1743  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745  SRARI_H2_UH(tmp0, tmp1, 7);
1746  SAT_UH2_UH(tmp0, tmp1, 7);
1747  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748  ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749 }
1750 
1751 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1752  uint8_t *dst, int32_t dst_stride,
1753  const int8_t *filter)
1754 {
1755  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758  v8u16 tmp0, tmp1, tmp2, tmp3;
1759  v16u8 filt0;
1760  v8i16 filt;
1761 
1762  filt = LD_SH(filter);
1763  filt0 = (v16u8) __msa_splati_h(filt, 0);
1764 
1765  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766  src += (8 * src_stride);
1767 
1768  src8 = LD_SB(src);
1769  src += src_stride;
1770 
1771  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772  src32_r, src43_r);
1773  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774  src76_r, src87_r);
1775  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776  src87_r, src76_r, src2110, src4332, src6554, src8776);
1777  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778  tmp0, tmp1, tmp2, tmp3);
1779  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782  ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783 }
1784 
1785 void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786  const uint8_t *src, ptrdiff_t src_stride,
1787  int height, int mx, int my)
1788 {
1789  const int8_t *filter = bilinear_filters_msa[my - 1];
1790 
1791  if (4 == height) {
1792  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793  } else if (8 == height) {
1794  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795  }
1796 }
1797 
1798 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1799  uint8_t *dst, int32_t dst_stride,
1800  const int8_t *filter)
1801 {
1802  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803  v16i8 out0, out1;
1804  v8u16 tmp0, tmp1, tmp2, tmp3;
1805  v8i16 filt;
1806 
1807  /* rearranging filter_y */
1808  filt = LD_SH(filter);
1809  filt0 = (v16u8) __msa_splati_h(filt, 0);
1810 
1811  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815  tmp0, tmp1, tmp2, tmp3);
1816  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820 }
1821 
1822 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1823  uint8_t *dst, int32_t dst_stride,
1824  const int8_t *filter, int32_t height)
1825 {
1826  uint32_t loop_cnt;
1827  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829  v16i8 out0, out1;
1830  v8u16 tmp0, tmp1, tmp2, tmp3;
1831  v8i16 filt;
1832 
1833  /* rearranging filter_y */
1834  filt = LD_SH(filter);
1835  filt0 = (v16u8) __msa_splati_h(filt, 0);
1836 
1837  src0 = LD_UB(src);
1838  src += src_stride;
1839 
1840  for (loop_cnt = (height >> 3); loop_cnt--;) {
1841  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842  src += (8 * src_stride);
1843 
1844  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845  vec0, vec1, vec2, vec3);
1846  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847  vec4, vec5, vec6, vec7);
1848  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849  tmp0, tmp1, tmp2, tmp3);
1850  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854 
1855  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856  tmp0, tmp1, tmp2, tmp3);
1857  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861  dst += (8 * dst_stride);
1862 
1863  src0 = src8;
1864  }
1865 }
1866 
1867 void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868  const uint8_t *src, ptrdiff_t src_stride,
1869  int height, int mx, int my)
1870 {
1871  const int8_t *filter = bilinear_filters_msa[my - 1];
1872 
1873  if (4 == height) {
1874  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875  } else {
1876  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877  height);
1878  }
1879 }
1880 
1881 void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882  const uint8_t *src, ptrdiff_t src_stride,
1883  int height, int mx, int my)
1884 {
1885  uint32_t loop_cnt;
1886  const int8_t *filter = bilinear_filters_msa[my - 1];
1887  v16u8 src0, src1, src2, src3, src4;
1888  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889  v8u16 tmp0, tmp1, tmp2, tmp3;
1890  v8i16 filt;
1891 
1892  /* rearranging filter_y */
1893  filt = LD_SH(filter);
1894  filt0 = (v16u8) __msa_splati_h(filt, 0);
1895 
1896  src0 = LD_UB(src);
1897  src += src_stride;
1898 
1899  for (loop_cnt = (height >> 2); loop_cnt--;) {
1900  LD_UB4(src, src_stride, src1, src2, src3, src4);
1901  src += (4 * src_stride);
1902 
1903  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906  SRARI_H2_UH(tmp0, tmp1, 7);
1907  SAT_UH2_UH(tmp0, tmp1, 7);
1908  PCKEV_ST_SB(tmp0, tmp1, dst);
1909  dst += dst_stride;
1910 
1911  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914  SRARI_H2_UH(tmp2, tmp3, 7);
1915  SAT_UH2_UH(tmp2, tmp3, 7);
1916  PCKEV_ST_SB(tmp2, tmp3, dst);
1917  dst += dst_stride;
1918 
1919  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920  SRARI_H2_UH(tmp0, tmp1, 7);
1921  SAT_UH2_UH(tmp0, tmp1, 7);
1922  PCKEV_ST_SB(tmp0, tmp1, dst);
1923  dst += dst_stride;
1924 
1925  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926  SRARI_H2_UH(tmp2, tmp3, 7);
1927  SAT_UH2_UH(tmp2, tmp3, 7);
1928  PCKEV_ST_SB(tmp2, tmp3, dst);
1929  dst += dst_stride;
1930 
1931  src0 = src4;
1932  }
1933 }
1934 
1935 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
1936  uint8_t *dst, int32_t dst_stride,
1937  const int8_t *filter_horiz,
1938  const int8_t *filter_vert)
1939 {
1940  v16i8 src0, src1, src2, src3, src4, mask;
1941  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943 
1944  mask = LD_SB(&mc_filt_mask_arr[16]);
1945 
1946  /* rearranging filter */
1947  filt = LD_UH(filter_horiz);
1948  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949 
1950  filt = LD_UH(filter_vert);
1951  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952 
1953  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959 
1960  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962  SRARI_H2_UH(tmp0, tmp1, 7);
1963  SAT_UH2_UH(tmp0, tmp1, 7);
1964  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965  ST_W2(res0, 0, 1, dst, dst_stride);
1966  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967 }
1968 
1969 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
1970  uint8_t *dst, int32_t dst_stride,
1971  const int8_t *filter_horiz,
1972  const int8_t *filter_vert)
1973 {
1974  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975  v16i8 res0, res1, res2, res3;
1976  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979 
1980  mask = LD_SB(&mc_filt_mask_arr[16]);
1981 
1982  /* rearranging filter */
1983  filt = LD_UH(filter_horiz);
1984  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985 
1986  filt = LD_UH(filter_vert);
1987  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988 
1989  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990  src += (8 * src_stride);
1991  src8 = LD_SB(src);
1992 
1993  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998  SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
1999  hz_out3, hz_out5);
2000  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001 
2002  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005  vec4, vec5, vec6, vec7);
2006  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009  res0, res1, res2, res3);
2010  ST_W2(res0, 0, 1, dst, dst_stride);
2011  ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012  ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013  ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014 }
2015 
2016 void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017  const uint8_t *src, ptrdiff_t src_stride,
2018  int height, int mx, int my)
2019 {
2020  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022 
2023  if (4 == height) {
2024  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025  filter_horiz, filter_vert);
2026  } else if (8 == height) {
2027  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028  filter_horiz, filter_vert);
2029  }
2030 }
2031 
2032 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2033  uint8_t *dst, int32_t dst_stride,
2034  const int8_t *filter_horiz,
2035  const int8_t *filter_vert)
2036 {
2037  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040  v8i16 filt;
2041 
2042  mask = LD_SB(&mc_filt_mask_arr[0]);
2043 
2044  /* rearranging filter */
2045  filt = LD_SH(filter_horiz);
2046  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047 
2048  filt = LD_SH(filter_vert);
2049  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050 
2051  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052 
2053  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057 
2058  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061 
2062  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065 
2066  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069 
2070  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074 }
2075 
2076 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2077  uint8_t *dst, int32_t dst_stride,
2078  const int8_t *filter_horiz,
2079  const int8_t *filter_vert,
2080  int32_t height)
2081 {
2082  uint32_t loop_cnt;
2083  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084  v16u8 filt_hz, filt_vt, vec0;
2085  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086  v8i16 filt;
2087 
2088  mask = LD_SB(&mc_filt_mask_arr[0]);
2089 
2090  /* rearranging filter */
2091  filt = LD_SH(filter_horiz);
2092  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093 
2094  filt = LD_SH(filter_vert);
2095  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096 
2097  src0 = LD_SB(src);
2098  src += src_stride;
2099 
2100  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101 
2102  for (loop_cnt = (height >> 3); loop_cnt--;) {
2103  LD_SB4(src, src_stride, src1, src2, src3, src4);
2104  src += (4 * src_stride);
2105 
2106  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109 
2110  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113 
2114  SRARI_H2_UH(tmp1, tmp2, 7);
2115  SAT_UH2_UH(tmp1, tmp2, 7);
2116 
2117  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120 
2121  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122  LD_SB4(src, src_stride, src1, src2, src3, src4);
2123  src += (4 * src_stride);
2124  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126 
2127  SRARI_H2_UH(tmp3, tmp4, 7);
2128  SAT_UH2_UH(tmp3, tmp4, 7);
2129  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131 
2132  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135 
2136  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139 
2140  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143 
2144  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147 
2148  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151  ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152  dst += (8 * dst_stride);
2153  }
2154 }
2155 
2156 void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157  const uint8_t *src, ptrdiff_t src_stride,
2158  int height, int mx, int my)
2159 {
2160  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162 
2163  if (4 == height) {
2164  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165  filter_horiz, filter_vert);
2166  } else {
2167  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168  filter_horiz, filter_vert, height);
2169  }
2170 }
2171 
2172 void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173  const uint8_t *src, ptrdiff_t src_stride,
2174  int height, int mx, int my)
2175 {
2176  uint32_t loop_cnt;
2177  const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178  const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180  v16u8 filt_hz, filt_vt, vec0, vec1;
2181  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182  v8i16 filt;
2183 
2184  mask = LD_SB(&mc_filt_mask_arr[0]);
2185 
2186  /* rearranging filter */
2187  filt = LD_SH(filter_horiz);
2188  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189 
2190  filt = LD_SH(filter_vert);
2191  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192 
2193  LD_SB2(src, 8, src0, src1);
2194  src += src_stride;
2195 
2196  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198 
2199 
2200  for (loop_cnt = (height >> 2); loop_cnt--;) {
2201  LD_SB4(src, src_stride, src0, src2, src4, src6);
2202  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203  src += (4 * src_stride);
2204 
2205  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209  SRARI_H2_UH(tmp1, tmp2, 7);
2210  SAT_UH2_UH(tmp1, tmp2, 7);
2211  PCKEV_ST_SB(tmp1, tmp2, dst);
2212  dst += dst_stride;
2213 
2214  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218  SRARI_H2_UH(tmp1, tmp2, 7);
2219  SAT_UH2_UH(tmp1, tmp2, 7);
2220  PCKEV_ST_SB(tmp1, tmp2, dst);
2221  dst += dst_stride;
2222 
2223  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227  SRARI_H2_UH(tmp1, tmp2, 7);
2228  SAT_UH2_UH(tmp1, tmp2, 7);
2229  PCKEV_ST_SB(tmp1, tmp2, dst);
2230  dst += dst_stride;
2231 
2232  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236  SRARI_H2_UH(tmp1, tmp2, 7);
2237  SAT_UH2_UH(tmp1, tmp2, 7);
2238  PCKEV_ST_SB(tmp1, tmp2, dst);
2239  dst += dst_stride;
2240  }
2241 }
2242 
2243 void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244  const uint8_t *src, ptrdiff_t src_stride,
2245  int height, int mx, int my)
2246 {
2247  int32_t cnt;
2248  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250 
2251  if (0 == height % 8) {
2252  for (cnt = height >> 3; cnt--;) {
2253  LD_UB8(src, src_stride,
2254  src0, src1, src2, src3, src4, src5, src6, src7);
2255  src += (8 * src_stride);
2256 
2257  out0 = __msa_copy_u_d((v2i64) src0, 0);
2258  out1 = __msa_copy_u_d((v2i64) src1, 0);
2259  out2 = __msa_copy_u_d((v2i64) src2, 0);
2260  out3 = __msa_copy_u_d((v2i64) src3, 0);
2261  out4 = __msa_copy_u_d((v2i64) src4, 0);
2262  out5 = __msa_copy_u_d((v2i64) src5, 0);
2263  out6 = __msa_copy_u_d((v2i64) src6, 0);
2264  out7 = __msa_copy_u_d((v2i64) src7, 0);
2265 
2266  SD4(out0, out1, out2, out3, dst, dst_stride);
2267  dst += (4 * dst_stride);
2268  SD4(out4, out5, out6, out7, dst, dst_stride);
2269  dst += (4 * dst_stride);
2270  }
2271  } else if (0 == height % 4) {
2272  for (cnt = (height / 4); cnt--;) {
2273  LD_UB4(src, src_stride, src0, src1, src2, src3);
2274  src += (4 * src_stride);
2275  out0 = __msa_copy_u_d((v2i64) src0, 0);
2276  out1 = __msa_copy_u_d((v2i64) src1, 0);
2277  out2 = __msa_copy_u_d((v2i64) src2, 0);
2278  out3 = __msa_copy_u_d((v2i64) src3, 0);
2279 
2280  SD4(out0, out1, out2, out3, dst, dst_stride);
2281  dst += (4 * dst_stride);
2282  }
2283  }
2284 }
2285 
2286 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
2287  uint8_t *dst, int32_t dst_stride,
2289 {
2290  int32_t cnt, loop_cnt;
2291  uint8_t *dst_tmp;
2292  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293 
2294  for (cnt = (width >> 4); cnt--;) {
2295  const uint8_t *src_tmp = src;
2296  dst_tmp = dst;
2297 
2298  for (loop_cnt = (height >> 3); loop_cnt--;) {
2299  LD_UB8(src_tmp, src_stride,
2300  src0, src1, src2, src3, src4, src5, src6, src7);
2301  src_tmp += (8 * src_stride);
2302 
2303  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304  dst_tmp, dst_stride);
2305  dst_tmp += (8 * dst_stride);
2306  }
2307 
2308  src += 16;
2309  dst += 16;
2310  }
2311 }
2312 
2313 void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314  const uint8_t *src, ptrdiff_t src_stride,
2315  int height, int mx, int my)
2316 {
2317  int32_t cnt;
2318  v16u8 src0, src1, src2, src3;
2319 
2320  if (0 == height % 8) {
2321  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322  } else if (0 == height % 4) {
2323  for (cnt = (height >> 2); cnt--;) {
2324  LD_UB4(src, src_stride, src0, src1, src2, src3);
2325  src += (4 * src_stride);
2326 
2327  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328  dst += (4 * dst_stride);
2329  }
2330  }
2331 }
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
ff_put_vp8_epel8_v6_msa
void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:379
PCKEV_ST_SB
#define PCKEV_ST_SB(in0, in1, pdst)
Definition: generic_macros_msa.h:2799
HORIZ_6TAP_4WID_4VECS_FILT
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1)
Definition: vp8_mc_msa.c:71
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
ff_put_vp8_bilinear16_v_msa
void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1881
common_hv_2ht_2vt_8x8mult_msa
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp8_mc_msa.c:2076
ff_put_vp8_bilinear4_v_msa
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1785
ff_put_vp8_epel4_h6v6_msa
void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:503
ff_put_vp8_pixels16_msa
void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2313
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
ff_put_vp8_bilinear8_hv_msa
void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2156
out
FILE * out
Definition: movenc.c:55
PCKEV_B4_SB
#define PCKEV_B4_SB(...)
Definition: generic_macros_msa.h:1738
ff_put_vp8_epel16_h4v4_msa
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1169
src1
const pixel * src1
Definition: h264pred_template.c:421
HORIZ_2TAP_FILT_UH
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
Definition: generic_macros_msa.h:2809
SAT_SH4_SH
#define SAT_SH4_SH(...)
Definition: generic_macros_msa.h:1615
ff_put_vp8_epel8_h4v6_msa
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1402
ff_put_vp8_bilinear4_hv_msa
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2016
common_hv_2ht_2vt_8x4_msa
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:2032
VSHF_B2_UB
#define VSHF_B2_UB(...)
Definition: generic_macros_msa.h:661
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
DOTP_UB2_UH
#define DOTP_UB2_UH(...)
Definition: generic_macros_msa.h:740
SAT_SH2_SH
#define SAT_SH2_SH(...)
Definition: generic_macros_msa.h:1601
ff_put_vp8_epel16_v4_msa
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:984
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
subpel_filters_msa
static const int8_t subpel_filters_msa[7][8]
Definition: vp8_mc_msa.c:34
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
PCKEV_B4_UB
#define PCKEV_B4_UB(...)
Definition: generic_macros_msa.h:1739
LD_UB5
#define LD_UB5(...)
Definition: generic_macros_msa.h:307
mc_filt_mask_arr
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp8_mc_msa.c:25
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
ff_put_vp8_epel16_h6v4_msa
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1323
ff_put_vp8_bilinear16_hv_msa
void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2172
SRARI_H4_UH
#define SRARI_H4_UH(...)
Definition: generic_macros_msa.h:2066
ff_put_vp8_epel4_h4v4_msa
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1043
common_hz_2t_8x4_msa
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1557
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
ff_put_vp8_bilinear8_v_msa
void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1867
ff_put_vp8_epel8_h6v4_msa
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1250
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
ff_put_vp8_epel4_h4v6_msa
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1338
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
ff_put_vp8_bilinear16_h_msa
void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1658
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
HORIZ_6TAP_FILT
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, filt_h2)
Definition: vp8_mc_msa.c:54
ILVEV_B2_SH
#define ILVEV_B2_SH(...)
Definition: generic_macros_msa.h:1190
ILVEV_B2_UB
#define ILVEV_B2_UB(...)
Definition: generic_macros_msa.h:1188
vp8dsp.h
mask
static const uint16_t mask[17]
Definition: lzw.c:38
ILVR_B4_UB
#define ILVR_B4_UB(...)
Definition: generic_macros_msa.h:1359
width
#define width
ILVL_B2_UB
#define ILVL_B2_UB(...)
Definition: generic_macros_msa.h:1262
FILT_4TAP_DPADD_S_H
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
Definition: vp8_mc_msa.c:107
SAT_UH2_UH
#define SAT_UH2_UH(...)
Definition: generic_macros_msa.h:1567
common_hv_2ht_2vt_4x8_msa
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1969
HORIZ_6TAP_8WID_4VECS_FILT
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1, out2, out3)
Definition: vp8_mc_msa.c:86
DOTP_UB4_UH
#define DOTP_UB4_UH(...)
Definition: generic_macros_msa.h:749
VSHF_B2_UH
#define VSHF_B2_UH(...)
Definition: generic_macros_msa.h:663
common_vt_2t_8x8mult_msa
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1822
ff_put_vp8_epel8_h6v6_msa
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:578
ff_put_vp8_epel8_h6_msa
void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:235
common_hz_4t_4x4_msa
static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:680
common_hz_6t_4x8_msa
static void common_hz_6t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:187
XORI_B2_128_UB
#define XORI_B2_128_UB(...)
Definition: generic_macros_msa.h:1834
LD_UH
#define LD_UH(...)
Definition: generic_macros_msa.h:34
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
HORIZ_4TAP_FILT
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
Definition: vp8_mc_msa.c:117
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
ff_put_vp8_epel4_v6_msa
void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:332
common_vt_2t_4x8_msa
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1751
HORIZ_4TAP_8WID_4VECS_FILT
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
Definition: vp8_mc_msa.c:143
ff_put_vp8_epel8_h4_msa
void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:807
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
ff_put_vp8_epel16_h6v6_msa
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:665
ILVR_D4_SB
#define ILVR_D4_SB(...)
Definition: generic_macros_msa.h:1460
ff_put_vp8_epel4_h4_msa
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:792
PCKEV_XORI128_UB
#define PCKEV_XORI128_UB(in0, in1)
Definition: generic_macros_msa.h:2751
common_hz_6t_4x4_msa
static void common_hz_6t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:159
HORIZ_4TAP_4WID_4VECS_FILT
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
Definition: vp8_mc_msa.c:131
bilinear_filters_msa
static const int8_t bilinear_filters_msa[7][2]
Definition: vp8_mc_msa.c:44
SPLATI_H3_SH
#define SPLATI_H3_SH(...)
Definition: generic_macros_msa.h:1665
ff_put_vp8_epel16_v6_msa
void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:429
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
ff_put_vp8_pixels8_msa
void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:2243
height
#define height
ff_put_vp8_epel4_h6_msa
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:222
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
common_hz_2t_4x8_msa
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1515
SPLATI_H3_SB
#define SPLATI_H3_SB(...)
Definition: generic_macros_msa.h:1664
SRARI_H2_UH
#define SRARI_H2_UH(...)
Definition: generic_macros_msa.h:2058
common_hz_2t_8x8mult_msa
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp8_mc_msa.c:1581
ff_put_vp8_bilinear8_h_msa
void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1644
XORI_B4_128_UB
#define XORI_B4_128_UB(...)
Definition: generic_macros_msa.h:1850
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_put_vp8_epel16_h4_msa
void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:842
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
filt
static const int8_t filt[NUMTAPS *2]
Definition: af_earwax.c:39
common_hz_2t_4x4_msa
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1492
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
ILVR_B2_UB
#define ILVR_B2_UB(...)
Definition: generic_macros_msa.h:1337
vp8dsp_mips.h
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
copy_16multx8mult_msa
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
Definition: vp8_mc_msa.c:2286
ff_put_vp8_epel16_h4v6_msa
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1477
DPADD_SH3_SH
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
Definition: vp8_mc_lsx.c:43
common_vt_2t_8x4_msa
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1798
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
ff_put_vp8_epel8_h4v4_msa
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1102
ff_put_vp8_bilinear4_h_msa
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1544
src0
const pixel *const src0
Definition: h264pred_template.c:420
common_hv_2ht_2vt_4x4_msa
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp8_mc_msa.c:1935
ff_put_vp8_epel4_h6v4_msa
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:1184
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
int32_t
int32_t
Definition: audioconvert.c:56
SLDI_B3_UH
#define SLDI_B3_UH(...)
Definition: generic_macros_msa.h:635
common_hz_4t_4x8_msa
static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:707
ff_put_vp8_epel4_v4_msa
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:891
SPLATI_H2_SB
#define SPLATI_H2_SB(...)
Definition: generic_macros_msa.h:1655
common_hz_4t_4x16_msa
static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:742
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
ff_put_vp8_epel8_v4_msa
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:938
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
common_vt_2t_4x4_msa
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp8_mc_msa.c:1725
ff_put_vp8_epel16_h6_msa
void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_msa.c:283
SAT_UH4_UH
#define SAT_UH4_UH(...)
Definition: generic_macros_msa.h:1575