FFmpeg
qpeldsp_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "qpeldsp_mips.h"
23 
/*
 * Horizontal quarter-pel filter for one 16-byte row, rounding variant.
 *
 * inp0 and inp1 are two 16-byte views of the same row (the callers in this
 * file load inp1 one byte after inp0).  mask is the byte-shuffle control
 * used to build the extra bytes that get shifted in at either end of the
 * row.  Byte pairs are formed at four distances and combined per lane as
 *
 *     coef0 * (pair 0) + coef2 * (pair 2) - (coef1 * (pair 1) + (pair 3))
 *
 * The difference is shifted right by 5 with rounding (SRARI_H2_SH), passed
 * through CLIP_SH2_0_255 and packed to bytes; that v16u8 is the value of the
 * expression.
 *
 * coef0 is used in a plain halfword multiply, so it must be a halfword
 * vector; coef1 and coef2 are handed to the byte dot-product helpers.
 * Every argument is expanded several times: pass plain variables.
 */
#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
({ \
    v16u8 qf_out, qf_edge0, qf_edge1; \
    v16u8 qf_prev1, qf_prev2, qf_prev3; \
    v16u8 qf_next1, qf_next2, qf_next3; \
    v8u16 qf_pos_r, qf_pos_l, qf_neg_r, qf_neg_l; \
    v8u16 qf_tap1_r, qf_tap1_l, qf_tap2_r, qf_tap2_l; \
    v8i16 qf_diff_r, qf_diff_l; \
 \
    /* shuffled copies of both inputs supply the bytes shifted in below */ \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, qf_edge0, qf_edge1); \
    qf_prev1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 15); \
    qf_prev2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 14); \
    qf_prev3 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 13); \
    qf_next1 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 1); \
    qf_next2 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 2); \
    qf_next3 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 3); \
 \
    /* added part: coef0-weighted centre pair plus coef2-weighted pair 2 */ \
    ILVRL_B2_UH(inp1, inp0, qf_pos_r, qf_pos_l); \
    HADD_UB2_UH(qf_pos_r, qf_pos_l, qf_pos_r, qf_pos_l); \
    qf_pos_r *= (v8u16) (coef0); \
    qf_pos_l *= (v8u16) (coef0); \
    ILVRL_B2_UH(qf_next2, qf_prev2, qf_tap2_r, qf_tap2_l); \
    DPADD_UB2_UH(qf_tap2_r, qf_tap2_l, coef2, coef2, qf_pos_r, qf_pos_l); \
 \
    /* subtracted part: plain pair 3 plus coef1-weighted pair 1 */ \
    ILVRL_B2_UH(qf_next3, qf_prev3, qf_neg_r, qf_neg_l); \
    HADD_UB2_UH(qf_neg_r, qf_neg_l, qf_neg_r, qf_neg_l); \
    ILVRL_B2_UH(qf_next1, qf_prev1, qf_tap1_r, qf_tap1_l); \
    DPADD_UB2_UH(qf_tap1_r, qf_tap1_l, coef1, coef1, qf_neg_r, qf_neg_l); \
 \
    qf_diff_r = (v8i16) (qf_pos_r - qf_neg_r); \
    qf_diff_l = (v8i16) (qf_pos_l - qf_neg_l); \
    SRARI_H2_SH(qf_diff_r, qf_diff_l, 5); \
    CLIP_SH2_0_255(qf_diff_r, qf_diff_l); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff_l, (v16i8) qf_diff_r); \
 \
    qf_out; \
})
57 
/*
 * Horizontal quarter-pel filter for two 8-pixel rows at once, rounding
 * variant.
 *
 * inp0 and inp1 each hold one source row.  mask0..mask3 are byte-shuffle
 * controls; each one picks, for every output pixel, the byte pair used by
 * one tap.  Per lane the macro computes
 *
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (pair(mask3) + coef1 * pair(mask1))
 *
 * shifts it right by 5 with rounding (SRARI_H2_SH), applies CLIP_SH2_0_255
 * and packs both rows into one v16u8: the inp0 row in the low 8 bytes, the
 * inp1 row in the high 8 bytes.  coef0..coef2 are byte vectors.
 * Arguments are expanded several times: pass plain variables.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
                                      mask0, mask1, mask2, mask3, \
                                      coef0, coef1, coef2) \
({ \
    v16u8 qf_out; \
    v8u16 qf_pos_a, qf_pos_b, qf_neg_a, qf_neg_b; \
    v8u16 qf_tap1_a, qf_tap1_b, qf_tap2_a, qf_tap2_b; \
    v8i16 qf_diff_a, qf_diff_b; \
 \
    /* added part, rows a (inp0) and b (inp1) */ \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, qf_pos_a, qf_pos_b); \
    DOTP_UB2_UH(qf_pos_a, qf_pos_b, coef0, coef0, qf_pos_a, qf_pos_b); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, qf_tap2_a, qf_tap2_b); \
    DPADD_UB2_UH(qf_tap2_a, qf_tap2_b, coef2, coef2, qf_pos_a, qf_pos_b); \
 \
    /* subtracted part */ \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, qf_neg_a, qf_neg_b); \
    HADD_UB2_UH(qf_neg_a, qf_neg_b, qf_neg_a, qf_neg_b); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, qf_tap1_a, qf_tap1_b); \
    DPADD_UB2_UH(qf_tap1_a, qf_tap1_b, coef1, coef1, qf_neg_a, qf_neg_b); \
 \
    qf_diff_a = (v8i16) (qf_pos_a - qf_neg_a); \
    qf_diff_b = (v8i16) (qf_pos_b - qf_neg_b); \
    SRARI_H2_SH(qf_diff_a, qf_diff_b, 5); \
    CLIP_SH2_0_255(qf_diff_a, qf_diff_b); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff_b, (v16i8) qf_diff_a); \
 \
    qf_out; \
})
83 
/*
 * Single-row form of the 8-pixel horizontal quarter-pel filter, rounding
 * variant.
 *
 * Only inp0 is filtered.  mask0..mask3 select the byte pair for each tap and
 * the per-lane value is
 *
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (pair(mask3) + coef1 * pair(mask1))
 *
 * shifted right by 5 with rounding (__msa_srari_h) and passed through
 * CLIP_SH_0_255.  The eight results are packed into both halves of the
 * returned v16u8.  Arguments are expanded several times: pass plain
 * variables.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
                                           mask0, mask1, mask2, mask3, \
                                           coef0, coef1, coef2) \
({ \
    v16u8 qf_out; \
    v8u16 qf_pos, qf_neg, qf_tap1, qf_tap2; \
    v8i16 qf_diff; \
 \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, qf_pos, qf_neg); \
    qf_pos = __msa_dotp_u_h((v16u8) qf_pos, (v16u8) coef0); \
    qf_neg = __msa_hadd_u_h((v16u8) qf_neg, (v16u8) qf_neg); \
 \
    /* one helper call accumulates tap 2 into qf_pos and tap 1 into qf_neg */ \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, qf_tap2, qf_tap1); \
    DPADD_UB2_UH(qf_tap2, qf_tap1, coef2, coef1, qf_pos, qf_neg); \
 \
    qf_diff = (v8i16) (qf_pos - qf_neg); \
    qf_diff = __msa_srari_h(qf_diff, 5); \
    CLIP_SH_0_255(qf_diff); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff, (v16i8) qf_diff); \
 \
    qf_out; \
})
104 
/*
 * Single-row form of the 8-pixel horizontal quarter-pel filter, "no round"
 * variant.
 *
 * The tap arithmetic matches the rounding single-row macro:
 *
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (pair(mask3) + coef1 * pair(mask1))
 *
 * but the final scaling adds 15 and then shifts right by 5 instead of using
 * the rounding shift.  The value is passed through CLIP_SH_0_255 and the
 * eight results are packed into both halves of the returned v16u8.
 * Arguments are expanded several times: pass plain variables.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
                                                    mask2, mask3, coef0, \
                                                    coef1, coef2) \
({ \
    v16u8 qf_out; \
    v8u16 qf_pos, qf_neg, qf_tap1, qf_tap2; \
    v8i16 qf_diff; \
 \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, qf_pos, qf_neg); \
    qf_pos = __msa_dotp_u_h((v16u8) qf_pos, (v16u8) coef0); \
    qf_neg = __msa_hadd_u_h((v16u8) qf_neg, (v16u8) qf_neg); \
 \
    /* one helper call accumulates tap 2 into qf_pos and tap 1 into qf_neg */ \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, qf_tap2, qf_tap1); \
    DPADD_UB2_UH(qf_tap2, qf_tap1, coef2, coef1, qf_pos, qf_neg); \
 \
    qf_diff = (v8i16) (qf_pos - qf_neg); \
    qf_diff = (qf_diff + 15) >> 5; \
    CLIP_SH_0_255(qf_diff); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff, (v16i8) qf_diff); \
 \
    qf_out; \
})
126 
/*
 * Horizontal quarter-pel filter for one 16-byte row, "no round" variant.
 *
 * inp0 and inp1 are two 16-byte views of the same row (the callers in this
 * file load inp1 one byte after inp0).  mask is the byte-shuffle control
 * used to build the extra bytes that get shifted in at either end of the
 * row.  Per lane the macro computes
 *
 *     coef0 * (pair 0) + coef2 * (pair 2) - (coef1 * (pair 1) + (pair 3))
 *
 * then adds 15, shifts right by 5, applies CLIP_SH2_0_255 and packs the
 * result to a v16u8, which is the value of the expression.
 *
 * coef0 is used in a plain halfword multiply, so it must be a halfword
 * vector; coef1 and coef2 are handed to the byte dot-product helpers.
 * Every argument is expanded several times: pass plain variables.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
                                         coef0, coef1, coef2) \
({ \
    v16u8 qf_out, qf_edge0, qf_edge1; \
    v16u8 qf_prev1, qf_prev2, qf_prev3; \
    v16u8 qf_next1, qf_next2, qf_next3; \
    v8u16 qf_pos_r, qf_pos_l, qf_neg_r, qf_neg_l; \
    v8u16 qf_tap1_r, qf_tap1_l, qf_tap2_r, qf_tap2_l; \
    v8i16 qf_diff_r, qf_diff_l; \
 \
    /* shuffled copies of both inputs supply the bytes shifted in below */ \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, qf_edge0, qf_edge1); \
    qf_prev1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 15); \
    qf_prev2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 14); \
    qf_prev3 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) qf_edge0, 13); \
    qf_next1 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 1); \
    qf_next2 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 2); \
    qf_next3 = (v16u8) __msa_sldi_b((v16i8) qf_edge1, (v16i8) inp1, 3); \
 \
    /* added part: coef0-weighted centre pair plus coef2-weighted pair 2 */ \
    ILVRL_B2_UH(inp1, inp0, qf_pos_r, qf_pos_l); \
    HADD_UB2_UH(qf_pos_r, qf_pos_l, qf_pos_r, qf_pos_l); \
    qf_pos_r *= (v8u16) (coef0); \
    qf_pos_l *= (v8u16) (coef0); \
    ILVRL_B2_UH(qf_next2, qf_prev2, qf_tap2_r, qf_tap2_l); \
    DPADD_UB2_UH(qf_tap2_r, qf_tap2_l, coef2, coef2, qf_pos_r, qf_pos_l); \
 \
    /* subtracted part: plain pair 3 plus coef1-weighted pair 1 */ \
    ILVRL_B2_UH(qf_next3, qf_prev3, qf_neg_r, qf_neg_l); \
    HADD_UB2_UH(qf_neg_r, qf_neg_l, qf_neg_r, qf_neg_l); \
    ILVRL_B2_UH(qf_next1, qf_prev1, qf_tap1_r, qf_tap1_l); \
    DPADD_UB2_UH(qf_tap1_r, qf_tap1_l, coef1, coef1, qf_neg_r, qf_neg_l); \
 \
    /* bias of 15 followed by a plain shift replaces the rounding shift */ \
    qf_diff_r = (v8i16) (qf_pos_r - qf_neg_r); \
    qf_diff_l = (v8i16) (qf_pos_l - qf_neg_l); \
    qf_diff_r = (qf_diff_r + 15) >> 5; \
    qf_diff_l = (qf_diff_l + 15) >> 5; \
    CLIP_SH2_0_255(qf_diff_r, qf_diff_l); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff_l, (v16i8) qf_diff_r); \
 \
    qf_out; \
})
164 
/*
 * Horizontal quarter-pel filter for two 8-pixel rows at once, "no round"
 * variant.
 *
 * inp0 and inp1 each hold one source row; mask0..mask3 select the byte pair
 * used by each tap.  Per lane the macro computes
 *
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (pair(mask3) + coef1 * pair(mask1))
 *
 * then adds 15, shifts right by 5, applies CLIP_SH2_0_255 and packs both
 * rows into one v16u8: the inp0 row in the low 8 bytes, the inp1 row in the
 * high 8 bytes.  Arguments are expanded several times: pass plain variables.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
                                               mask0, mask1, mask2, mask3, \
                                               coef0, coef1, coef2) \
({ \
    v16u8 qf_out; \
    v8u16 qf_pos_a, qf_pos_b, qf_neg_a, qf_neg_b; \
    v8u16 qf_tap1_a, qf_tap1_b, qf_tap2_a, qf_tap2_b; \
    v8i16 qf_diff_a, qf_diff_b; \
 \
    /* added part, rows a (inp0) and b (inp1) */ \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, qf_pos_a, qf_pos_b); \
    DOTP_UB2_UH(qf_pos_a, qf_pos_b, coef0, coef0, qf_pos_a, qf_pos_b); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, qf_tap2_a, qf_tap2_b); \
    DPADD_UB2_UH(qf_tap2_a, qf_tap2_b, coef2, coef2, qf_pos_a, qf_pos_b); \
 \
    /* subtracted part */ \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, qf_neg_a, qf_neg_b); \
    HADD_UB2_UH(qf_neg_a, qf_neg_b, qf_neg_a, qf_neg_b); \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, qf_tap1_a, qf_tap1_b); \
    DPADD_UB2_UH(qf_tap1_a, qf_tap1_b, coef1, coef1, qf_neg_a, qf_neg_b); \
 \
    /* bias of 15 followed by a plain shift replaces the rounding shift */ \
    qf_diff_a = (v8i16) (qf_pos_a - qf_neg_a); \
    qf_diff_b = (v8i16) (qf_pos_b - qf_neg_b); \
    qf_diff_a = (qf_diff_a + 15) >> 5; \
    qf_diff_b = (qf_diff_b + 15) >> 5; \
    CLIP_SH2_0_255(qf_diff_a, qf_diff_b); \
    qf_out = (v16u8) __msa_pckev_b((v16i8) qf_diff_b, (v16i8) qf_diff_a); \
 \
    qf_out; \
})
193 
/*
 * Vertical quarter-pel filter across eight 16-byte vectors, rounding
 * variant.
 *
 * The inputs are combined in pairs (inp0, inp4), (inp1, inp5), (inp2, inp6)
 * and (inp3, inp7).  Per byte lane the macro computes
 *
 *     coef0 * (inp0 + inp4) + coef2 * (inp2 + inp6)
 *         - ((inp3 + inp7) + coef1 * (inp1 + inp5))
 *
 * shifts it right by 5 with rounding (SRARI_H2_SH), applies CLIP_SH2_0_255
 * and packs the 16 results into the returned v16u8.  coef0..coef2 are byte
 * vectors.  Arguments are expanded more than once: pass plain variables.
 */
#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
                               inp4, inp5, inp6, inp7, \
                               coef0, coef1, coef2) \
({ \
    v16u8 qv_out; \
    v8u16 qv_pos_r, qv_pos_l, qv_neg_r, qv_neg_l; \
    v8u16 qv_tap1_r, qv_tap1_l, qv_tap2_r, qv_tap2_l; \
    v8i16 qv_diff_r, qv_diff_l; \
 \
    /* added part */ \
    ILVRL_B2_UH(inp4, inp0, qv_pos_r, qv_pos_l); \
    DOTP_UB2_UH(qv_pos_r, qv_pos_l, coef0, coef0, qv_pos_r, qv_pos_l); \
    ILVRL_B2_UH(inp6, inp2, qv_tap2_r, qv_tap2_l); \
    DPADD_UB2_UH(qv_tap2_r, qv_tap2_l, coef2, coef2, qv_pos_r, qv_pos_l); \
 \
    /* subtracted part */ \
    ILVRL_B2_UH(inp7, inp3, qv_neg_r, qv_neg_l); \
    HADD_UB2_UH(qv_neg_r, qv_neg_l, qv_neg_r, qv_neg_l); \
    ILVRL_B2_UH(inp5, inp1, qv_tap1_r, qv_tap1_l); \
    DPADD_UB2_UH(qv_tap1_r, qv_tap1_l, coef1, coef1, qv_neg_r, qv_neg_l); \
 \
    qv_diff_r = (v8i16) (qv_pos_r - qv_neg_r); \
    qv_diff_l = (v8i16) (qv_pos_l - qv_neg_l); \
    SRARI_H2_SH(qv_diff_r, qv_diff_l, 5); \
    CLIP_SH2_0_255(qv_diff_r, qv_diff_l); \
    qv_out = (v16u8) __msa_pckev_b((v16i8) qv_diff_l, (v16i8) qv_diff_r); \
 \
    qv_out; \
})
219 
/*
 * Vertical quarter-pel filter producing two 8-pixel output rows at once,
 * rounding variant.
 *
 * inp00..inp07 are the eight inputs for the first output row and
 * inp10..inp17 those for the second.  Only the right (low) halves of the
 * inputs are interleaved.  For each row the pairs (x0, x4), (x1, x5),
 * (x2, x6) and (x3, x7) are combined as
 *
 *     coef0 * (x0 + x4) + coef2 * (x2 + x6)
 *         - ((x3 + x7) + coef1 * (x1 + x5))
 *
 * shifted right by 5 with rounding (SRARI_H2_SH) and passed through
 * CLIP_SH2_0_255.  The first row lands in the low 8 bytes and the second in
 * the high 8 bytes of the returned v16u8.  Arguments are expanded more than
 * once: pass plain variables.
 */
#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
                                     inp04, inp05, inp06, inp07, \
                                     inp10, inp11, inp12, inp13, \
                                     inp14, inp15, inp16, inp17, \
                                     coef0, coef1, coef2) \
({ \
    v16u8 qv_out; \
    v8u16 qv_pos_a, qv_pos_b, qv_neg_a, qv_neg_b; \
    v8u16 qv_tap1_a, qv_tap1_b, qv_tap2_a, qv_tap2_b; \
    v8i16 qv_diff_a, qv_diff_b; \
 \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
               qv_pos_a, qv_pos_b, qv_neg_a, qv_neg_b); \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
               qv_tap2_a, qv_tap2_b, qv_tap1_a, qv_tap1_b); \
 \
    /* added part */ \
    DOTP_UB2_UH(qv_pos_a, qv_pos_b, coef0, coef0, qv_pos_a, qv_pos_b); \
    DPADD_UB2_UH(qv_tap2_a, qv_tap2_b, coef2, coef2, qv_pos_a, qv_pos_b); \
 \
    /* subtracted part */ \
    HADD_UB2_UH(qv_neg_a, qv_neg_b, qv_neg_a, qv_neg_b); \
    DPADD_UB2_UH(qv_tap1_a, qv_tap1_b, coef1, coef1, qv_neg_a, qv_neg_b); \
 \
    qv_diff_a = (v8i16) (qv_pos_a - qv_neg_a); \
    qv_diff_b = (v8i16) (qv_pos_b - qv_neg_b); \
    SRARI_H2_SH(qv_diff_a, qv_diff_b, 5); \
    CLIP_SH2_0_255(qv_diff_a, qv_diff_b); \
    qv_out = (v16u8) __msa_pckev_b((v16i8) qv_diff_b, (v16i8) qv_diff_a); \
 \
    qv_out; \
})
247 
/*
 * Vertical quarter-pel filter across eight 16-byte vectors, "no round"
 * variant.
 *
 * The inputs are combined in pairs (inp0, inp4), (inp1, inp5), (inp2, inp6)
 * and (inp3, inp7).  Per byte lane the macro computes
 *
 *     coef0 * (inp0 + inp4) + coef2 * (inp2 + inp6)
 *         - ((inp3 + inp7) + coef1 * (inp1 + inp5))
 *
 * then adds 15, shifts right by 5, applies CLIP_SH2_0_255 and packs the 16
 * results into the returned v16u8.  coef0..coef2 are byte vectors.
 * Arguments are expanded more than once: pass plain variables.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
                                        inp4, inp5, inp6, inp7, \
                                        coef0, coef1, coef2) \
({ \
    v16u8 qv_out; \
    v8u16 qv_pos_r, qv_pos_l, qv_neg_r, qv_neg_l; \
    v8u16 qv_tap1_r, qv_tap1_l, qv_tap2_r, qv_tap2_l; \
    v8i16 qv_diff_r, qv_diff_l; \
 \
    /* added part */ \
    ILVRL_B2_UH(inp4, inp0, qv_pos_r, qv_pos_l); \
    DOTP_UB2_UH(qv_pos_r, qv_pos_l, coef0, coef0, qv_pos_r, qv_pos_l); \
    ILVRL_B2_UH(inp6, inp2, qv_tap2_r, qv_tap2_l); \
    DPADD_UB2_UH(qv_tap2_r, qv_tap2_l, coef2, coef2, qv_pos_r, qv_pos_l); \
 \
    /* subtracted part */ \
    ILVRL_B2_UH(inp7, inp3, qv_neg_r, qv_neg_l); \
    HADD_UB2_UH(qv_neg_r, qv_neg_l, qv_neg_r, qv_neg_l); \
    ILVRL_B2_UH(inp5, inp1, qv_tap1_r, qv_tap1_l); \
    DPADD_UB2_UH(qv_tap1_r, qv_tap1_l, coef1, coef1, qv_neg_r, qv_neg_l); \
 \
    /* bias of 15 followed by a plain shift replaces the rounding shift */ \
    qv_diff_r = (v8i16) (qv_pos_r - qv_neg_r); \
    qv_diff_l = (v8i16) (qv_pos_l - qv_neg_l); \
    qv_diff_r = (qv_diff_r + 15) >> 5; \
    qv_diff_l = (qv_diff_l + 15) >> 5; \
    CLIP_SH2_0_255(qv_diff_r, qv_diff_l); \
    qv_out = (v16u8) __msa_pckev_b((v16i8) qv_diff_l, (v16i8) qv_diff_r); \
 \
    qv_out; \
})
276 
/*
 * Vertical quarter-pel filter producing two 8-pixel output rows at once,
 * "no round" variant.
 *
 * inp00..inp07 are the eight inputs for the first output row and
 * inp10..inp17 those for the second.  Only the right (low) halves of the
 * inputs are interleaved.  For each row the pairs (x0, x4), (x1, x5),
 * (x2, x6) and (x3, x7) are combined as
 *
 *     coef0 * (x0 + x4) + coef2 * (x2 + x6)
 *         - ((x3 + x7) + coef1 * (x1 + x5))
 *
 * then 15 is added, the sum is shifted right by 5 and passed through
 * CLIP_SH2_0_255.  The first row lands in the low 8 bytes and the second in
 * the high 8 bytes of the returned v16u8.  Arguments are expanded more than
 * once: pass plain variables.
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
                                              inp04, inp05, inp06, inp07, \
                                              inp10, inp11, inp12, inp13, \
                                              inp14, inp15, inp16, inp17, \
                                              coef0, coef1, coef2) \
({ \
    v16u8 qv_out; \
    v8u16 qv_pos_a, qv_pos_b, qv_neg_a, qv_neg_b; \
    v8u16 qv_tap1_a, qv_tap1_b, qv_tap2_a, qv_tap2_b; \
    v8i16 qv_diff_a, qv_diff_b; \
 \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
               qv_pos_a, qv_pos_b, qv_neg_a, qv_neg_b); \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
               qv_tap2_a, qv_tap2_b, qv_tap1_a, qv_tap1_b); \
 \
    /* added part */ \
    DOTP_UB2_UH(qv_pos_a, qv_pos_b, coef0, coef0, qv_pos_a, qv_pos_b); \
    DPADD_UB2_UH(qv_tap2_a, qv_tap2_b, coef2, coef2, qv_pos_a, qv_pos_b); \
 \
    /* subtracted part */ \
    HADD_UB2_UH(qv_neg_a, qv_neg_b, qv_neg_a, qv_neg_b); \
    DPADD_UB2_UH(qv_tap1_a, qv_tap1_b, coef1, coef1, qv_neg_a, qv_neg_b); \
 \
    /* bias of 15 followed by a plain shift replaces the rounding shift */ \
    qv_diff_a = (v8i16) (qv_pos_a - qv_neg_a); \
    qv_diff_b = (v8i16) (qv_pos_b - qv_neg_b); \
    qv_diff_a = (qv_diff_a + 15) >> 5; \
    qv_diff_b = (qv_diff_b + 15) >> 5; \
    CLIP_SH2_0_255(qv_diff_a, qv_diff_b); \
    qv_out = (v16u8) __msa_pckev_b((v16i8) qv_diff_b, (v16i8) qv_diff_a); \
 \
    qv_out; \
})
307 
309  int32_t src_stride,
310  uint8_t *dst,
311  int32_t dst_stride,
312  int32_t height)
313 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass, four rows per iteration:
     * the rows are filtered two at a time by APPLY_HORIZ_QPEL_FILTER_8BYTE
     * with the coefficients 20, 6 and 3, the result is averaged with the
     * unfiltered source rows and written to dst through ST_D4.
     *
     * mask0..mask3 hold, for each of the eight output pixels, the two byte
     * indices (all within 0..8) paired for one filter tap.
     *
     * Only height >> 2 iterations run, so rows left over when height is not
     * a multiple of four are not processed; loop_count is 8 bits wide.
     */
314  uint8_t loop_count;
315  v16u8 inp0, inp1, inp2, inp3;
316  v16u8 res0, res1;
317  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321  v16u8 const20 = (v16u8) __msa_ldi_b(20);
322  v16u8 const6 = (v16u8) __msa_ldi_b(6);
323  v16u8 const3 = (v16u8) __msa_ldi_b(3);
324 
325  for (loop_count = (height >> 2); loop_count--;) {
326  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327  src += (4 * src_stride);
328  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
329  mask0, mask1, mask2, mask3,
330  const20, const6, const3);
331  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
332  mask0, mask1, mask2, mask3,
333  const20, const6, const3);
    /* Copy the low doubleword of the odd row into the upper half of the
     * even row so the source matches the two-rows-per-vector layout of
     * res0/res1. */
334  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    /* AVER_UB2_UB is defined elsewhere; presumably a rounding byte average
     * like the __msa_aver_u_b used by the 16-wide variants -- confirm. */
336  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
337  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
338  dst += (4 * dst_stride);
339  }
340 }
341 
343  int32_t src_stride,
344  uint8_t *dst,
345  int32_t dst_stride,
346  int32_t height)
347 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass, four rows per iteration.
     * Every row is loaded twice, at src and at src + 1; the pair goes
     * through APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6, 3; `mask` lists
     * the byte indices 15 down to 0).  The filtered row is then combined
     * with the row loaded at src using __msa_aver_u_b (rounding average)
     * and stored with ST_UB.
     *
     * const20 is a halfword vector because the macro multiplies with it
     * directly; const6 and const3 are byte vectors.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
348  uint8_t loop_count;
349  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
350  v16u8 res;
351  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352  v16u8 const6 = (v16u8) __msa_ldi_b(6);
353  v16u8 const3 = (v16u8) __msa_ldi_b(3);
354  v8u16 const20 = (v8u16) __msa_ldi_h(20);
355 
356  for (loop_count = (height >> 2); loop_count--;) {
357  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359  src += (4 * src_stride);
360  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
361  const20, const6, const3);
362  res = __msa_aver_u_b(inp0, res);
363  ST_UB(res, dst);
364  dst += dst_stride;
365 
366  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
367  const20, const6, const3);
368  res = __msa_aver_u_b(inp2, res);
369  ST_UB(res, dst);
370  dst += dst_stride;
371 
372  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
373  const20, const6, const3);
374  res = __msa_aver_u_b(inp4, res);
375  ST_UB(res, dst);
376  dst += dst_stride;
377 
378  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
379  const20, const6, const3);
380  res = __msa_aver_u_b(inp6, res);
381  ST_UB(res, dst);
382  dst += dst_stride;
383  }
384 }
385 
387  int32_t src_stride,
388  uint8_t *dst,
389  int32_t dst_stride,
390  int32_t height)
391 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass, four rows per iteration:
     * the rows are filtered two at a time by APPLY_HORIZ_QPEL_FILTER_8BYTE
     * with the coefficients 20, 6 and 3 and the filtered rows are written
     * to dst through ST_D4 without any further averaging.
     *
     * mask0..mask3 hold, for each of the eight output pixels, the two byte
     * indices (all within 0..8) paired for one filter tap.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
392  uint8_t loop_count;
393  v16u8 inp0, inp1, inp2, inp3;
394  v16u8 res0, res1;
395  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399  v16u8 const20 = (v16u8) __msa_ldi_b(20);
400  v16u8 const6 = (v16u8) __msa_ldi_b(6);
401  v16u8 const3 = (v16u8) __msa_ldi_b(3);
402 
403  for (loop_count = (height >> 2); loop_count--;) {
404  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405  src += (4 * src_stride);
406  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
407  mask0, mask1, mask2, mask3,
408  const20, const6, const3);
409  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
410  mask0, mask1, mask2, mask3,
411  const20, const6, const3);
412  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
413  dst += (4 * dst_stride);
414  }
415 }
416 
418  int32_t src_stride,
419  uint8_t *dst,
420  int32_t dst_stride,
421  int32_t height)
422 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass, four rows per iteration.
     * Every row is loaded twice, at src and at src + 1; the pair goes
     * through APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6, 3; `mask` lists
     * the byte indices 15 down to 0) and the filtered row is stored with
     * ST_UB without any further averaging.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
423  uint8_t loop_count;
424  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
425  v16u8 res;
426  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427  v8u16 const20 = (v8u16) __msa_ldi_h(20);
428  v16u8 const6 = (v16u8) __msa_ldi_b(6);
429  v16u8 const3 = (v16u8) __msa_ldi_b(3);
430 
431  for (loop_count = (height >> 2); loop_count--;) {
432  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434  src += (4 * src_stride);
435  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
436  const20, const6, const3);
437  ST_UB(res, dst);
438  dst += dst_stride;
439 
440  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
441  const20, const6, const3);
442  ST_UB(res, dst);
443  dst += dst_stride;
444 
445  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
446  const20, const6, const3);
447  ST_UB(res, dst);
448  dst += dst_stride;
449 
450  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
451  const20, const6, const3);
452  ST_UB(res, dst);
453  dst += dst_stride;
454  }
455 }
456 
458  int32_t src_stride,
459  uint8_t *dst,
460  int32_t dst_stride,
461  int32_t height)
462 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass, four rows per iteration:
     * the rows are filtered two at a time by APPLY_HORIZ_QPEL_FILTER_8BYTE
     * with the coefficients 20, 6 and 3.  The source rows are then passed
     * through SLDI_B4_UB with a shift of 1 before being averaged with the
     * filtered rows, and the result is written to dst through ST_D4.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
463  uint8_t loop_count;
464  v16u8 inp0, inp1, inp2, inp3;
465  v16u8 res0, res1;
466  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470  v16u8 const20 = (v16u8) __msa_ldi_b(20);
471  v16u8 const6 = (v16u8) __msa_ldi_b(6);
472  v16u8 const3 = (v16u8) __msa_ldi_b(3);
473 
474  for (loop_count = (height >> 2); loop_count--;) {
475  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476  src += (4 * src_stride);
477  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
478  mask0, mask1, mask2, mask3,
479  const20, const6, const3);
480  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
481  mask0, mask1, mask2, mask3,
482  const20, const6, const3);
    /* SLDI_B4_UB is defined elsewhere; presumably it slides each row along
     * by one byte so the average is taken against the next pixel rather
     * than the co-located one -- confirm against the macro definition. */
483  SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
484  inp0, inp1, inp2, inp3);
    /* Copy the low doubleword of the odd row into the upper half of the
     * even row to match the two-rows-per-vector layout of res0/res1. */
485  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
487  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
488  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
489  dst += (4 * dst_stride);
490  }
491 }
492 
494  int32_t src_stride,
495  uint8_t *dst,
496  int32_t dst_stride,
497  int32_t height)
498 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass, four rows per iteration.
     * Every row is loaded twice, at src and at src + 1; the pair goes
     * through APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6, 3).  The
     * filtered row is then combined with the row loaded at src + 1 using
     * __msa_aver_u_b (rounding average) and stored with ST_UB.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
499  uint8_t loop_count;
500  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
501  v16u8 res;
502  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503  v8u16 const20 = (v8u16) __msa_ldi_h(20);
504  v16u8 const6 = (v16u8) __msa_ldi_b(6);
505  v16u8 const3 = (v16u8) __msa_ldi_b(3);
506 
507  for (loop_count = (height >> 2); loop_count--;) {
508  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510  src += (4 * src_stride);
511  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
512  const20, const6, const3);
513  res = __msa_aver_u_b(res, inp1);
514  ST_UB(res, dst);
515  dst += dst_stride;
516 
517  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
518  const20, const6, const3);
519  res = __msa_aver_u_b(res, inp3);
520  ST_UB(res, dst);
521  dst += dst_stride;
522 
523  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
524  const20, const6, const3);
525  res = __msa_aver_u_b(res, inp5);
526  ST_UB(res, dst);
527  dst += dst_stride;
528 
529  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
530  const20, const6, const3);
531  res = __msa_aver_u_b(res, inp7);
532  ST_UB(res, dst);
533  dst += dst_stride;
534  }
535 }
536 
538  int32_t src_stride,
539  uint8_t *dst,
540  int32_t dst_stride,
541  int32_t height)
542 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  The rows are filtered two at a time
     * by APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE (coefficients 20, 6, 3),
     * then combined with the unfiltered source rows using __msa_ave_u_b
     * (average without the rounding increment) and written through ST_D4.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
543  uint8_t loop_count;
544  v16u8 inp0, inp1, inp2, inp3;
545  v16u8 res0, res1;
546  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550  v16u8 const20 = (v16u8) __msa_ldi_b(20);
551  v16u8 const6 = (v16u8) __msa_ldi_b(6);
552  v16u8 const3 = (v16u8) __msa_ldi_b(3);
553 
554  for (loop_count = (height >> 2); loop_count--;) {
555  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556  src += (4 * src_stride);
557  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
558  mask2, mask3, const20,
559  const6, const3);
560  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
561  mask2, mask3, const20,
562  const6, const3);
    /* Copy the low doubleword of the odd row into the upper half of the
     * even row to match the two-rows-per-vector layout of res0/res1. */
563  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565  res0 = __msa_ave_u_b(inp0, res0);
566  res1 = __msa_ave_u_b(inp2, res1);
567  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
568  dst += (4 * dst_stride);
569  }
570 }
571 
573  int32_t src_stride,
574  uint8_t *dst,
575  int32_t dst_stride,
576  int32_t height)
577 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  Every row is loaded at src and at
     * src + 1, filtered by APPLY_HORIZ_QPEL_NO_ROUND_FILTER (coefficients
     * 20, 6, 3), combined with the row loaded at src using __msa_ave_u_b
     * (average without the rounding increment) and stored with ST_UB.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
578  uint8_t loop_count;
579  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
580  v16u8 res;
581  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582  v8u16 const20 = (v8u16) __msa_ldi_h(20);
583  v16u8 const6 = (v16u8) __msa_ldi_b(6);
584  v16u8 const3 = (v16u8) __msa_ldi_b(3);
585 
586  for (loop_count = (height >> 2); loop_count--;) {
587  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589  src += (4 * src_stride);
590  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
591  const20, const6, const3);
592  res = __msa_ave_u_b(inp0, res);
593  ST_UB(res, dst);
594  dst += dst_stride;
595 
596  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
597  const20, const6, const3);
598  res = __msa_ave_u_b(inp2, res);
599  ST_UB(res, dst);
600  dst += dst_stride;
601 
602  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
603  const20, const6, const3);
604  res = __msa_ave_u_b(inp4, res);
605  ST_UB(res, dst);
606  dst += dst_stride;
607 
608  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
609  const20, const6, const3);
610  res = __msa_ave_u_b(inp6, res);
611  ST_UB(res, dst);
612  dst += dst_stride;
613  }
614 }
615 
617  int32_t src_stride,
618  uint8_t *dst,
619  int32_t dst_stride,
620  int32_t height)
621 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  The rows are filtered two at a time
     * by APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE (coefficients 20, 6, 3) and
     * written to dst through ST_D4 without any further averaging.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
622  uint8_t loop_count;
623  v16u8 inp0, inp1, inp2, inp3;
624  v16u8 res0, res1;
625  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629  v16u8 const20 = (v16u8) __msa_ldi_b(20);
630  v16u8 const6 = (v16u8) __msa_ldi_b(6);
631  v16u8 const3 = (v16u8) __msa_ldi_b(3);
632 
633  for (loop_count = (height >> 2); loop_count--;) {
634  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635  src += (4 * src_stride);
636  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
637  mask2, mask3, const20,
638  const6, const3);
639  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
640  mask2, mask3, const20,
641  const6, const3);
642  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
643  dst += (4 * dst_stride);
644  }
645 }
646 
648  int32_t src_stride,
649  uint8_t *dst,
650  int32_t dst_stride,
651  int32_t height)
652 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  Every row is loaded at src and at
     * src + 1, filtered by APPLY_HORIZ_QPEL_NO_ROUND_FILTER (coefficients
     * 20, 6, 3) and stored with ST_UB without any further averaging.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
653  uint8_t loop_count;
654  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
655  v16u8 res;
656  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657  v16u8 const6 = (v16u8) __msa_ldi_b(6);
658  v16u8 const3 = (v16u8) __msa_ldi_b(3);
659  v8u16 const20 = (v8u16) __msa_ldi_h(20);
660 
661  for (loop_count = (height >> 2); loop_count--;) {
662  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664  src += (4 * src_stride);
665  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
666  const20, const6, const3);
667  ST_UB(res, dst);
668  dst += dst_stride;
669 
670  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
671  const20, const6, const3);
672  ST_UB(res, dst);
673  dst += dst_stride;
674 
675  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
676  const20, const6, const3);
677  ST_UB(res, dst);
678  dst += dst_stride;
679 
680  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
681  const20, const6, const3);
682  ST_UB(res, dst);
683  dst += dst_stride;
684  }
685 }
686 
688  int32_t src_stride,
689  uint8_t *dst,
690  int32_t dst_stride,
691  int32_t height)
692 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  The rows are filtered two at a time
     * by APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE (coefficients 20, 6, 3).
     * The source rows are passed through SLDI_B4_UB with a shift of 1 and
     * then combined with the filtered rows using __msa_ave_u_b (average
     * without the rounding increment) before ST_D4 writes them to dst.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
693  uint8_t loop_count;
694  v16u8 inp0, inp1, inp2, inp3;
695  v16u8 res0, res1;
696  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700  v16u8 const20 = (v16u8) __msa_ldi_b(20);
701  v16u8 const6 = (v16u8) __msa_ldi_b(6);
702  v16u8 const3 = (v16u8) __msa_ldi_b(3);
703 
704  for (loop_count = (height >> 2); loop_count--;) {
705  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706  src += (4 * src_stride);
707  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
708  mask2, mask3, const20,
709  const6, const3);
710  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
711  mask2, mask3, const20,
712  const6, const3);
    /* SLDI_B4_UB is defined elsewhere; presumably it slides each row along
     * by one byte so the average is taken against the next pixel rather
     * than the co-located one -- confirm against the macro definition. */
713  SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
714  inp0, inp1, inp2, inp3);
715  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717  res0 = __msa_ave_u_b(inp0, res0);
718  res1 = __msa_ave_u_b(inp2, res1);
719  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
720  dst += (4 * dst_stride);
721  }
722 }
723 
725  int32_t src_stride,
726  uint8_t *dst,
727  int32_t dst_stride,
728  int32_t height)
729 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass using the "no round" filter
     * macro, four rows per iteration.  Every row is loaded at src and at
     * src + 1, filtered by APPLY_HORIZ_QPEL_NO_ROUND_FILTER (coefficients
     * 20, 6, 3), combined with the row loaded at src + 1 using
     * __msa_ave_u_b (average without the rounding increment) and stored
     * with ST_UB.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
730  uint8_t loop_count;
731  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
732  v16u8 res;
733  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734  v16u8 const6 = (v16u8) __msa_ldi_b(6);
735  v16u8 const3 = (v16u8) __msa_ldi_b(3);
736  v8u16 const20 = (v8u16) __msa_ldi_h(20);
737 
738  for (loop_count = (height >> 2); loop_count--;) {
739  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741  src += (4 * src_stride);
742  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
743  const20, const6, const3);
744  res = __msa_ave_u_b(res, inp1);
745  ST_UB(res, dst);
746  dst += dst_stride;
747 
748  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
749  const20, const6, const3);
750  res = __msa_ave_u_b(res, inp3);
751  ST_UB(res, dst);
752  dst += dst_stride;
753 
754  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
755  const20, const6, const3);
756  res = __msa_ave_u_b(res, inp5);
757  ST_UB(res, dst);
758  dst += dst_stride;
759 
760  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
761  const20, const6, const3);
762  res = __msa_ave_u_b(res, inp7);
763  ST_UB(res, dst);
764  dst += dst_stride;
765  }
766 }
767 
769  int32_t src_stride,
770  uint8_t *dst,
771  int32_t dst_stride,
772  int32_t height)
773 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass that blends into dst, four
     * rows per iteration.  The rows are filtered two at a time by
     * APPLY_HORIZ_QPEL_FILTER_8BYTE (coefficients 20, 6, 3), averaged with
     * the unfiltered source rows, averaged again with the four rows already
     * present in dst, and written back through ST_D4.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
774  uint8_t loop_count;
775  v16u8 inp0, inp1, inp2, inp3;
776  v16u8 dst0, dst1, dst2, dst3;
777  v16u8 res0, res1;
778  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782  v16u8 const20 = (v16u8) __msa_ldi_b(20);
783  v16u8 const6 = (v16u8) __msa_ldi_b(6);
784  v16u8 const3 = (v16u8) __msa_ldi_b(3);
785 
786  for (loop_count = (height >> 2); loop_count--;) {
787  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788  src += (4 * src_stride);
789  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
790  mask0, mask1, mask2, mask3,
791  const20, const6, const3);
792  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
793  mask0, mask1, mask2, mask3,
794  const20, const6, const3);
795  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack source and destination rows two per vector (low doubleword of
     * the odd row into the upper half of the even row) to match res0/res1. */
796  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
800  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
801  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
802  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
803  dst += (4 * dst_stride);
804  }
805 }
806 
808  int32_t src_stride,
809  uint8_t *dst,
810  int32_t dst_stride,
811  int32_t height)
812 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass that blends into dst.  Four
     * source rows are loaded per iteration (each at src and at src + 1) and
     * handled as two groups of two rows: filter with
     * APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6, 3), average with the row
     * loaded at src, average again with the two rows already present in
     * dst, and store with ST_UB2.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
813  uint8_t loop_count;
814  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
815  v16u8 res0, res1;
816  v16u8 dst0, dst1;
817  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818  v16u8 const6 = (v16u8) __msa_ldi_b(6);
819  v16u8 const3 = (v16u8) __msa_ldi_b(3);
820  v8u16 const20 = (v8u16) __msa_ldi_h(20);
821 
822  for (loop_count = (height >> 2); loop_count--;) {
823  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825  src += (4 * src_stride);
826  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
827  const20, const6, const3);
828  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
829  const20, const6, const3);
830  LD_UB2(dst, dst_stride, dst0, dst1);
831  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
832  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
833  ST_UB2(res0, res1, dst, dst_stride);
834  dst += (2 * dst_stride);
835 
836  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
837  const20, const6, const3);
838  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
839  const20, const6, const3);
840  LD_UB2(dst, dst_stride, dst0, dst1);
841  AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
842  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
843  ST_UB2(res0, res1, dst, dst_stride);
844  dst += (2 * dst_stride);
845  }
846 }
847 
849  int32_t src_stride,
850  uint8_t *dst,
851  int32_t dst_stride,
852  int32_t height)
853 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 8-pixel-wide horizontal quarter-pel pass that blends into dst, four
     * rows per iteration.  The rows are filtered two at a time by
     * APPLY_HORIZ_QPEL_FILTER_8BYTE (coefficients 20, 6, 3), averaged with
     * the four rows already present in dst and written back through ST_D4.
     * The unfiltered source is not mixed in here.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
854  uint8_t loop_count;
855  v16u8 inp0, inp1, inp2, inp3;
856  v16u8 dst0, dst1, dst2, dst3;
857  v16u8 res0, res1;
858  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862  v16u8 const20 = (v16u8) __msa_ldi_b(20);
863  v16u8 const6 = (v16u8) __msa_ldi_b(6);
864  v16u8 const3 = (v16u8) __msa_ldi_b(3);
865 
866  for (loop_count = (height >> 2); loop_count--;) {
867  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868  src += (4 * src_stride);
869  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
870  mask0, mask1, mask2, mask3,
871  const20, const6, const3);
872  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
873  mask0, mask1, mask2, mask3,
874  const20, const6, const3);
875  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack the destination rows two per vector to match res0/res1. */
876  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
878  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
879  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
880  dst += (4 * dst_stride);
881  }
882 }
883 
885  int32_t src_stride,
886  uint8_t *dst,
887  int32_t dst_stride,
888  int32_t height)
889 {
    /*
     * NOTE(review): the line that carries this function's name and its
     * first parameter (`src`, used below) is missing from this listing.
     *
     * 16-pixel-wide horizontal quarter-pel pass that blends into dst.  Four
     * source rows are loaded per iteration (each at src and at src + 1) and
     * handled as two groups of two rows: filter with
     * APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6, 3), average with the two
     * rows already present in dst and store with ST_UB2.  The unfiltered
     * source is not mixed in here.
     *
     * Only height >> 2 iterations run; loop_count is 8 bits wide.
     */
890  uint8_t loop_count;
891  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
892  v16u8 res0, res1;
893  v16u8 dst0, dst1;
894  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895  v16u8 const6 = (v16u8) __msa_ldi_b(6);
896  v16u8 const3 = (v16u8) __msa_ldi_b(3);
897  v8u16 const20 = (v8u16) __msa_ldi_h(20);
898 
899  for (loop_count = (height >> 2); loop_count--;) {
900  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902  src += (4 * src_stride);
903  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
904  const20, const6, const3);
905  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
906  const20, const6, const3);
907  LD_UB2(dst, dst_stride, dst0, dst1);
908  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
909  ST_UB2(res0, res1, dst, dst_stride);
910  dst += (2 * dst_stride);
911 
912  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
913  const20, const6, const3);
914  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
915  const20, const6, const3);
916  LD_UB2(dst, dst_stride, dst0, dst1);
917  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
918  ST_UB2(res0, res1, dst, dst_stride);
919  dst += (2 * dst_stride);
920  }
921 }
922 
924  int32_t src_stride,
925  uint8_t *dst,
926  int32_t dst_stride,
927  int32_t height)
928 {
     /*
      * Horizontal filter on 8-byte-wide rows with two blending stages.
      *
      * Each iteration loads four source rows and filters them pairwise
      * with APPLY_HORIZ_QPEL_FILTER_8BYTE (coefficients 20, 6 and 3).  The
      * filter output is first averaged with the source rows after they
      * have been passed through SLDI_B4_UB with a shift of 1, and the
      * result is then averaged with the rows already in dst.
      *
      * The loop runs (height >> 2) times, so a remainder of height
      * modulo 4 is not processed; the count is held in an 8-bit variable.
      *
      * NOTE(review): src is used below but its declaration (the first
      * line of the signature) is not part of this listing.
      */
929  uint8_t loop_count;
930  v16u8 inp0, inp1, inp2, inp3;
931  v16u8 dst0, dst1, dst2, dst3;
932  v16u8 res0, res1;
933  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937  v16u8 const20 = (v16u8) __msa_ldi_b(20);
938  v16u8 const6 = (v16u8) __msa_ldi_b(6);
939  v16u8 const3 = (v16u8) __msa_ldi_b(3);
940 
941  for (loop_count = (height >> 2); loop_count--;) {
942  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943  src += (4 * src_stride);
944  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
945  mask0, mask1, mask2, mask3,
946  const20, const6, const3);
947  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
948  mask0, mask1, mask2, mask3,
949  const20, const6, const3);
950  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
     /*
      * Overwrites inp0..inp3 in place.  NOTE(review): presumably slides
      * each row by one byte so it starts at the pixel at src + 1 -
      * confirm against the SLDI_B4_UB definition.
      */
951  SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
952  inp0, inp1, inp2, inp3);
     /*
      * Copy doubleword 0 of the second operand into doubleword 1 of the
      * first, packing two 8-byte rows per vector for both the shifted
      * source rows and the dst rows.
      */
953  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
     /* Stage 1: filter output with the shifted source rows. */
957  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
     /* Stage 2: result of stage 1 with the existing dst rows. */
958  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
959  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
960  dst += (4 * dst_stride);
961  }
962 }
963 
965  int32_t src_stride,
966  uint8_t *dst,
967  int32_t dst_stride,
968  int32_t height)
969 {
     /*
      * Horizontal filter on 16-byte-wide rows with two blending stages.
      *
      * Each iteration handles four rows.  For every row one vector is
      * loaded at src and one at src + 1; both go through
      * APPLY_HORIZ_QPEL_FILTER (coefficients 20, 6 and 3).  The filter
      * output is averaged with the src + 1 vector of the same row, and
      * that result is averaged with the row currently in dst.
      *
      * The loop runs (height >> 2) times, so a remainder of height
      * modulo 4 is not processed; the count is held in an 8-bit variable.
      *
      * NOTE(review): src is used below but its declaration (the first
      * line of the signature) is not part of this listing.
      */
970  uint8_t loop_count;
971  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972  v16u8 res0, res1, dst0, dst1;
973  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974  v16u8 const6 = (v16u8) __msa_ldi_b(6);
975  v16u8 const3 = (v16u8) __msa_ldi_b(3);
     /* Halfword lanes: the filter macro multiplies 16-bit sums by this. */
976  v8u16 const20 = (v8u16) __msa_ldi_h(20);
977 
978  for (loop_count = (height >> 2); loop_count--;) {
     /* Even-numbered inputs start at src, odd-numbered ones at src + 1. */
979  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981  src += (4 * src_stride);
982  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
983  const20, const6, const3);
984  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
985  const20, const6, const3);
986  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Stage 1: filter output with the src + 1 vectors of rows 0 and 1. */
987  AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
     /* Stage 2: result of stage 1 with the existing dst rows. */
988  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
989  ST_UB2(res0, res1, dst, dst_stride);
990  dst += (2 * dst_stride);
     /* Same two stages for rows 2 and 3. */
991  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
992  const20, const6, const3);
993  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
994  const20, const6, const3);
995  LD_UB2(dst, dst_stride, dst0, dst1);
996  AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
997  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
998  ST_UB2(res0, res1, dst, dst_stride);
999  dst += (2 * dst_stride);
1000  }
1001 }
1002 
1003 
1005  int32_t src_stride,
1006  uint8_t *dst,
1007  int32_t dst_stride)
1008 {
      /*
       * Vertical filter for an 8-row block of 8-byte rows, blended with
       * the source rows of the same index.
       *
       * Nine source rows (inp0..inp8) are loaded.  Four calls to
       * APPLY_VERT_QPEL_FILTER_8BYTE (coefficients 20, 6 and 3) produce
       * the output, which is averaged with source rows inp0..inp7 packed
       * two per vector, and stored as two groups of four rows.
       *
       * The argument lists near the first and last rows repeat inp0/inp1
       * and inp8/inp7 rather than referencing rows outside inp0..inp8.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1009  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010  v16u8 tmp0, tmp1, res0, res1;
1011  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1014 
1015  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016  src += (4 * src_stride);
1017  LD_UB2(src, src_stride, inp4, inp5);
1018  src += (2 * src_stride);
1019  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1020  inp1, inp2, inp3, inp4,
1021  inp1, inp0, inp0, inp1,
1022  inp2, inp3, inp4, inp5,
1023  const20, const6, const3);
1024  LD_UB2(src, src_stride, inp6, inp7);
1025  src += (2 * src_stride);
1026  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1027  inp3, inp4, inp5, inp6,
1028  inp3, inp2, inp1, inp0,
1029  inp4, inp5, inp6, inp7,
1030  const20, const6, const3);
      /* Pack source rows 0/1 and 2/3 (low doublewords) for the blend. */
1031  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1033  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1034  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1035 
      /* Ninth source row, loaded only after the first four rows are stored. */
1036  inp8 = LD_UB(src);
1037  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1038  inp5, inp6, inp7, inp8,
1039  inp5, inp4, inp3, inp2,
1040  inp6, inp7, inp8, inp8,
1041  const20, const6, const3);
1042  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1043  inp7, inp8, inp8, inp7,
1044  inp7, inp6, inp5, inp4,
1045  inp8, inp8, inp7, inp6,
1046  const20, const6, const3);
      /* Pack source rows 4/5 and 6/7 (low doublewords) for the blend. */
1047  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1048  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1049  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1050  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1051 }
1052 
1054  int32_t src_stride,
1055  uint8_t *dst,
1056  int32_t dst_stride)
1057 {
      /*
       * Vertical filter for 16 rows of 16 bytes, blended with the source
       * row of the same index.
       *
       * Seventeen source rows (inp0..inp16) are loaded as the function
       * progresses.  Output row k is APPLY_VERT_QPEL_FILTER (coefficients
       * 20, 6 and 3) applied to eight neighbouring rows, then averaged
       * with inp[k] using __msa_aver_u_b (the rounding byte average of
       * the MSA instruction set) and stored with ST_UB.
       *
       * The argument lists for the first and last rows repeat
       * inp0/inp1 and inp16/inp15/inp14 rather than referencing rows
       * outside inp0..inp16.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1058  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1059  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1060  v16u8 res0;
1061  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1062  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1063  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1064 
      /* Output row 0. */
1065  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1066  src += (5 * src_stride);
1067  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1068  inp1, inp2, inp3, inp4,
1069  const20, const6, const3);
1070  res0 = __msa_aver_u_b(res0, inp0);
1071  ST_UB(res0, dst);
1072  dst += dst_stride;
1073 
      /* Output row 1. */
1074  inp5 = LD_UB(src);
1075  src += src_stride;
1076  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1077  inp2, inp3, inp4, inp5,
1078  const20, const6, const3);
1079  res0 = __msa_aver_u_b(res0, inp1);
1080  ST_UB(res0, dst);
1081  dst += dst_stride;
1082 
      /* Output row 2. */
1083  inp6 = LD_UB(src);
1084  src += src_stride;
1085  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1086  inp3, inp4, inp5, inp6,
1087  const20, const6, const3);
1088  res0 = __msa_aver_u_b(res0, inp2);
1089  ST_UB(res0, dst);
1090  dst += dst_stride;
1091 
      /* Output row 3: first row whose arguments are eight distinct rows. */
1092  inp7 = LD_UB(src);
1093  src += src_stride;
1094  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1095  inp4, inp5, inp6, inp7,
1096  const20, const6, const3);
1097  res0 = __msa_aver_u_b(res0, inp3);
1098  ST_UB(res0, dst);
1099  dst += dst_stride;
1100 
      /* From here on source rows are fetched two at a time. */
1101  LD_UB2(src, src_stride, inp8, inp9);
1102  src += (2 * src_stride);
1103  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1104  inp5, inp6, inp7, inp8,
1105  const20, const6, const3);
1106  res0 = __msa_aver_u_b(res0, inp4);
1107  ST_UB(res0, dst);
1108  dst += dst_stride;
1109 
1110  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1111  inp6, inp7, inp8, inp9,
1112  const20, const6, const3);
1113  res0 = __msa_aver_u_b(res0, inp5);
1114  ST_UB(res0, dst);
1115  dst += dst_stride;
1116 
1117  LD_UB2(src, src_stride, inp10, inp11);
1118  src += (2 * src_stride);
1119  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1120  inp7, inp8, inp9, inp10,
1121  const20, const6, const3);
1122  res0 = __msa_aver_u_b(res0, inp6);
1123  ST_UB(res0, dst);
1124  dst += dst_stride;
1125 
1126  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1127  inp8, inp9, inp10, inp11,
1128  const20, const6, const3);
1129  res0 = __msa_aver_u_b(res0, inp7);
1130  ST_UB(res0, dst);
1131  dst += dst_stride;
1132 
1133  LD_UB2(src, src_stride, inp12, inp13);
1134  src += (2 * src_stride);
1135  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1136  inp9, inp10, inp11, inp12,
1137  const20, const6, const3);
1138  res0 = __msa_aver_u_b(res0, inp8);
1139  ST_UB(res0, dst);
1140  dst += dst_stride;
1141 
1142  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1143  inp10, inp11, inp12, inp13,
1144  const20, const6, const3);
1145  res0 = __msa_aver_u_b(res0, inp9);
1146  ST_UB(res0, dst);
1147  dst += dst_stride;
1148 
1149  LD_UB2(src, src_stride, inp14, inp15);
1150  src += (2 * src_stride);
1151  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1152  inp11, inp12, inp13, inp14,
1153  const20, const6, const3);
1154  res0 = __msa_aver_u_b(res0, inp10);
1155  ST_UB(res0, dst);
1156  dst += dst_stride;
1157 
1158  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1159  inp12, inp13, inp14, inp15,
1160  const20, const6, const3);
1161  res0 = __msa_aver_u_b(res0, inp11);
1162  ST_UB(res0, dst);
1163  dst += dst_stride;
1164 
      /* Last source row; the remaining outputs reuse rows already loaded. */
1165  inp16 = LD_UB(src);
1166  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1167  inp13, inp14, inp15, inp16,
1168  const20, const6, const3);
1169  res0 = __msa_aver_u_b(res0, inp12);
1170  ST_UB(res0, dst);
1171  dst += dst_stride;
1172 
      /* Output rows 13..15: inp16, inp15 and inp14 are repeated. */
1173  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1174  inp14, inp15, inp16, inp16,
1175  const20, const6, const3);
1176  res0 = __msa_aver_u_b(res0, inp13);
1177  ST_UB(res0, dst);
1178  dst += dst_stride;
1179 
1180  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1181  inp15, inp16, inp16, inp15,
1182  const20, const6, const3);
1183  res0 = __msa_aver_u_b(res0, inp14);
1184  ST_UB(res0, dst);
1185  dst += dst_stride;
1186 
1187  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1188  inp16, inp16, inp15, inp14,
1189  const20, const6, const3);
1190  res0 = __msa_aver_u_b(res0, inp15);
1191  ST_UB(res0, dst);
1192 }
1193 
1194 static void vert_mc_qpel_8x8_msa(const uint8_t *src,
1195  int32_t src_stride,
1196  uint8_t *dst,
1197  int32_t dst_stride)
1198 {
1199  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1200  v16u8 res0, res1;
1201  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1202  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1203  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1204 
1205  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1206  src += (4 * src_stride);
1207  LD_UB2(src, src_stride, inp4, inp5);
1208  src += (2 * src_stride);
1209  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1210  inp1, inp2, inp3, inp4,
1211  inp1, inp0, inp0, inp1,
1212  inp2, inp3, inp4, inp5,
1213  const20, const6, const3);
1214  LD_UB2(src, src_stride, inp6, inp7);
1215  src += (2 * src_stride);
1216  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1217  inp3, inp4, inp5, inp6,
1218  inp3, inp2, inp1, inp0,
1219  inp4, inp5, inp6, inp7,
1220  const20, const6, const3);
1221  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1222 
1223  inp8 = LD_UB(src);
1224  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1225  inp5, inp6, inp7, inp8,
1226  inp5, inp4, inp3, inp2,
1227  inp6, inp7, inp8, inp8,
1228  const20, const6, const3);
1229  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1230  inp7, inp8, inp8, inp7,
1231  inp7, inp6, inp5, inp4,
1232  inp8, inp8, inp7, inp6,
1233  const20, const6, const3);
1234  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1235 }
1236 
1238  int32_t src_stride,
1239  uint8_t *dst,
1240  int32_t dst_stride)
1241 {
      /*
       * Vertical filter for 16 rows of 16 bytes, with no blending stage.
       *
       * Seventeen source rows (inp0..inp16) are loaded one at a time as
       * the function progresses.  Each output row is the value returned
       * by APPLY_VERT_QPEL_FILTER (coefficients 20, 6 and 3) for eight
       * neighbouring rows and is stored directly with ST_UB.
       *
       * The argument lists for the first and last rows repeat
       * inp0/inp1 and inp16/inp15/inp14 rather than referencing rows
       * outside inp0..inp16.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1242  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1243  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1244  v16u8 res0;
1245  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1246  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1247  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1248 
      /* Output row 0. */
1249  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1250  src += (4 * src_stride);
1251  inp4 = LD_UB(src);
1252  src += src_stride;
1253  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1254  inp1, inp2, inp3, inp4,
1255  const20, const6, const3);
1256  ST_UB(res0, dst);
1257  dst += dst_stride;
1258 
      /* Output row 1. */
1259  inp5 = LD_UB(src);
1260  src += src_stride;
1261  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1262  inp2, inp3, inp4, inp5,
1263  const20, const6, const3);
1264  ST_UB(res0, dst);
1265  dst += dst_stride;
1266 
      /* Output row 2. */
1267  inp6 = LD_UB(src);
1268  src += src_stride;
1269  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1270  inp3, inp4, inp5, inp6,
1271  const20, const6, const3);
1272  ST_UB(res0, dst);
1273  dst += dst_stride;
1274 
      /* Output row 3: first row whose arguments are eight distinct rows. */
1275  inp7 = LD_UB(src);
1276  src += src_stride;
1277  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1278  inp4, inp5, inp6, inp7,
1279  const20, const6, const3);
1280  ST_UB(res0, dst);
1281  dst += dst_stride;
1282 
1283  inp8 = LD_UB(src);
1284  src += src_stride;
1285  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1286  inp5, inp6, inp7, inp8,
1287  const20, const6, const3);
1288  ST_UB(res0, dst);
1289  dst += dst_stride;
1290 
1291  inp9 = LD_UB(src);
1292  src += src_stride;
1293  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1294  inp6, inp7, inp8, inp9,
1295  const20, const6, const3);
1296  ST_UB(res0, dst);
1297  dst += dst_stride;
1298 
1299  inp10 = LD_UB(src);
1300  src += src_stride;
1301  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1302  inp7, inp8, inp9, inp10,
1303  const20, const6, const3);
1304  ST_UB(res0, dst);
1305  dst += dst_stride;
1306 
1307  inp11 = LD_UB(src);
1308  src += src_stride;
1309  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1310  inp8, inp9, inp10, inp11,
1311  const20, const6, const3);
1312  ST_UB(res0, dst);
1313  dst += dst_stride;
1314 
1315  inp12 = LD_UB(src);
1316  src += src_stride;
1317  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1318  inp9, inp10, inp11, inp12,
1319  const20, const6, const3);
1320  ST_UB(res0, dst);
1321  dst += dst_stride;
1322 
1323  inp13 = LD_UB(src);
1324  src += src_stride;
1325  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1326  inp10, inp11, inp12, inp13,
1327  const20, const6, const3);
1328  ST_UB(res0, dst);
1329  dst += dst_stride;
1330 
1331  inp14 = LD_UB(src);
1332  src += src_stride;
1333  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1334  inp11, inp12, inp13, inp14,
1335  const20, const6, const3);
1336  ST_UB(res0, dst);
1337  dst += dst_stride;
1338 
1339  inp15 = LD_UB(src);
1340  src += src_stride;
1341  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1342  inp12, inp13, inp14, inp15,
1343  const20, const6, const3);
1344  ST_UB(res0, dst);
1345  dst += dst_stride;
1346 
      /* Last source row; the remaining outputs reuse rows already loaded. */
1347  inp16 = LD_UB(src);
1348  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1349  inp13, inp14, inp15, inp16,
1350  const20, const6, const3);
1351  ST_UB(res0, dst);
1352  dst += dst_stride;
1353 
      /* Output rows 13..15: inp16, inp15 and inp14 are repeated. */
1354  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1355  inp14, inp15, inp16, inp16,
1356  const20, const6, const3);
1357  ST_UB(res0, dst);
1358  dst += dst_stride;
1359 
1360  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1361  inp15, inp16, inp16, inp15,
1362  const20, const6, const3);
1363  ST_UB(res0, dst);
1364  dst += dst_stride;
1365 
1366  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1367  inp16, inp16, inp15, inp14,
1368  const20, const6, const3);
1369  ST_UB(res0, dst);
      /* NOTE(review): dst is not read again after this final advance. */
1370  dst += dst_stride;
1371 }
1372 
1374  int32_t src_stride,
1375  uint8_t *dst,
1376  int32_t dst_stride)
1377 {
      /*
       * Vertical filter for an 8-row block of 8-byte rows, blended with
       * the source row one below the output row.
       *
       * Nine source rows (inp0..inp8) are loaded.  Four calls to
       * APPLY_VERT_QPEL_FILTER_8BYTE (coefficients 20, 6 and 3) produce
       * the output, which is averaged with source rows inp1..inp8 packed
       * two per vector, and stored as two groups of four rows.
       *
       * The argument lists near the first and last rows repeat inp0/inp1
       * and inp8/inp7 rather than referencing rows outside inp0..inp8.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1378  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1379  v16u8 tmp0, tmp1, res0, res1;
1380  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1381  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1382  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1383 
1384  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1385  src += (4 * src_stride);
1386  LD_UB2(src, src_stride, inp4, inp5);
1387  src += (2 * src_stride);
1388  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1389  inp1, inp2, inp3, inp4,
1390  inp1, inp0, inp0, inp1,
1391  inp2, inp3, inp4, inp5,
1392  const20, const6, const3);
1393 
1394  LD_UB2(src, src_stride, inp6, inp7);
1395  src += (2 * src_stride);
1396  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1397  inp3, inp4, inp5, inp6,
1398  inp3, inp2, inp1, inp0,
1399  inp4, inp5, inp6, inp7,
1400  const20, const6, const3);
      /* Pack source rows 1/2 and 3/4 (low doublewords) for the blend. */
1401  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1402  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1403  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1404  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1405 
      /* Ninth source row, loaded only after the first four rows are stored. */
1406  inp8 = LD_UB(src);
1407  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1408  inp5, inp6, inp7, inp8,
1409  inp5, inp4, inp3, inp2,
1410  inp6, inp7, inp8, inp8,
1411  const20, const6, const3);
1412  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1413  inp7, inp8, inp8, inp7,
1414  inp7, inp6, inp5, inp4,
1415  inp8, inp8, inp7, inp6,
1416  const20, const6, const3);
      /* Pack source rows 5/6 and 7/8 (low doublewords) for the blend. */
1417  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1418  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1419  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1420  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1421 }
1422 
1424  int32_t src_stride,
1425  uint8_t *dst,
1426  int32_t dst_stride)
1427 {
      /*
       * Vertical filter for 16 rows of 16 bytes, blended with the source
       * row one below the output row.
       *
       * Seventeen source rows (inp0..inp16) are loaded one at a time as
       * the function progresses.  Output row k is APPLY_VERT_QPEL_FILTER
       * (coefficients 20, 6 and 3) applied to eight neighbouring rows,
       * then averaged with inp[k + 1] using __msa_aver_u_b (the rounding
       * byte average of the MSA instruction set) and stored with ST_UB.
       *
       * The argument lists for the first and last rows repeat
       * inp0/inp1 and inp16/inp15/inp14 rather than referencing rows
       * outside inp0..inp16.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1428  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1429  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1430  v16u8 res0;
1431  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1432  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1433  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1434 
      /* Output row 0, blended with source row 1. */
1435  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1436  src += (4 * src_stride);
1437  inp4 = LD_UB(src);
1438  src += src_stride;
1439  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1440  inp1, inp2, inp3, inp4,
1441  const20, const6, const3);
1442  res0 = __msa_aver_u_b(res0, inp1);
1443  ST_UB(res0, dst);
1444  dst += dst_stride;
1445 
      /* Output row 1. */
1446  inp5 = LD_UB(src);
1447  src += src_stride;
1448  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1449  inp2, inp3, inp4, inp5,
1450  const20, const6, const3);
1451  res0 = __msa_aver_u_b(res0, inp2);
1452  ST_UB(res0, dst);
1453  dst += dst_stride;
1454 
      /* Output row 2. */
1455  inp6 = LD_UB(src);
1456  src += src_stride;
1457  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1458  inp3, inp4, inp5, inp6,
1459  const20, const6, const3);
1460  res0 = __msa_aver_u_b(res0, inp3);
1461  ST_UB(res0, dst);
1462  dst += dst_stride;
1463 
      /* Output row 3: first row whose arguments are eight distinct rows. */
1464  inp7 = LD_UB(src);
1465  src += src_stride;
1466  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1467  inp4, inp5, inp6, inp7,
1468  const20, const6, const3);
1469  res0 = __msa_aver_u_b(res0, inp4);
1470  ST_UB(res0, dst);
1471  dst += dst_stride;
1472 
1473  inp8 = LD_UB(src);
1474  src += src_stride;
1475  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1476  inp5, inp6, inp7, inp8,
1477  const20, const6, const3);
1478  res0 = __msa_aver_u_b(res0, inp5);
1479  ST_UB(res0, dst);
1480  dst += dst_stride;
1481 
1482  inp9 = LD_UB(src);
1483  src += src_stride;
1484  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1485  inp6, inp7, inp8, inp9,
1486  const20, const6, const3);
1487  res0 = __msa_aver_u_b(res0, inp6);
1488  ST_UB(res0, dst);
1489  dst += dst_stride;
1490 
1491  inp10 = LD_UB(src);
1492  src += src_stride;
1493  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1494  inp7, inp8, inp9, inp10,
1495  const20, const6, const3);
1496  res0 = __msa_aver_u_b(res0, inp7);
1497  ST_UB(res0, dst);
1498  dst += dst_stride;
1499 
1500  inp11 = LD_UB(src);
1501  src += src_stride;
1502  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1503  inp8, inp9, inp10, inp11,
1504  const20, const6, const3);
1505  res0 = __msa_aver_u_b(res0, inp8);
1506  ST_UB(res0, dst);
1507  dst += dst_stride;
1508 
1509  inp12 = LD_UB(src);
1510  src += src_stride;
1511  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1512  inp9, inp10, inp11, inp12,
1513  const20, const6, const3);
1514  res0 = __msa_aver_u_b(res0, inp9);
1515  ST_UB(res0, dst);
1516  dst += dst_stride;
1517 
1518  inp13 = LD_UB(src);
1519  src += src_stride;
1520  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1521  inp10, inp11, inp12, inp13,
1522  const20, const6, const3);
1523  res0 = __msa_aver_u_b(res0, inp10);
1524  ST_UB(res0, dst);
1525  dst += dst_stride;
1526 
1527  inp14 = LD_UB(src);
1528  src += src_stride;
1529  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1530  inp11, inp12, inp13, inp14,
1531  const20, const6, const3);
1532  res0 = __msa_aver_u_b(res0, inp11);
1533  ST_UB(res0, dst);
1534  dst += dst_stride;
1535 
1536  inp15 = LD_UB(src);
1537  src += src_stride;
1538  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1539  inp12, inp13, inp14, inp15,
1540  const20, const6, const3);
1541  res0 = __msa_aver_u_b(res0, inp12);
1542  ST_UB(res0, dst);
1543  dst += dst_stride;
1544 
      /* Last source row; the remaining outputs reuse rows already loaded. */
1545  inp16 = LD_UB(src);
1546  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1547  inp13, inp14, inp15, inp16,
1548  const20, const6, const3);
1549  res0 = __msa_aver_u_b(res0, inp13);
1550  ST_UB(res0, dst);
1551  dst += dst_stride;
1552 
      /* Output rows 13..15: inp16, inp15 and inp14 are repeated. */
1553  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1554  inp14, inp15, inp16, inp16,
1555  const20, const6, const3);
1556  res0 = __msa_aver_u_b(res0, inp14);
1557  ST_UB(res0, dst);
1558  dst += dst_stride;
1559 
1560  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1561  inp15, inp16, inp16, inp15,
1562  const20, const6, const3);
1563  res0 = __msa_aver_u_b(res0, inp15);
1564  ST_UB(res0, dst);
1565  dst += dst_stride;
1566 
1567  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1568  inp16, inp16, inp15, inp14,
1569  const20, const6, const3);
1570  res0 = __msa_aver_u_b(res0, inp16);
1571  ST_UB(res0, dst);
1572 }
1573 
1575  int32_t src_stride,
1576  uint8_t *dst,
1577  int32_t dst_stride)
1578 {
      /*
       * Vertical filter (NO_ROUND flavour) for an 8-row block of 8-byte
       * rows, blended with the source rows of the same index.
       *
       * Nine source rows (inp0..inp8) are loaded.  Four calls to
       * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE (coefficients 20, 6 and 3)
       * produce the output, which is averaged with source rows
       * inp0..inp7 packed two per vector.  The blend uses __msa_ave_u_b,
       * the byte average of the MSA instruction set that does not add a
       * rounding term.
       *
       * The argument lists near the first and last rows repeat inp0/inp1
       * and inp8/inp7 rather than referencing rows outside inp0..inp8.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1579  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1580  v16u8 tmp0, tmp1, res0, res1;
1581  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1582  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1583  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1584 
1585  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1586  src += (4 * src_stride);
1587  LD_UB2(src, src_stride, inp4, inp5);
1588  src += (2 * src_stride);
1589  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1590  inp1, inp2, inp3, inp4,
1591  inp1, inp0, inp0, inp1,
1592  inp2, inp3, inp4, inp5,
1593  const20, const6, const3);
1594  LD_UB2(src, src_stride, inp6, inp7);
1595  src += (2 * src_stride);
1596  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1597  inp3, inp4, inp5, inp6,
1598  inp3, inp2, inp1, inp0,
1599  inp4, inp5, inp6, inp7,
1600  const20, const6, const3);
      /* Pack source rows 0/1 and 2/3 (low doublewords) for the blend. */
1601  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1602  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1603  res0 = __msa_ave_u_b(res0, tmp0);
1604  res1 = __msa_ave_u_b(res1, tmp1);
1605  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1606 
      /* Ninth source row, loaded only after the first four rows are stored. */
1607  inp8 = LD_UB(src);
1608  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1609  inp5, inp6, inp7, inp8,
1610  inp5, inp4, inp3, inp2,
1611  inp6, inp7, inp8, inp8,
1612  const20, const6, const3);
1613  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1614  inp7, inp8, inp8, inp7,
1615  inp7, inp6, inp5, inp4,
1616  inp8, inp8, inp7, inp6,
1617  const20, const6, const3);
      /* Pack source rows 4/5 and 6/7 (low doublewords) for the blend. */
1618  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1619  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1620  res0 = __msa_ave_u_b(res0, tmp0);
1621  res1 = __msa_ave_u_b(res1, tmp1);
1622  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1623 }
1624 
1626  int32_t src_stride,
1627  uint8_t *dst,
1628  int32_t dst_stride)
1629 {
      /*
       * Vertical filter (NO_ROUND flavour) for 16 rows of 16 bytes,
       * blended with the source row of the same index.
       *
       * Seventeen source rows (inp0..inp16) are loaded as the function
       * progresses.  Output row k is APPLY_VERT_QPEL_NO_ROUND_FILTER
       * (coefficients 20, 6 and 3) applied to eight neighbouring rows,
       * then averaged with inp[k] using __msa_ave_u_b (the byte average
       * of the MSA instruction set that does not add a rounding term)
       * and stored with ST_UB.
       *
       * The argument lists for the first and last rows repeat
       * inp0/inp1 and inp16/inp15/inp14 rather than referencing rows
       * outside inp0..inp16.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1630  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1631  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1632  v16u8 res0;
1633  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1634  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1635  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1636 
      /* Output row 0. */
1637  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1638  src += (5 * src_stride);
1639  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1640  inp1, inp2, inp3, inp4,
1641  const20, const6, const3);
1642  res0 = __msa_ave_u_b(res0, inp0);
1643  ST_UB(res0, dst);
1644  dst += dst_stride;
1645 
      /* Output row 1. */
1646  inp5 = LD_UB(src);
1647  src += src_stride;
1648  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1649  inp2, inp3, inp4, inp5,
1650  const20, const6, const3);
1651  res0 = __msa_ave_u_b(res0, inp1);
1652  ST_UB(res0, dst);
1653  dst += dst_stride;
1654 
      /* Output row 2. */
1655  inp6 = LD_UB(src);
1656  src += src_stride;
1657  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1658  inp3, inp4, inp5, inp6,
1659  const20, const6, const3);
1660  res0 = __msa_ave_u_b(res0, inp2);
1661  ST_UB(res0, dst);
1662  dst += dst_stride;
1663 
      /* Output row 3: first row whose arguments are eight distinct rows. */
1664  inp7 = LD_UB(src);
1665  src += src_stride;
1666  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1667  inp4, inp5, inp6, inp7,
1668  const20, const6, const3);
1669  res0 = __msa_ave_u_b(res0, inp3);
1670  ST_UB(res0, dst);
1671  dst += dst_stride;
1672 
1673  inp8 = LD_UB(src);
1674  src += src_stride;
1675  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1676  inp5, inp6, inp7, inp8,
1677  const20, const6, const3);
1678  res0 = __msa_ave_u_b(res0, inp4);
1679  ST_UB(res0, dst);
1680  dst += dst_stride;
1681 
1682  inp9 = LD_UB(src);
1683  src += src_stride;
1684  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1685  inp6, inp7, inp8, inp9,
1686  const20, const6, const3);
1687  res0 = __msa_ave_u_b(res0, inp5);
1688  ST_UB(res0, dst);
1689  dst += dst_stride;
1690 
1691  inp10 = LD_UB(src);
1692  src += src_stride;
1693  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1694  inp7, inp8, inp9, inp10,
1695  const20, const6, const3);
1696  res0 = __msa_ave_u_b(res0, inp6);
1697  ST_UB(res0, dst);
1698  dst += dst_stride;
1699 
1700  inp11 = LD_UB(src);
1701  src += src_stride;
1702  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1703  inp8, inp9, inp10, inp11,
1704  const20, const6, const3);
1705  res0 = __msa_ave_u_b(res0, inp7);
1706  ST_UB(res0, dst);
1707  dst += dst_stride;
1708 
1709  inp12 = LD_UB(src);
1710  src += src_stride;
1711  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1712  inp9, inp10, inp11, inp12,
1713  const20, const6, const3);
1714  res0 = __msa_ave_u_b(res0, inp8);
1715  ST_UB(res0, dst);
1716  dst += dst_stride;
1717 
1718  inp13 = LD_UB(src);
1719  src += src_stride;
1720  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1721  inp10, inp11, inp12, inp13,
1722  const20, const6, const3);
1723  res0 = __msa_ave_u_b(res0, inp9);
1724  ST_UB(res0, dst);
1725  dst += dst_stride;
1726 
1727  inp14 = LD_UB(src);
1728  src += src_stride;
1729  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1730  inp11, inp12, inp13, inp14,
1731  const20, const6, const3);
1732  res0 = __msa_ave_u_b(res0, inp10);
1733  ST_UB(res0, dst);
1734  dst += dst_stride;
1735 
1736  inp15 = LD_UB(src);
1737  src += src_stride;
1738  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1739  inp12, inp13, inp14, inp15,
1740  const20, const6, const3);
1741  res0 = __msa_ave_u_b(res0, inp11);
1742  ST_UB(res0, dst);
1743  dst += dst_stride;
1744 
      /* Last source row; the remaining outputs reuse rows already loaded. */
1745  inp16 = LD_UB(src);
1746  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1747  inp13, inp14, inp15, inp16,
1748  const20, const6, const3);
1749  res0 = __msa_ave_u_b(res0, inp12);
1750  ST_UB(res0, dst);
1751  dst += dst_stride;
1752 
      /* Output rows 13..15: inp16, inp15 and inp14 are repeated. */
1753  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1754  inp14, inp15, inp16, inp16,
1755  const20, const6, const3);
1756  res0 = __msa_ave_u_b(res0, inp13);
1757  ST_UB(res0, dst);
1758  dst += dst_stride;
1759 
1760  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1761  inp15, inp16, inp16, inp15,
1762  const20, const6, const3);
1763  res0 = __msa_ave_u_b(res0, inp14);
1764  ST_UB(res0, dst);
1765  dst += dst_stride;
1766 
1767  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1768  inp16, inp16, inp15, inp14,
1769  const20, const6, const3);
1770  res0 = __msa_ave_u_b(res0, inp15);
1771  ST_UB(res0, dst);
      /* NOTE(review): dst is not read again after this final advance. */
1772  dst += dst_stride;
1773 }
1774 
1776  int32_t src_stride,
1777  uint8_t *dst,
1778  int32_t dst_stride)
1779 {
      /*
       * Vertical filter (NO_ROUND flavour) for an 8-row block of 8-byte
       * rows, with no blending stage.
       *
       * Nine source rows (inp0..inp8) are loaded.  Four calls to
       * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE (coefficients 20, 6 and 3)
       * produce the output, which is stored directly as two groups of
       * four rows.
       *
       * The argument lists near the first and last rows repeat inp0/inp1
       * and inp8/inp7 rather than referencing rows outside inp0..inp8.
       * NOTE(review): presumably this mirrors the block edges - confirm
       * against the macro definition, which is outside this listing.
       * NOTE(review): src is used below but its declaration (the first
       * line of the signature) is not part of this listing.
       */
1780  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1781  v16u8 res0, res1;
1782  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1783  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1784  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1785 
1786  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1787  src += (4 * src_stride);
1788  LD_UB2(src, src_stride, inp4, inp5);
1789  src += (2 * src_stride);
1790  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1791  inp1, inp2, inp3, inp4,
1792  inp1, inp0, inp0, inp1,
1793  inp2, inp3, inp4, inp5,
1794  const20, const6, const3);
1795  LD_UB2(src, src_stride, inp6, inp7);
1796  src += (2 * src_stride);
1797  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1798  inp3, inp4, inp5, inp6,
1799  inp3, inp2, inp1, inp0,
1800  inp4, inp5, inp6, inp7,
1801  const20, const6, const3);
      /* Presumably writes doublewords 0/1 of res0, then of res1, to 4 rows. */
1802  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1803 
      /* Ninth source row, loaded only after the first four rows are stored. */
1804  inp8 = LD_UB(src);
1805  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1806  inp5, inp6, inp7, inp8,
1807  inp5, inp4, inp3, inp2,
1808  inp6, inp7, inp8, inp8,
1809  const20, const6, const3);
1810  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1811  inp7, inp8, inp8, inp7,
1812  inp7, inp6, inp5, inp4,
1813  inp8, inp8, inp7, inp6,
1814  const20, const6, const3);
1815  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1816 }
1817 
1819  int32_t src_stride,
1820  uint8_t *dst,
1821  int32_t dst_stride)
1822 {
      /*
       * Vertical quarter-pel filter producing sixteen output rows with the
       * NO_ROUND variant of the filter macro.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Seventeen source rows (inp0..inp16) are read from src, src_stride
       * apart, and sixteen result vectors are stored to dst, dst_stride
       * apart. Each v16u8 load/store presumably covers 16 bytes - confirm
       * against the LD_UB/ST_UB definitions.
       *
       * For output row k the macro is given rows k, k-1, k-2, k-3 followed
       * by rows k+1, k+2, k+3, k+4. Rows that would lie above inp0 or below
       * inp16 are never read; the argument lists substitute mirrored rows
       * instead (-1 -> inp0, -2 -> inp1, -3 -> inp2 at the top and
       * 17 -> inp16, 18 -> inp15, 19 -> inp14 at the bottom).
       */
1823  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1824  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1825  v16u8 res0;
      /* Filter coefficients 20, 6 and 3, each splatted into a byte vector. */
1826  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1827  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1828  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1829 
      /* Output row 0 needs rows 0..4; later rows load one new row each. */
1830  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1831  src += (5 * src_stride);
1832  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1833  inp1, inp2, inp3, inp4,
1834  const20, const6, const3);
1835  ST_UB(res0, dst);
1836  dst += dst_stride;
1837 
1838  inp5 = LD_UB(src);
1839  src += src_stride;
1840  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1841  inp2, inp3, inp4, inp5,
1842  const20, const6, const3);
1843  ST_UB(res0, dst);
1844  dst += dst_stride;
1845 
1846  inp6 = LD_UB(src);
1847  src += src_stride;
1848  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1849  inp3, inp4, inp5, inp6,
1850  const20, const6, const3);
1851  ST_UB(res0, dst);
1852  dst += dst_stride;
1853 
      /* From output row 3 on, all upper taps are real rows (no mirroring). */
1854  inp7 = LD_UB(src);
1855  src += src_stride;
1856  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1857  inp4, inp5, inp6, inp7,
1858  const20, const6, const3);
1859  ST_UB(res0, dst);
1860  dst += dst_stride;
1861 
1862  inp8 = LD_UB(src);
1863  src += src_stride;
1864  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1865  inp5, inp6, inp7, inp8,
1866  const20, const6, const3);
1867  ST_UB(res0, dst);
1868  dst += dst_stride;
1869 
1870  inp9 = LD_UB(src);
1871  src += src_stride;
1872  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1873  inp6, inp7, inp8, inp9,
1874  const20, const6, const3);
1875  ST_UB(res0, dst);
1876  dst += dst_stride;
1877 
1878  inp10 = LD_UB(src);
1879  src += src_stride;
1880  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1881  inp7, inp8, inp9, inp10,
1882  const20, const6, const3);
1883  ST_UB(res0, dst);
1884  dst += dst_stride;
1885 
1886  inp11 = LD_UB(src);
1887  src += src_stride;
1888  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1889  inp8, inp9, inp10, inp11,
1890  const20, const6, const3);
1891  ST_UB(res0, dst);
1892  dst += dst_stride;
1893 
1894  inp12 = LD_UB(src);
1895  src += src_stride;
1896  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1897  inp9, inp10, inp11, inp12,
1898  const20, const6, const3);
1899  ST_UB(res0, dst);
1900  dst += dst_stride;
1901 
1902  inp13 = LD_UB(src);
1903  src += src_stride;
1904  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1905  inp10, inp11, inp12, inp13,
1906  const20, const6, const3);
1907  ST_UB(res0, dst);
1908  dst += dst_stride;
1909 
1910  inp14 = LD_UB(src);
1911  src += src_stride;
1912  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1913  inp11, inp12, inp13, inp14,
1914  const20, const6, const3);
1915  ST_UB(res0, dst);
1916  dst += dst_stride;
1917 
1918  inp15 = LD_UB(src);
1919  src += src_stride;
1920  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1921  inp12, inp13, inp14, inp15,
1922  const20, const6, const3);
1923  ST_UB(res0, dst);
1924  dst += dst_stride;
1925 
      /* Final source row; src is not advanced because nothing follows it. */
1926  inp16 = LD_UB(src);
1927  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1928  inp13, inp14, inp15, inp16,
1929  const20, const6, const3);
1930  ST_UB(res0, dst);
1931  dst += dst_stride;
1932 
      /* Output rows 13..15: no more loads, lower taps mirror around inp16. */
1933  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1934  inp14, inp15, inp16, inp16,
1935  const20, const6, const3);
1936  ST_UB(res0, dst);
1937  dst += dst_stride;
1938 
1939  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1940  inp15, inp16, inp16, inp15,
1941  const20, const6, const3);
1942  ST_UB(res0, dst);
1943  dst += dst_stride;
1944 
1945  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1946  inp16, inp16, inp15, inp14,
1947  const20, const6, const3);
1948  ST_UB(res0, dst);
1949 }
1950 
1952  int32_t src_stride,
1953  uint8_t *dst,
1954  int32_t dst_stride)
1955 {
      /*
       * Vertical quarter-pel filter (NO_ROUND macro variant) for a block of
       * eight output rows, each result then averaged with the source row
       * one below it.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Nine source rows (inp0..inp8) are loaded. Every _8BYTE macro call
       * takes two complete 8-row tap sets, i.e. it yields two output rows
       * per call - presumably packed as the two 64-bit halves of the
       * returned vector; confirm against the macro definition.
       * Taps outside inp0..inp8 are replaced by mirrored rows in the
       * argument lists rather than being read from memory.
       */
1956  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1957  v16u8 tmp0, tmp1, res0, res1;
1958  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1959  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1960  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1961 
1962  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1963  src += (4 * src_stride);
1964  LD_UB2(src, src_stride, inp4, inp5);
1965  src += (2 * src_stride);
      /* Output rows 0 and 1. */
1966  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1967  inp1, inp2, inp3, inp4,
1968  inp1, inp0, inp0, inp1,
1969  inp2, inp3, inp4, inp5,
1970  const20, const6, const3);
1971  LD_UB2(src, src_stride, inp6, inp7);
1972  src += (2 * src_stride);
      /* Output rows 2 and 3. */
1973  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1974  inp3, inp4, inp5, inp6,
1975  inp3, inp2, inp1, inp0,
1976  inp4, inp5, inp6, inp7,
1977  const20, const6, const3);
      /*
       * Pair up source rows k+1 for output rows k: (1,2) and (3,4). insve_d
       * with index 1 presumably copies doubleword 0 of the last operand into
       * doubleword 1 of the first - confirm against the MSA intrinsic docs.
       */
1978  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1979  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
      /* ave_u_b: unsigned byte average (the non-"aver" intrinsic is used). */
1980  res0 = __msa_ave_u_b(res0, tmp0);
1981  res1 = __msa_ave_u_b(res1, tmp1);
      /* Presumably writes doublewords 0,1 of res0 and res1 to four dst rows. */
1982  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1983 
      /* Last source row; output rows 4..7 mirror their lower taps around it. */
1984  inp8 = LD_UB(src);
1985  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1986  inp5, inp6, inp7, inp8,
1987  inp5, inp4, inp3, inp2,
1988  inp6, inp7, inp8, inp8,
1989  const20, const6, const3);
1990  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1991  inp7, inp8, inp8, inp7,
1992  inp7, inp6, inp5, inp4,
1993  inp8, inp8, inp7, inp6,
1994  const20, const6, const3);
      /* Source rows (5,6) and (7,8) are averaged into output rows 4..7. */
1995  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1996  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1997  res0 = __msa_ave_u_b(res0, tmp0);
1998  res1 = __msa_ave_u_b(res1, tmp1);
1999  ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2000 }
2001 
2003  int32_t src_stride,
2004  uint8_t *dst,
2005  int32_t dst_stride)
2006 {
      /*
       * Vertical quarter-pel filter (NO_ROUND macro variant) producing
       * sixteen output rows; each filtered row k is then averaged with
       * source row k+1 using __msa_ave_u_b before being stored.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Seventeen source rows (inp0..inp16) are loaded, src_stride apart.
       * For output row k the macro receives rows k, k-1, k-2, k-3 and
       * k+1, k+2, k+3, k+4; taps outside inp0..inp16 are replaced by
       * mirrored rows in the argument lists instead of being read.
       */
2007  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2008  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2009  v16u8 res0;
2010  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2011  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2012  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2013 
2014  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2015  src += (5 * src_stride);
2016  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
2017  inp1, inp2, inp3, inp4,
2018  const20, const6, const3);
      /* Output row 0 is blended with source row 1 (the row below). */
2019  res0 = __msa_ave_u_b(res0, inp1);
2020  ST_UB(res0, dst);
2021  dst += dst_stride;
2022 
2023  inp5 = LD_UB(src);
2024  src += src_stride;
2025  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
2026  inp2, inp3, inp4, inp5,
2027  const20, const6, const3);
2028  res0 = __msa_ave_u_b(res0, inp2);
2029  ST_UB(res0, dst);
2030  dst += dst_stride;
2031 
2032  inp6 = LD_UB(src);
2033  src += src_stride;
2034  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
2035  inp3, inp4, inp5, inp6,
2036  const20, const6, const3);
2037  res0 = __msa_ave_u_b(res0, inp3);
2038  ST_UB(res0, dst);
2039  dst += dst_stride;
2040 
2041  inp7 = LD_UB(src);
2042  src += src_stride;
2043  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
2044  inp4, inp5, inp6, inp7,
2045  const20, const6, const3);
2046  res0 = __msa_ave_u_b(res0, inp4);
2047  ST_UB(res0, dst);
2048  dst += dst_stride;
2049 
2050  inp8 = LD_UB(src);
2051  src += src_stride;
2052  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
2053  inp5, inp6, inp7, inp8,
2054  const20, const6, const3);
2055  res0 = __msa_ave_u_b(res0, inp5);
2056  ST_UB(res0, dst);
2057  dst += dst_stride;
2058 
2059  inp9 = LD_UB(src);
2060  src += src_stride;
2061  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
2062  inp6, inp7, inp8, inp9,
2063  const20, const6, const3);
2064  res0 = __msa_ave_u_b(res0, inp6);
2065  ST_UB(res0, dst);
2066  dst += dst_stride;
2067 
2068  inp10 = LD_UB(src);
2069  src += src_stride;
2070  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
2071  inp7, inp8, inp9, inp10,
2072  const20, const6, const3);
2073  res0 = __msa_ave_u_b(res0, inp7);
2074  ST_UB(res0, dst);
2075  dst += dst_stride;
2076 
2077  inp11 = LD_UB(src);
2078  src += src_stride;
2079  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
2080  inp8, inp9, inp10, inp11,
2081  const20, const6, const3);
2082  res0 = __msa_ave_u_b(res0, inp8);
2083  ST_UB(res0, dst);
2084  dst += dst_stride;
2085 
2086  inp12 = LD_UB(src);
2087  src += src_stride;
2088  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
2089  inp9, inp10, inp11, inp12,
2090  const20, const6, const3);
2091  res0 = __msa_ave_u_b(res0, inp9);
2092  ST_UB(res0, dst);
2093  dst += dst_stride;
2094 
2095  inp13 = LD_UB(src);
2096  src += src_stride;
2097  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
2098  inp10, inp11, inp12, inp13,
2099  const20, const6, const3);
2100  res0 = __msa_ave_u_b(res0, inp10);
2101  ST_UB(res0, dst);
2102  dst += dst_stride;
2103 
2104  inp14 = LD_UB(src);
2105  src += src_stride;
2106  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
2107  inp11, inp12, inp13, inp14,
2108  const20, const6, const3);
2109  res0 = __msa_ave_u_b(res0, inp11);
2110  ST_UB(res0, dst);
2111  dst += dst_stride;
2112 
2113  inp15 = LD_UB(src);
2114  src += src_stride;
2115  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
2116  inp12, inp13, inp14, inp15,
2117  const20, const6, const3);
2118  res0 = __msa_ave_u_b(res0, inp12);
2119  ST_UB(res0, dst);
2120  dst += dst_stride;
2121 
      /* Final source row; src is not advanced because nothing follows it. */
2122  inp16 = LD_UB(src);
2123  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
2124  inp13, inp14, inp15, inp16,
2125  const20, const6, const3);
2126  res0 = __msa_ave_u_b(res0, inp13);
2127  ST_UB(res0, dst);
2128  dst += dst_stride;
2129 
      /* Output rows 13..15: no more loads, lower taps mirror around inp16. */
2130  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
2131  inp14, inp15, inp16, inp16,
2132  const20, const6, const3);
2133  res0 = __msa_ave_u_b(res0, inp14);
2134  ST_UB(res0, dst);
2135  dst += dst_stride;
2136 
2137  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
2138  inp15, inp16, inp16, inp15,
2139  const20, const6, const3);
2140  res0 = __msa_ave_u_b(res0, inp15);
2141  ST_UB(res0, dst);
2142  dst += dst_stride;
2143 
2144  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
2145  inp16, inp16, inp15, inp14,
2146  const20, const6, const3);
2147  res0 = __msa_ave_u_b(res0, inp16);
2148  ST_UB(res0, dst);
2149 }
2150 
2152  int32_t src_stride,
2153  uint8_t *dst,
2154  int32_t dst_stride)
2155 {
      /*
       * Vertical quarter-pel filter for a block of eight output rows. Each
       * filtered row k is averaged first with source row k itself and then
       * with the bytes already present in dst, and the result is written
       * back to dst.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Nine source rows (inp0..inp8) are loaded. Every _8BYTE macro call
       * takes two complete tap sets and so yields two output rows per call
       * (presumably one per 64-bit half of the result - confirm against the
       * macro definition). Taps outside inp0..inp8 are mirrored in the
       * argument lists rather than read from memory.
       * NOTE(review): LD_UB4 on dst presumably reads a full vector per row
       * although only the low doubleword of each is used - confirm callers
       * tolerate that read width.
       */
2156  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2157  v16u8 dst0, dst1, dst2, dst3;
2158  v16u8 tmp0, tmp1, res0, res1;
2159  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2160  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2161  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2162 
2163  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2164  src += (4 * src_stride);
2165  LD_UB2(src, src_stride, inp4, inp5);
2166  src += (2 * src_stride);
      /* Output rows 0 and 1. */
2167  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2168  inp1, inp2, inp3, inp4,
2169  inp1, inp0, inp0, inp1,
2170  inp2, inp3, inp4, inp5,
2171  const20, const6, const3);
2172 
2173  LD_UB2(src, src_stride, inp6, inp7);
2174  src += (2 * src_stride);
      /* Output rows 2 and 3. */
2175  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2176  inp3, inp4, inp5, inp6,
2177  inp3, inp2, inp1, inp0,
2178  inp4, inp5, inp6, inp7,
2179  const20, const6, const3);
2180 
2181  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
      /* Pair source rows (0,1) and (2,3): the same rows as the outputs. */
2182  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2183  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
      /* Pair the existing dst rows the same way so they line up with res0/1. */
2184  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2185  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
      /* First average: filter result with source; second: with old dst. */
2186  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2187  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2188  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2189  dst += (4 * dst_stride);
2190 
      /* Last source row; output rows 4..7 mirror their lower taps around it. */
2191  inp8 = LD_UB(src);
2192  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2193  inp5, inp6, inp7, inp8,
2194  inp5, inp4, inp3, inp2,
2195  inp6, inp7, inp8, inp8,
2196  const20, const6, const3);
2197  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2198  inp7, inp8, inp8, inp7,
2199  inp7, inp6, inp5, inp4,
2200  inp8, inp8, inp7, inp6,
2201  const20, const6, const3);
2202 
2203  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
      /* Source rows (4,5) and (6,7) for output rows 4..7. */
2204  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2205  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2206  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2207  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2208  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2209  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2210  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2211 }
2212 
2214  int32_t src_stride,
2215  uint8_t *dst,
2216  int32_t dst_stride)
2217 {
      /*
       * Vertical quarter-pel filter producing sixteen output rows, two per
       * step. Each filtered row k is averaged first with source row k and
       * then with the row already stored in dst, and written back to dst.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Seventeen source rows (inp0..inp16) are loaded, src_stride apart.
       * For output row k the macro receives rows k, k-1, k-2, k-3 and
       * k+1, k+2, k+3, k+4; taps outside inp0..inp16 are replaced by
       * mirrored rows in the argument lists instead of being read.
       */
2218  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2219  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2220  v16u8 res0, res1, dst0, dst1;
2221  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2222  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2223  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2224 
2225  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2226  src += (5 * src_stride);
2227  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2228  inp1, inp2, inp3, inp4,
2229  const20, const6, const3);
2230 
2231  inp5 = LD_UB(src);
2232  src += src_stride;
2233  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2234  inp2, inp3, inp4, inp5,
2235  const20, const6, const3);
2236 
      /* Rows 0,1: blend with source rows 0,1, then with the old dst rows. */
2237  LD_UB2(dst, dst_stride, dst0, dst1);
2238  AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
2239  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2240  ST_UB2(res0, res1, dst, dst_stride);
2241  dst += (2 * dst_stride);
2242 
2243  inp6 = LD_UB(src);
2244  src += src_stride;
2245  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2246  inp3, inp4, inp5, inp6,
2247  const20, const6, const3);
2248 
2249  inp7 = LD_UB(src);
2250  src += src_stride;
2251  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2252  inp4, inp5, inp6, inp7,
2253  const20, const6, const3);
2254 
2255  LD_UB2(dst, dst_stride, dst0, dst1);
2256  AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
2257  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2258  ST_UB2(res0, res1, dst, dst_stride);
2259  dst += (2 * dst_stride);
2260 
      /* From here on the two new source rows of each step are loaded together. */
2261  LD_UB2(src, src_stride, inp8, inp9);
2262  src += (2 * src_stride);
2263  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2264  inp5, inp6, inp7, inp8,
2265  const20, const6, const3);
2266  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2267  inp6, inp7, inp8, inp9,
2268  const20, const6, const3);
2269 
2270  LD_UB2(dst, dst_stride, dst0, dst1);
2271  AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
2272  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2273  ST_UB2(res0, res1, dst, dst_stride);
2274  dst += (2 * dst_stride);
2275 
2276  LD_UB2(src, src_stride, inp10, inp11);
2277  src += (2 * src_stride);
2278  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2279  inp7, inp8, inp9, inp10,
2280  const20, const6, const3);
2281  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2282  inp8, inp9, inp10, inp11,
2283  const20, const6, const3);
2284 
2285  LD_UB2(dst, dst_stride, dst0, dst1);
2286  AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
2287  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2288  ST_UB2(res0, res1, dst, dst_stride);
2289  dst += (2 * dst_stride);
2290 
2291  LD_UB2(src, src_stride, inp12, inp13);
2292  src += (2 * src_stride);
2293  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2294  inp9, inp10, inp11, inp12,
2295  const20, const6, const3);
2296  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2297  inp10, inp11, inp12, inp13,
2298  const20, const6, const3);
2299  LD_UB2(dst, dst_stride, dst0, dst1);
2300  AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
2301  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2302  ST_UB2(res0, res1, dst, dst_stride);
2303  dst += (2 * dst_stride);
2304 
2305  LD_UB2(src, src_stride, inp14, inp15);
2306  src += (2 * src_stride);
2307  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2308  inp11, inp12, inp13, inp14,
2309  const20, const6, const3);
2310  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2311  inp12, inp13, inp14, inp15,
2312  const20, const6, const3);
2313 
2314  LD_UB2(dst, dst_stride, dst0, dst1);
2315  AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2316  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2317  ST_UB2(res0, res1, dst, dst_stride);
2318  dst += (2 * dst_stride);
2319 
      /* Final source row; rows 13..15 mirror their lower taps around it. */
2320  inp16 = LD_UB(src);
2321  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2322  inp13, inp14, inp15, inp16,
2323  const20, const6, const3);
2324  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2325  inp14, inp15, inp16, inp16,
2326  const20, const6, const3);
2327  LD_UB2(dst, dst_stride, dst0, dst1);
2328  AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2329  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2330  ST_UB2(res0, res1, dst, dst_stride);
2331  dst += (2 * dst_stride);
2332 
2333  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2334  inp15, inp16, inp16, inp15,
2335  const20, const6, const3);
2336  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2337  inp16, inp16, inp15, inp14,
2338  const20, const6, const3);
2339  LD_UB2(dst, dst_stride, dst0, dst1);
2340  AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2341  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2342  ST_UB2(res0, res1, dst, dst_stride);
2343 }
2344 
2346  int32_t src_stride,
2347  uint8_t *dst,
2348  int32_t dst_stride)
2349 {
      /*
       * Vertical quarter-pel filter for a block of eight output rows; each
       * filtered row is averaged with the bytes already present in dst and
       * written back to dst. No source row is blended in.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Nine source rows (inp0..inp8) are loaded. Every _8BYTE macro call
       * takes two complete tap sets and so yields two output rows per call
       * (presumably one per 64-bit half of the result - confirm against the
       * macro definition). Taps outside inp0..inp8 are mirrored in the
       * argument lists rather than read from memory.
       */
2350  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2351  v16u8 dst0, dst1, dst2, dst3;
2352  v16u8 res0, res1;
2353  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2354  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2355  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2356 
2357  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2358  src += (4 * src_stride);
2359  LD_UB2(src, src_stride, inp4, inp5);
2360  src += (2 * src_stride);
      /* Output rows 0 and 1. */
2361  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2362  inp1, inp2, inp3, inp4,
2363  inp1, inp0, inp0, inp1,
2364  inp2, inp3, inp4, inp5,
2365  const20, const6, const3);
2366  LD_UB2(src, src_stride, inp6, inp7);
2367  src += (2 * src_stride);
      /* Output rows 2 and 3. */
2368  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2369  inp3, inp4, inp5, inp6,
2370  inp3, inp2, inp1, inp0,
2371  inp4, inp5, inp6, inp7,
2372  const20, const6, const3);
2373  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
      /* Pair the existing dst rows (0,1) and (2,3) to line up with res0/1. */
2374  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2375  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2376  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2377  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2378  dst += (4 * dst_stride);
2379 
      /* Last source row; output rows 4..7 mirror their lower taps around it. */
2380  inp8 = LD_UB(src);
2381  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2382  inp5, inp6, inp7, inp8,
2383  inp5, inp4, inp3, inp2,
2384  inp6, inp7, inp8, inp8,
2385  const20, const6, const3);
2386  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2387  inp7, inp8, inp8, inp7,
2388  inp7, inp6, inp5, inp4,
2389  inp8, inp8, inp7, inp6,
2390  const20, const6, const3);
2391  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2392  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2393  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2394  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2395  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2396 }
2397 
2399  int32_t src_stride,
2400  uint8_t *dst,
2401  int32_t dst_stride)
2402 {
      /*
       * Vertical quarter-pel filter producing sixteen output rows, two per
       * step; each filtered row is averaged with the row already stored in
       * dst and written back to dst. No source row is blended in.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Seventeen source rows (inp0..inp16) are loaded, src_stride apart.
       * For output row k the macro receives rows k, k-1, k-2, k-3 and
       * k+1, k+2, k+3, k+4; taps outside inp0..inp16 are replaced by
       * mirrored rows in the argument lists instead of being read.
       */
2403  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2404  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2405  v16u8 res0, res1, dst0, dst1;
2406  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2407  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2408  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2409 
2410  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2411  src += (5 * src_stride);
2412  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2413  inp1, inp2, inp3, inp4,
2414  const20, const6, const3);
2415  inp5 = LD_UB(src);
2416  src += src_stride;
2417  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2418  inp2, inp3, inp4, inp5,
2419  const20, const6, const3);
      /* Rows 0,1: blend the filter output with the old dst rows. */
2420  LD_UB2(dst, dst_stride, dst0, dst1);
2421  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2422  ST_UB2(res0, res1, dst, dst_stride);
2423  dst += (2 * dst_stride);
2424 
2425  inp6 = LD_UB(src);
2426  src += src_stride;
2427  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2428  inp3, inp4, inp5, inp6,
2429  const20, const6, const3);
2430  inp7 = LD_UB(src);
2431  src += src_stride;
2432  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2433  inp4, inp5, inp6, inp7,
2434  const20, const6, const3);
2435  LD_UB2(dst, dst_stride, dst0, dst1);
2436  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2437  ST_UB2(res0, res1, dst, dst_stride);
2438  dst += (2 * dst_stride);
2439 
2440  inp8 = LD_UB(src);
2441  src += src_stride;
2442  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2443  inp5, inp6, inp7, inp8,
2444  const20, const6, const3);
2445  inp9 = LD_UB(src);
2446  src += src_stride;
2447  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2448  inp6, inp7, inp8, inp9,
2449  const20, const6, const3);
2450  LD_UB2(dst, dst_stride, dst0, dst1);
2451  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2452  ST_UB2(res0, res1, dst, dst_stride);
2453  dst += (2 * dst_stride);
2454 
2455  inp10 = LD_UB(src);
2456  src += src_stride;
2457  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2458  inp7, inp8, inp9, inp10,
2459  const20, const6, const3);
2460  inp11 = LD_UB(src);
2461  src += src_stride;
2462  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2463  inp8, inp9, inp10, inp11,
2464  const20, const6, const3);
2465  LD_UB2(dst, dst_stride, dst0, dst1);
2466  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2467  ST_UB2(res0, res1, dst, dst_stride);
2468  dst += (2 * dst_stride);
2469 
2470  inp12 = LD_UB(src);
2471  src += src_stride;
2472  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2473  inp9, inp10, inp11, inp12,
2474  const20, const6, const3);
2475  inp13 = LD_UB(src);
2476  src += src_stride;
2477  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2478  inp10, inp11, inp12, inp13,
2479  const20, const6, const3);
2480  LD_UB2(dst, dst_stride, dst0, dst1);
2481  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2482  ST_UB2(res0, res1, dst, dst_stride);
2483  dst += (2 * dst_stride);
2484 
2485  inp14 = LD_UB(src);
2486  src += src_stride;
2487  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2488  inp11, inp12, inp13, inp14,
2489  const20, const6, const3);
2490  inp15 = LD_UB(src);
2491  src += src_stride;
2492  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2493  inp12, inp13, inp14, inp15,
2494  const20, const6, const3);
2495  LD_UB2(dst, dst_stride, dst0, dst1);
2496  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2497  ST_UB2(res0, res1, dst, dst_stride);
2498  dst += (2 * dst_stride);
2499 
      /* Final source row; rows 13..15 mirror their lower taps around it. */
2500  inp16 = LD_UB(src);
2501  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2502  inp13, inp14, inp15, inp16,
2503  const20, const6, const3);
2504  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2505  inp14, inp15, inp16, inp16,
2506  const20, const6, const3);
2507  LD_UB2(dst, dst_stride, dst0, dst1);
2508  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2509  ST_UB2(res0, res1, dst, dst_stride);
2510  dst += (2 * dst_stride);
2511 
2512  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2513  inp15, inp16, inp16, inp15,
2514  const20, const6, const3);
2515  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2516  inp16, inp16, inp15, inp14,
2517  const20, const6, const3);
2518  LD_UB2(dst, dst_stride, dst0, dst1);
2519  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2520  ST_UB2(res0, res1, dst, dst_stride);
2521 }
2522 
2524  int32_t src_stride,
2525  uint8_t *dst,
2526  int32_t dst_stride)
2527 {
      /*
       * Vertical quarter-pel filter for a block of eight output rows. Each
       * filtered row k is averaged first with source row k+1 (the row
       * below) and then with the bytes already present in dst, and the
       * result is written back to dst.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Nine source rows (inp0..inp8) are loaded. Every _8BYTE macro call
       * takes two complete tap sets and so yields two output rows per call
       * (presumably one per 64-bit half of the result - confirm against the
       * macro definition). Taps outside inp0..inp8 are mirrored in the
       * argument lists rather than read from memory.
       */
2528  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2529  v16u8 dst0, dst1, dst2, dst3;
2530  v16u8 tmp0, tmp1, res0, res1;
2531  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2532  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2533  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2534 
2535  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2536  src += (4 * src_stride);
2537  LD_UB2(src, src_stride, inp4, inp5);
2538  src += (2 * src_stride);
      /* Output rows 0 and 1. */
2539  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2540  inp1, inp2, inp3, inp4,
2541  inp1, inp0, inp0, inp1,
2542  inp2, inp3, inp4, inp5,
2543  const20, const6, const3);
2544  LD_UB2(src, src_stride, inp6, inp7);
2545  src += (2 * src_stride);
      /* Output rows 2 and 3. */
2546  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2547  inp3, inp4, inp5, inp6,
2548  inp3, inp2, inp1, inp0,
2549  inp4, inp5, inp6, inp7,
2550  const20, const6, const3);
2551  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
      /* Pair source rows (1,2) and (3,4): one row below each output row. */
2552  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2553  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
      /* Pair the existing dst rows the same way so they line up with res0/1. */
2554  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2555  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
      /* First average: filter result with source; second: with old dst. */
2556  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2557  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2558  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2559  dst += (4 * dst_stride);
2560 
      /* Last source row; output rows 4..7 mirror their lower taps around it. */
2561  inp8 = LD_UB(src);
2562  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2563  inp5, inp6, inp7, inp8,
2564  inp5, inp4, inp3, inp2,
2565  inp6, inp7, inp8, inp8,
2566  const20, const6, const3);
2567  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2568  inp7, inp8, inp8, inp7,
2569  inp7, inp6, inp5, inp4,
2570  inp8, inp8, inp7, inp6,
2571  const20, const6, const3);
2572  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
      /* Source rows (5,6) and (7,8) for output rows 4..7. */
2573  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2574  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2575  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2576  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2577  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2578  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2579  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2580 }
2581 
2583  int32_t src_stride,
2584  uint8_t *dst,
2585  int32_t dst_stride)
2586 {
      /*
       * Vertical quarter-pel filter producing sixteen output rows, two per
       * step. Each filtered row k is averaged first with source row k+1
       * (the row below) and then with the row already stored in dst, and
       * written back to dst.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * Seventeen source rows (inp0..inp16) are loaded, src_stride apart.
       * For output row k the macro receives rows k, k-1, k-2, k-3 and
       * k+1, k+2, k+3, k+4; taps outside inp0..inp16 are replaced by
       * mirrored rows in the argument lists instead of being read.
       */
2587  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2588  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2589  v16u8 res0, res1, dst0, dst1;
2590  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2591  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2592  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2593 
2594  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2595  src += (5 * src_stride);
2596  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2597  inp1, inp2, inp3, inp4,
2598  const20, const6, const3);
2599  inp5 = LD_UB(src);
2600  src += src_stride;
2601  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2602  inp2, inp3, inp4, inp5,
2603  const20, const6, const3);
      /* Rows 0,1: blend with source rows 1,2, then with the old dst rows. */
2604  LD_UB2(dst, dst_stride, dst0, dst1);
2605  AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
2606  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2607  ST_UB2(res0, res1, dst, dst_stride);
2608  dst += (2 * dst_stride);
2609 
2610  inp6 = LD_UB(src);
2611  src += src_stride;
2612  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2613  inp3, inp4, inp5, inp6,
2614  const20, const6, const3);
2615  inp7 = LD_UB(src);
2616  src += src_stride;
2617  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2618  inp4, inp5, inp6, inp7,
2619  const20, const6, const3);
2620  LD_UB2(dst, dst_stride, dst0, dst1);
2621  AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
2622  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2623  ST_UB2(res0, res1, dst, dst_stride);
2624  dst += (2 * dst_stride);
2625 
2626  inp8 = LD_UB(src);
2627  src += src_stride;
2628  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2629  inp5, inp6, inp7, inp8,
2630  const20, const6, const3);
2631  inp9 = LD_UB(src);
2632  src += src_stride;
2633  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2634  inp6, inp7, inp8, inp9,
2635  const20, const6, const3);
2636  LD_UB2(dst, dst_stride, dst0, dst1);
2637  AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
2638  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2639  ST_UB2(res0, res1, dst, dst_stride);
2640  dst += (2 * dst_stride);
2641 
2642  inp10 = LD_UB(src);
2643  src += src_stride;
2644  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2645  inp7, inp8, inp9, inp10,
2646  const20, const6, const3);
2647  inp11 = LD_UB(src);
2648  src += src_stride;
2649  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2650  inp8, inp9, inp10, inp11,
2651  const20, const6, const3);
2652  LD_UB2(dst, dst_stride, dst0, dst1);
2653  AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
2654  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2655  ST_UB2(res0, res1, dst, dst_stride);
2656  dst += (2 * dst_stride);
2657 
2658  inp12 = LD_UB(src);
2659  src += src_stride;
2660  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2661  inp9, inp10, inp11, inp12,
2662  const20, const6, const3);
2663  inp13 = LD_UB(src);
2664  src += src_stride;
2665  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2666  inp10, inp11, inp12, inp13,
2667  const20, const6, const3);
2668  LD_UB2(dst, dst_stride, dst0, dst1);
2669  AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
2670  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2671  ST_UB2(res0, res1, dst, dst_stride);
2672  dst += (2 * dst_stride);
2673 
2674  inp14 = LD_UB(src);
2675  src += src_stride;
2676  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2677  inp11, inp12, inp13, inp14,
2678  const20, const6, const3);
2679  inp15 = LD_UB(src);
2680  src += src_stride;
2681  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2682  inp12, inp13, inp14, inp15,
2683  const20, const6, const3);
2684  LD_UB2(dst, dst_stride, dst0, dst1);
2685  AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2686  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2687  ST_UB2(res0, res1, dst, dst_stride);
2688  dst += (2 * dst_stride);
2689 
      /* Final source row; rows 13..15 mirror their lower taps around it. */
2690  inp16 = LD_UB(src);
2691  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2692  inp13, inp14, inp15, inp16,
2693  const20, const6, const3);
2694  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2695  inp14, inp15, inp16, inp16,
2696  const20, const6, const3);
2697  LD_UB2(dst, dst_stride, dst0, dst1);
2698  AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2699  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2700  ST_UB2(res0, res1, dst, dst_stride);
2701  dst += (2 * dst_stride);
2702 
2703  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2704  inp15, inp16, inp16, inp15,
2705  const20, const6, const3);
2706  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2707  inp16, inp16, inp15, inp14,
2708  const20, const6, const3);
2709  LD_UB2(dst, dst_stride, dst0, dst1);
2710  AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2711  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2712  ST_UB2(res0, res1, dst, dst_stride);
2713 }
2714 
2716  int32_t src_stride,
2717  uint8_t *dst,
2718  int32_t dst_stride,
2719  int32_t height)
2720 {
      /*
       * Horizontal quarter-pel pass (NO_ROUND macro variant): every output
       * row is the filter result averaged, via __msa_ave_u_b, with the
       * vector loaded at the start of that source row.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * The loop runs (height >> 2) times and handles four rows per
       * iteration; one additional row is produced after the loop, so
       * (height >> 2) * 4 + 1 rows are written in total. Any remainder of
       * height modulo 4 is not processed by the loop.
       * NOTE(review): loop_count is 8-bit, so height >> 2 is truncated to
       * 0..255 on assignment - confirm callers stay within that range.
       */
2721  uint8_t loop_count;
2722  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2723  v16u8 res;
      /* Index table listing the 16 byte positions in descending order. */
2724  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
      /* const6/const3 are byte vectors; const20 is a halfword vector here. */
2725  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2726  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2727  v8u16 const20 = (v8u16) __msa_ldi_h(20);
2728 
2729  for (loop_count = (height >> 2); loop_count--;) {
      /* Even-numbered vectors start at src, odd-numbered ones at src + 1. */
2730  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2731  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2732  src += (4 * src_stride);
2733  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2734  const20, const6, const3);
2735  res = __msa_ave_u_b(inp0, res);
2736  ST_UB(res, dst);
2737  dst += dst_stride;
2738 
2739  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2740  const20, const6, const3);
2741  res = __msa_ave_u_b(inp2, res);
2742  ST_UB(res, dst);
2743  dst += dst_stride;
2744 
2745  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2746  const20, const6, const3);
2747  res = __msa_ave_u_b(inp4, res);
2748  ST_UB(res, dst);
2749  dst += dst_stride;
2750 
2751  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2752  const20, const6, const3);
2753  res = __msa_ave_u_b(inp6, res);
2754  ST_UB(res, dst);
2755  dst += dst_stride;
2756  }
2757 
      /*
       * Extra trailing row. The stride argument of 1 presumably makes this
       * load from src and src + 1, matching the two loads inside the loop.
       */
2758  LD_UB2(src, 1, inp0, inp1);
2759  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2760  const20, const6, const3);
2761  res = __msa_ave_u_b(inp0, res);
2762  ST_UB(res, dst);
2763 }
2764 
2766  int32_t src_stride,
2767  uint8_t *dst,
2768  int32_t dst_stride)
2769 {
      /*
       * Two-pass wrapper: a horizontal pass writes into a local buffer, and
       * a vertical pass then reads that buffer and writes to dst.
       * NOTE(review): the signature line carrying the function name and the
       * `src` parameter is absent from this listing.
       *
       * buff holds 272 bytes = 17 rows at the stride of 16 passed to both
       * calls. Presumably the horizontal pass called with height 16 emits
       * 17 rows, which is what a 16-row vertical filter reading 17 source
       * rows would need - confirm against the two callee definitions.
       */
2770  uint8_t buff[272];
2771 
2772  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
2773  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
2774 }
2775 
2777  int32_t src_stride,
2778  uint8_t *dst,
2779  int32_t dst_stride)
2780 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel, with both passes
       * averaged against the unshifted input of that pass.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  Each pair is filtered by
       * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE, then averaged with the two
       * source rows packed into one vector by __msa_ilvr_d.  The even
       * horizN keeps that packed result; the odd horizN+1 is doubleword 1
       * of it, replicated by __msa_splati_d.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows, which are averaged with
       * (horizN, horizN+1) and stored with ST_D2.  At the top and bottom
       * the argument lists repeat horiz0 / horiz8 and neighbouring rows
       * instead of reading rows outside horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
2781  v16u8 inp0, inp1, inp2, inp3;
2782  v16u8 res0, res1, avg0, avg1;
2783  v16u8 horiz0, horiz1, horiz2, horiz3;
2784  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2785  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2786  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2787  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2788  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2789  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2790  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2791  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2792 
      /* Rows 0-1 -> horiz0 / horiz1. */
2793  LD_UB2(src, src_stride, inp0, inp1);
2794  src += (2 * src_stride);
2795  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2796  mask2, mask3, const20,
2797  const6, const3);
2798  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2799  horiz0 = __msa_ave_u_b(inp0, res0);
2800  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
2801  LD_UB2(src, src_stride, inp2, inp3);
2802  src += (2 * src_stride);
2803  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2804  mask2, mask3, const20,
2805  const6, const3);
2806  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2807  horiz2 = __msa_ave_u_b(inp2, res1);
2808  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
2809  LD_UB2(src, src_stride, inp0, inp1);
2810  src += (2 * src_stride);
2811  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2812  mask2, mask3, const20,
2813  const6, const3);
2814  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2815  horiz4 = __msa_ave_u_b(inp0, res0);
2816  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
2817  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2818  horiz1, horiz2, horiz3, horiz4,
2819  horiz1, horiz0, horiz0, horiz1,
2820  horiz2, horiz3, horiz4, horiz5,
2821  const20, const6, const3);
2822  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2823  res0 = __msa_ave_u_b(avg0, res0);
2824  ST_D2(res0, 0, 1, dst, dst_stride);
2825  dst += (2 * dst_stride);
2826 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
2827  LD_UB2(src, src_stride, inp2, inp3);
2828  src += (2 * src_stride);
2829  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2830  mask2, mask3, const20,
2831  const6, const3);
2832  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2833  horiz6 = __msa_ave_u_b(inp2, res1);
2834  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2835  inp0 = LD_UB(src);
2836  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2837  mask2, mask3, const20,
2838  const6, const3);
2839  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3 (res1) and the filter result for rows 4-5 (res0). */
2840  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2841  horiz3, horiz4, horiz5, horiz6,
2842  horiz3, horiz2, horiz1, horiz0,
2843  horiz4, horiz5, horiz6, horiz7,
2844  const20, const6, const3);
2845  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2846  res1 = __msa_ave_u_b(avg1, res1);
2847  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2848  horiz5, horiz6, horiz7, horiz8,
2849  horiz5, horiz4, horiz3, horiz2,
2850  horiz6, horiz7, horiz8, horiz8,
2851  const20, const6, const3);
2852  ST_D2(res1, 0, 1, dst, dst_stride);
2853  dst += 2 * dst_stride;
2854 
      /* Output rows 4-5 (res0) and the filter result for rows 6-7 (res1). */
2855  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
2856  res0 = __msa_ave_u_b(avg0, res0);
2857  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
2858  horiz7, horiz8, horiz8, horiz7,
2859  horiz7, horiz6, horiz5, horiz4,
2860  horiz8, horiz8, horiz7, horiz6,
2861  const20, const6, const3);
2862  ST_D2(res0, 0, 1, dst, dst_stride);
2863  dst += 2 * dst_stride;
2864 
      /* Output rows 6-7. */
2865  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
2866  res1 = __msa_ave_u_b(avg1, res1);
2867  ST_D2(res1, 0, 1, dst, dst_stride);
2868 }
2869 
2871  int32_t src_stride,
2872  uint8_t *dst,
2873  int32_t dst_stride,
2874  int32_t height)
2875 {
      /*
       * Horizontal no-round qpel pass over 16-byte-wide rows, with no
       * averaging step: the output of APPLY_HORIZ_QPEL_NO_ROUND_FILTER is
       * stored to dst as is.
       *
       * Four rows are handled per loop iteration (vectors loaded from src
       * and from src + 1), and one more row is processed after the loop,
       * so (height >> 2) * 4 + 1 rows are written in total.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract; the name is inferred from the call sites.
       */
2876  uint8_t loop_count;
2877  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2878  v16u8 res;
2879  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2880  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2881  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2882  v8u16 const20 = (v8u16) __msa_ldi_h(20);
2883 
      /* loop_count is 8 bits wide, so height >> 2 has to fit in 0..255. */
2884  for (loop_count = (height >> 2); loop_count--;) {
2885  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2886  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2887  src += (4 * src_stride);
2888  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2889  const20, const6, const3);
2890  ST_UB(res, dst);
2891  dst += dst_stride;
2892 
2893  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2894  const20, const6, const3);
2895  ST_UB(res, dst);
2896  dst += dst_stride;
2897 
2898  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2899  const20, const6, const3);
2900  ST_UB(res, dst);
2901  dst += dst_stride;
2902 
2903  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2904  const20, const6, const3);
2905  ST_UB(res, dst);
2906  dst += dst_stride;
2907  }
2908 
      /* Trailing extra row: loaded from src and src + 1 (LD_UB2 stride 1). */
2909  LD_UB2(src, 1, inp0, inp1);
2910  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2911  const20, const6, const3);
2912  ST_UB(res, dst);
2913 }
2914 
2916  int32_t src_stride,
2917  uint8_t *dst,
2918  int32_t dst_stride)
2919 {
      /*
       * Two-pass 16x16 variant: the plain horizontal pass writes into a
       * stack buffer with a 16-byte stride, then the vertical
       * "aver_src0" pass reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
2920  uint8_t buff[272];
2921 
2922  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
2923  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
2924 }
2925 
2927  int32_t src_stride,
2928  uint8_t *dst,
2929  int32_t dst_stride)
2930 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel, where only the
       * vertical pass is averaged with its unshifted input.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  The output of APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE
       * is used directly as the even horizN; the odd horizN+1 is
       * doubleword 1 of it, replicated by __msa_splati_d.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows, which are averaged (__msa_ave_u_b)
       * with (horizN, horizN+1) and stored with ST_D2.  At the top and
       * bottom the argument lists repeat horiz0 / horiz8 and neighbouring
       * rows instead of reading rows outside horiz0..horiz8.
       *
       * Each res0/res1 is averaged exactly once, immediately before its
       * store.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
2931  v16u8 inp0, inp1, inp2, inp3;
2932  v16u8 res0, res1, avg0, avg1;
2933  v16u8 horiz0, horiz1, horiz2, horiz3;
2934  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2935  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2936  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2937  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2938  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2939  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2940  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2941  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2942 
      /* Rows 0-1 -> horiz0 / horiz1. */
2943  LD_UB2(src, src_stride, inp0, inp1);
2944  src += (2 * src_stride);
2945  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2946  mask2, mask3, const20,
2947  const6, const3);
2948  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2949 
      /* Rows 2-3 -> horiz2 / horiz3. */
2950  LD_UB2(src, src_stride, inp2, inp3);
2951  src += (2 * src_stride);
2952  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2953  mask2, mask3, const20,
2954  const6, const3);
2955  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
2956  LD_UB2(src, src_stride, inp0, inp1);
2957  src += (2 * src_stride);
2958  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2959  mask2, mask3, const20,
2960  const6, const3);
2961  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
2962  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2963  horiz1, horiz2, horiz3, horiz4,
2964  horiz1, horiz0, horiz0, horiz1,
2965  horiz2, horiz3, horiz4, horiz5,
2966  const20, const6, const3);
2967  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2968  res0 = __msa_ave_u_b(avg0, res0);
2969  ST_D2(res0, 0, 1, dst, dst_stride);
2970  dst += (2 * dst_stride);
2971 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
2972  LD_UB2(src, src_stride, inp2, inp3);
2973  src += (2 * src_stride);
2974  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2975  mask2, mask3, const20,
2976  const6, const3);
2977  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2978  inp0 = LD_UB(src);
2979  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2980  mask2, mask3, const20,
2981  const6, const3);
      /* Output rows 2-3. */
2982  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2983  horiz3, horiz4, horiz5, horiz6,
2984  horiz3, horiz2, horiz1, horiz0,
2985  horiz4, horiz5, horiz6, horiz7,
2986  const20, const6, const3);
2987  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2988  res1 = __msa_ave_u_b(avg1, res1);
2991  ST_D2(res1, 0, 1, dst, dst_stride);
2992  dst += (2 * dst_stride);
2993 
      /* Output rows 4-5. */
2994  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2995  horiz5, horiz6, horiz7, horiz8,
2996  horiz5, horiz4, horiz3, horiz2,
2997  horiz6, horiz7, horiz8, horiz8,
2998  const20, const6, const3);
2999  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3000  res0 = __msa_ave_u_b(avg0, res0);
3001  ST_D2(res0, 0, 1, dst, dst_stride);
3002  dst += (2 * dst_stride);
3003 
      /* Output rows 6-7. */
3004  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3005  horiz7, horiz8, horiz8, horiz7,
3006  horiz7, horiz6, horiz5, horiz4,
3007  horiz8, horiz8, horiz7, horiz6,
3008  const20, const6, const3);
3009  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3010  res1 = __msa_ave_u_b(avg1, res1);
3011  ST_D2(res1, 0, 1, dst, dst_stride);
3012 }
3013 
3015  int32_t src_stride,
3016  uint8_t *dst,
3017  int32_t dst_stride,
3018  int32_t height)
3019 {
      /*
       * Horizontal no-round qpel pass over 16-byte-wide rows, averaged with
       * the source row shifted by one byte.
       *
       * Each loop iteration handles four rows: one vector per row is loaded
       * from src and one from src + 1, the pair goes through
       * APPLY_HORIZ_QPEL_NO_ROUND_FILTER, and the filter output is combined
       * with the (src + 1)-aligned vector (inp1/inp3/inp5/inp7) by
       * __msa_ave_u_b before being stored to dst.  After the loop one more
       * row is processed, so (height >> 2) * 4 + 1 rows are written in
       * total.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract; the name is inferred from the call sites.
       */
3020  uint8_t loop_count;
3021  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3022  v16u8 res;
3023  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3024  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3025  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3026  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3027 
      /* loop_count is 8 bits wide, so height >> 2 has to fit in 0..255. */
3028  for (loop_count = (height >> 2); loop_count--;) {
3029  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3030  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3031  src += (4 * src_stride);
3032  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3033  const20, const6, const3);
3034  res = __msa_ave_u_b(res, inp1);
3035  ST_UB(res, dst);
3036  dst += dst_stride;
3037 
3038  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
3039  const20, const6, const3);
3040  res = __msa_ave_u_b(res, inp3);
3041  ST_UB(res, dst);
3042  dst += dst_stride;
3043 
3044  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
3045  const20, const6, const3);
3046  res = __msa_ave_u_b(res, inp5);
3047  ST_UB(res, dst);
3048  dst += dst_stride;
3049 
3050  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
3051  const20, const6, const3);
3052  res = __msa_ave_u_b(res, inp7);
3053  ST_UB(res, dst);
3054  dst += dst_stride;
3055  }
3056 
      /* Trailing extra row: loaded from src and src + 1 (LD_UB2 stride 1). */
3057  LD_UB2(src, 1, inp0, inp1);
3058  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3059  const20, const6, const3);
3060  res = __msa_ave_u_b(inp1, res);
3061  ST_UB(res, dst);
3062 }
3063 
3065  int32_t src_stride,
3066  uint8_t *dst,
3067  int32_t dst_stride)
3068 {
      /*
       * Two-pass 16x16 variant: the horizontal "src1" pass writes into a
       * stack buffer with a 16-byte stride, then the vertical
       * "aver_src0" pass reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3069  uint8_t buff[272];
3070 
3071  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3072  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3073 }
3074 
3076  int32_t src_stride,
3077  uint8_t *dst,
3078  int32_t dst_stride)
3079 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel.  The horizontal
       * pass is averaged with the source shifted by one byte; the vertical
       * pass is averaged with its unshifted input.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  After filtering, each source vector is slid by one
       * byte (SLDI_B2_UB / __msa_sldi_b with count 1), the two rows are
       * merged with __msa_insve_d, and the merged vector is averaged with
       * the filter output.  The even horizN keeps that result; the odd
       * horizN+1 is doubleword 1 of it, replicated by __msa_splati_d.
       * NOTE(review): the one-byte slide presumably aligns byte 0 with the
       * src + 1 sample - confirm against the SLDI_B2_UB definition.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows, which are averaged with
       * (horizN, horizN+1) and stored with ST_D2.  At the top and bottom
       * the argument lists repeat horiz0 / horiz8 and neighbouring rows
       * instead of reading rows outside horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
3080  v16u8 inp0, inp1, inp2, inp3;
3081  v16u8 res0, res1, avg0, avg1;
3082  v16u8 horiz0, horiz1, horiz2, horiz3;
3083  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3084  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3085  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3086  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3087  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3088  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3089  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3090  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3091 
      /* Rows 0-1 -> horiz0 / horiz1. */
3092  LD_UB2(src, src_stride, inp0, inp1);
3093  src += (2 * src_stride);
3094  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3095  mask2, mask3, const20,
3096  const6, const3);
3097  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3098 
3099  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3100  horiz0 = __msa_ave_u_b(inp0, res0);
3101  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
3102  LD_UB2(src, src_stride, inp2, inp3);
3103  src += (2 * src_stride);
3104  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3105  mask2, mask3, const20,
3106  const6, const3);
3107  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3108 
3109  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3110  horiz2 = __msa_ave_u_b(inp2, res1);
3111  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
3112  LD_UB2(src, src_stride, inp0, inp1);
3113  src += (2 * src_stride);
3114  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3115  mask2, mask3, const20,
3116  const6, const3);
3117  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3118 
3119  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3120  horiz4 = __msa_ave_u_b(inp0, res0);
3121  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
3122  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3123  horiz1, horiz2, horiz3, horiz4,
3124  horiz1, horiz0, horiz0, horiz1,
3125  horiz2, horiz3, horiz4, horiz5,
3126  const20, const6, const3);
3127  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3128  res0 = __msa_ave_u_b(avg0, res0);
3129  ST_D2(res0, 0, 1, dst, dst_stride);
3130  dst += (2 * dst_stride);
3131 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
3132  LD_UB2(src, src_stride, inp2, inp3);
3133  src += (2 * src_stride);
3134  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3135  mask2, mask3, const20,
3136  const6, const3);
3137  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3138 
3139  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3140  horiz6 = __msa_ave_u_b(inp2, res1);
3141  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3142  inp0 = LD_UB(src);
3143  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3144  mask2, mask3, const20,
3145  const6, const3);
3146  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3147  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3. */
3148  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3149  horiz3, horiz4, horiz5, horiz6,
3150  horiz3, horiz2, horiz1, horiz0,
3151  horiz4, horiz5, horiz6, horiz7,
3152  const20, const6, const3);
3153  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3154  res1 = __msa_ave_u_b(avg1, res1);
3155  ST_D2(res1, 0, 1, dst, dst_stride);
3156  dst += (2 * dst_stride);
3157 
      /* Output rows 4-5. */
3158  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3159  horiz5, horiz6, horiz7, horiz8,
3160  horiz5, horiz4, horiz3, horiz2,
3161  horiz6, horiz7, horiz8, horiz8,
3162  const20, const6, const3);
3163  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3164  res0 = __msa_ave_u_b(avg0, res0);
3165  ST_D2(res0, 0, 1, dst, dst_stride);
3166  dst += (2 * dst_stride);
3167 
      /* Output rows 6-7. */
3168  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3169  horiz7, horiz8, horiz8, horiz7,
3170  horiz7, horiz6, horiz5, horiz4,
3171  horiz8, horiz8, horiz7, horiz6,
3172  const20, const6, const3);
3173  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3174  res1 = __msa_ave_u_b(avg1, res1);
3175  ST_D2(res1, 0, 1, dst, dst_stride);
3176 }
3177 
3179  int32_t src_stride,
3180  uint8_t *dst,
3181  int32_t dst_stride)
3182 {
      /*
       * Two-pass 16x16 variant: the horizontal "src0" pass writes into a
       * stack buffer with a 16-byte stride, then the plain vertical pass
       * reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3183  uint8_t buff[272];
3184 
3185  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3186  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3187 }
3188 
3190  int32_t src_stride,
3191  uint8_t *dst,
3192  int32_t dst_stride)
3193 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel, where only the
       * horizontal pass is averaged with its unshifted source rows.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  Each pair is filtered by
       * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE, then averaged
       * (__msa_ave_u_b) with the two source rows packed into one vector by
       * __msa_ilvr_d.  The even horizN keeps that result; the odd horizN+1
       * is doubleword 1 of it, replicated by __msa_splati_d.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows that are stored without further
       * averaging (ST_D2 for rows 0-1, ST_D4 for rows 2-5, ST_D2 for rows
       * 6-7).  At the top and bottom the argument lists repeat horiz0 /
       * horiz8 and neighbouring rows instead of reading rows outside
       * horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
3194  v16u8 inp0, inp1, inp2, inp3;
3195  v16u8 res0, res1;
3196  v16u8 horiz0, horiz1, horiz2, horiz3;
3197  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3198  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3199  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3200  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3201  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3202  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3203  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3204  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3205 
      /* Rows 0-1 -> horiz0 / horiz1. */
3206  LD_UB2(src, src_stride, inp0, inp1);
3207  src += (2 * src_stride);
3208  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3209  mask2, mask3, const20,
3210  const6, const3);
3211  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3212  horiz0 = __msa_ave_u_b(inp0, res0);
3213  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
3214  LD_UB2(src, src_stride, inp2, inp3);
3215  src += (2 * src_stride);
3216  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3217  mask2, mask3, const20,
3218  const6, const3);
3219  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3220  horiz2 = __msa_ave_u_b(inp2, res1);
3221  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
3222  LD_UB2(src, src_stride, inp0, inp1);
3223  src += (2 * src_stride);
3224  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3225  mask2, mask3, const20,
3226  const6, const3);
3227  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3228  horiz4 = __msa_ave_u_b(inp0, res0);
3229  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
3230  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3231  horiz1, horiz2, horiz3, horiz4,
3232  horiz1, horiz0, horiz0, horiz1,
3233  horiz2, horiz3, horiz4, horiz5,
3234  const20, const6, const3);
3235 
3236  LD_UB2(src, src_stride, inp2, inp3);
3237  src += (2 * src_stride);
3238  ST_D2(res0, 0, 1, dst, dst_stride);
3239  dst += 2 * dst_stride;
3240 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
3241  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3242  mask2, mask3, const20,
3243  const6, const3);
3244  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3245  horiz6 = __msa_ave_u_b(inp2, res1);
3246  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3247  inp0 = LD_UB(src);
3248  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3249  mask2, mask3, const20,
3250  const6, const3);
3251  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3 (res1) and 4-5 (res0), stored together. */
3252  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3253  horiz3, horiz4, horiz5, horiz6,
3254  horiz3, horiz2, horiz1, horiz0,
3255  horiz4, horiz5, horiz6, horiz7,
3256  const20, const6, const3);
3257  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3258  horiz5, horiz6, horiz7, horiz8,
3259  horiz5, horiz4, horiz3, horiz2,
3260  horiz6, horiz7, horiz8, horiz8,
3261  const20, const6, const3);
3262  ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
3263  dst += (4 * dst_stride);
3264 
      /* Output rows 6-7. */
3265  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3266  horiz7, horiz8, horiz8, horiz7,
3267  horiz7, horiz6, horiz5, horiz4,
3268  horiz8, horiz8, horiz7, horiz6,
3269  const20, const6, const3);
3270  ST_D2(res1, 0, 1, dst, dst_stride);
3271 }
3272 
3274  int32_t src_stride,
3275  uint8_t *dst,
3276  int32_t dst_stride)
3277 {
      /*
       * Two-pass 16x16 variant with no averaging helpers: the plain
       * horizontal pass writes into a stack buffer with a 16-byte stride,
       * then the plain vertical pass reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3278  uint8_t buff[272];
3279 
3280  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3281  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3282 }
3283 
3285  int32_t src_stride,
3286  uint8_t *dst,
3287  int32_t dst_stride)
3288 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel with no averaging
       * in either pass.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  The output of APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE
       * is used directly as the even horizN; the odd horizN+1 is
       * doubleword 1 of it, replicated by __msa_splati_d.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows (ST_D2 for rows 0-1 and 2-3, ST_D4 for
       * rows 4-7).  At the top and bottom the argument lists repeat
       * horiz0 / horiz8 and neighbouring rows instead of reading rows
       * outside horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
3289  v16u8 inp0, inp1, inp2, inp3;
3290  v16u8 res0, res1;
3291  v16u8 horiz0, horiz1, horiz2, horiz3;
3292  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3293  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3294  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3295  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3296  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3297  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3298  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3299  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3300 
      /* Rows 0-1 -> horiz0 / horiz1. */
3301  LD_UB2(src, src_stride, inp0, inp1);
3302  src += (2 * src_stride);
3303  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3304  mask2, mask3, const20,
3305  const6, const3);
3306  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
3307  LD_UB2(src, src_stride, inp2, inp3);
3308  src += (2 * src_stride);
3309  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3310  mask2, mask3, const20,
3311  const6, const3);
3312  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
3313  LD_UB2(src, src_stride, inp0, inp1);
3314  src += (2 * src_stride);
3315  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3316  mask2, mask3, const20,
3317  const6, const3);
3318  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
3319  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3320  horiz1, horiz2, horiz3, horiz4,
3321  horiz1, horiz0, horiz0, horiz1,
3322  horiz2, horiz3, horiz4, horiz5,
3323  const20, const6, const3);
3324  LD_UB2(src, src_stride, inp2, inp3);
3325  src += (2 * src_stride);
3326  ST_D2(res0, 0, 1, dst, dst_stride);
3327  dst += 2 * dst_stride;
3328 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
3329  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3330  mask2, mask3, const20,
3331  const6, const3);
3332  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3333  inp0 = LD_UB(src);
3334  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3335  mask2, mask3, const20,
3336  const6, const3);
      /* Output rows 2-3 (res1, stored now) and 4-5 (res0, stored below). */
3337  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3338  horiz3, horiz4, horiz5, horiz6,
3339  horiz3, horiz2, horiz1, horiz0,
3340  horiz4, horiz5, horiz6, horiz7,
3341  const20, const6, const3);
3342  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3343  horiz5, horiz6, horiz7, horiz8,
3344  horiz5, horiz4, horiz3, horiz2,
3345  horiz6, horiz7, horiz8, horiz8,
3346  const20, const6, const3);
3347  ST_D2(res1, 0, 1, dst, dst_stride);
3348  dst += 2 * dst_stride;
3349 
3350 
      /* Output rows 6-7 (res1); rows 4-7 are stored together. */
3351  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3352  horiz7, horiz8, horiz8, horiz7,
3353  horiz7, horiz6, horiz5, horiz4,
3354  horiz8, horiz8, horiz7, horiz6,
3355  const20, const6, const3);
3356  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3357 }
3358 
3360  int32_t src_stride,
3361  uint8_t *dst,
3362  int32_t dst_stride)
3363 {
      /*
       * Two-pass 16x16 variant: the horizontal "src1" pass writes into a
       * stack buffer with a 16-byte stride, then the plain vertical pass
       * reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3364  uint8_t buff[272];
3365 
3366  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3367  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3368 }
3369 
3371  int32_t src_stride,
3372  uint8_t *dst,
3373  int32_t dst_stride)
3374 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel, where only the
       * horizontal pass is averaged, against the source shifted by one
       * byte.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  After filtering, each source vector is slid by one
       * byte (SLDI_B2_UB / __msa_sldi_b with count 1), the two rows are
       * merged with __msa_insve_d, and the merged vector is averaged
       * (__msa_ave_u_b) with the filter output.  The even horizN keeps
       * that result; the odd horizN+1 is doubleword 1 of it, replicated by
       * __msa_splati_d.
       * NOTE(review): the one-byte slide presumably aligns byte 0 with the
       * src + 1 sample - confirm against the SLDI_B2_UB definition.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows that are stored without further
       * averaging (ST_D2 for rows 0-1 and 2-3, ST_D4 for rows 4-7).  At
       * the top and bottom the argument lists repeat horiz0 / horiz8 and
       * neighbouring rows instead of reading rows outside horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
3375  v16u8 inp0, inp1, inp2, inp3;
3376  v16u8 res0, res1;
3377  v16u8 horiz0, horiz1, horiz2, horiz3;
3378  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3379  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3380  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3381  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3382  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3383  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3384  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3385  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3386 
      /* Rows 0-1 -> horiz0 / horiz1. */
3387  LD_UB2(src, src_stride, inp0, inp1);
3388  src += (2 * src_stride);
3389  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3390  mask2, mask3, const20,
3391  const6, const3);
3392  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3393 
3394  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3395  horiz0 = __msa_ave_u_b(inp0, res0);
3396  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
3397  LD_UB2(src, src_stride, inp2, inp3);
3398  src += (2 * src_stride);
3399  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3400  mask2, mask3, const20,
3401  const6, const3);
3402  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3403 
3404  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3405  horiz2 = __msa_ave_u_b(inp2, res1);
3406  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
3407  LD_UB2(src, src_stride, inp0, inp1);
3408  src += (2 * src_stride);
3409  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3410  mask2, mask3, const20,
3411  const6, const3);
3412  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3413 
3414  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3415  horiz4 = __msa_ave_u_b(inp0, res0);
3416  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1. */
3417  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3418  horiz1, horiz2, horiz3, horiz4,
3419  horiz1, horiz0, horiz0, horiz1,
3420  horiz2, horiz3, horiz4, horiz5,
3421  const20, const6, const3);
3422  LD_UB2(src, src_stride, inp2, inp3);
3423  src += (2 * src_stride);
3424  ST_D2(res0, 0, 1, dst, dst_stride);
3425  dst += 2 * dst_stride;
3426 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
3427  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3428  mask2, mask3, const20,
3429  const6, const3);
3430  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3431 
3432  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3433  horiz6 = __msa_ave_u_b(inp2, res1);
3434  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3435  inp0 = LD_UB(src);
3436  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3437  mask2, mask3, const20,
3438  const6, const3);
3439  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3440  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3 (res1, stored now) and 4-5 (res0, stored below). */
3441  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3442  horiz3, horiz4, horiz5, horiz6,
3443  horiz3, horiz2, horiz1, horiz0,
3444  horiz4, horiz5, horiz6, horiz7,
3445  const20, const6, const3);
3446  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3447  horiz5, horiz6, horiz7, horiz8,
3448  horiz5, horiz4, horiz3, horiz2,
3449  horiz6, horiz7, horiz8, horiz8,
3450  const20, const6, const3);
3451  ST_D2(res1, 0, 1, dst, dst_stride);
3452  dst += 2 * dst_stride;
3453 
      /* Output rows 6-7 (res1); rows 4-7 are stored together. */
3454  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3455  horiz7, horiz8, horiz8, horiz7,
3456  horiz7, horiz6, horiz5, horiz4,
3457  horiz8, horiz8, horiz7, horiz6,
3458  const20, const6, const3);
3459  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3460 }
3461 
3463  int32_t src_stride,
3464  uint8_t *dst,
3465  int32_t dst_stride)
3466 {
      /*
       * Two-pass 16x16 variant: the horizontal "src0" pass writes into a
       * stack buffer with a 16-byte stride, then the vertical
       * "aver_src1" pass reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3467  uint8_t buff[272];
3468 
3469  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3470  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3471 }
3472 
3474  int32_t src_stride,
3475  uint8_t *dst,
3476  int32_t dst_stride)
3477 {
      /*
       * 8-byte-wide horizontal + vertical no-round qpel.  The horizontal
       * pass is averaged with the unshifted source rows; the vertical pass
       * is averaged with the intermediate rows one row further down.
       *
       * Horizontal pass: nine source rows are read (four LD_UB2 pairs plus
       * one LD_UB).  Each pair is filtered by
       * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE, then averaged
       * (__msa_ave_u_b) with the two source rows packed into one vector by
       * __msa_ilvr_d.  The even horizN keeps that result; the odd horizN+1
       * is doubleword 1 of it, replicated by __msa_splati_d.
       *
       * Vertical pass: four APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE calls,
       * each giving two output rows.  The averaging partner is built from
       * (horizN+1, horizN+2) - i.e. (1,2), (3,4), (5,6), (7,8) - before
       * ST_D2 stores the result.  At the top and bottom the argument lists
       * repeat horiz0 / horiz8 and neighbouring rows instead of reading
       * rows outside horiz0..horiz8.
       *
       * NOTE(review): the first line of the signature is not visible in this
       * extract.
       */
3478  v16u8 inp0, inp1, inp2, inp3;
3479  v16u8 res0, res1, avg0, avg1;
3480  v16u8 horiz0, horiz1, horiz2, horiz3;
3481  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3482  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3483  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3484  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3485  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3486  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3487  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3488  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3489 
      /* Rows 0-1 -> horiz0 / horiz1. */
3490  LD_UB2(src, src_stride, inp0, inp1);
3491  src += (2 * src_stride);
3492  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3493  mask2, mask3, const20,
3494  const6, const3);
3495  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3496  horiz0 = __msa_ave_u_b(inp0, res0);
3497  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3 -> horiz2 / horiz3. */
3498  LD_UB2(src, src_stride, inp2, inp3);
3499  src += (2 * src_stride);
3500  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3501  mask2, mask3, const20,
3502  const6, const3);
3503  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3504  horiz2 = __msa_ave_u_b(inp2, res1);
3505  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5 -> horiz4 / horiz5. */
3506  LD_UB2(src, src_stride, inp0, inp1);
3507  src += (2 * src_stride);
3508  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3509  mask2, mask3, const20,
3510  const6, const3);
3511  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3512  horiz4 = __msa_ave_u_b(inp0, res0);
3513  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Output rows 0-1, averaged with (horiz1, horiz2). */
3514  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3515  horiz1, horiz2, horiz3, horiz4,
3516  horiz1, horiz0, horiz0, horiz1,
3517  horiz2, horiz3, horiz4, horiz5,
3518  const20, const6, const3);
3519  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3520  res0 = __msa_ave_u_b(avg0, res0);
3521  ST_D2(res0, 0, 1, dst, dst_stride);
3522  dst += (2 * dst_stride);
3523 
      /* Rows 6-7 -> horiz6 / horiz7, then the single ninth row -> horiz8. */
3524  LD_UB2(src, src_stride, inp2, inp3);
3525  src += (2 * src_stride);
3526  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3527  mask2, mask3, const20,
3528  const6, const3);
3529  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3530  horiz6 = __msa_ave_u_b(inp2, res1);
3531  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3532  inp0 = LD_UB(src);
3533  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3534  mask2, mask3, const20,
3535  const6, const3);
3536  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3, averaged with (horiz3, horiz4). */
3537  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3538  horiz3, horiz4, horiz5, horiz6,
3539  horiz3, horiz2, horiz1, horiz0,
3540  horiz4, horiz5, horiz6, horiz7,
3541  const20, const6, const3);
3542  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3543  res1 = __msa_ave_u_b(avg1, res1);
3544  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3545  horiz5, horiz6, horiz7, horiz8,
3546  horiz5, horiz4, horiz3, horiz2,
3547  horiz6, horiz7, horiz8, horiz8,
3548  const20, const6, const3);
3549  ST_D2(res1, 0, 1, dst, dst_stride);
3550  dst += 2 * dst_stride;
3551 
      /* Output rows 4-5, averaged with (horiz5, horiz6). */
3552  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3553  res0 = __msa_ave_u_b(avg0, res0);
3554 
3555  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3556  horiz7, horiz8, horiz8, horiz7,
3557  horiz7, horiz6, horiz5, horiz4,
3558  horiz8, horiz8, horiz7, horiz6,
3559  const20, const6, const3);
3560  ST_D2(res0, 0, 1, dst, dst_stride);
3561  dst += 2 * dst_stride;
3562 
      /* Output rows 6-7, averaged with (horiz7, horiz8). */
3563  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3564  res1 = __msa_ave_u_b(avg1, res1);
3565  ST_D2(res1, 0, 1, dst, dst_stride);
3566 }
3567 
3569  int32_t src_stride,
3570  uint8_t *dst,
3571  int32_t dst_stride)
3572 {
      /*
       * Two-pass 16x16 variant: the plain horizontal pass writes into a
       * stack buffer with a 16-byte stride, then the vertical
       * "aver_src1" pass reads that buffer and writes dst.
       *
       * 272 bytes = 17 rows * 16 bytes.  NOTE(review): this matches a
       * horizontal helper that emits height + 1 rows for height == 16;
       * confirm against the helper's definition.
       */
3573  uint8_t buff[272];
3574 
3575  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3576  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3577 }
3578 
3580  int32_t src_stride,
3581  uint8_t *dst,
3582  int32_t dst_stride)
3583 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. horiz0..horiz8 receive nine
       * horizontally filtered rows (no-round macro variants); the vertical
       * macro is then invoked four times and each result is combined with a
       * truncating average (__msa_ave_u_b) against a pair of filtered rows
       * before being stored.
       */
3584  v16u8 inp0, inp1, inp2, inp3;
3585  v16u8 res0, res1, avg0, avg1;
3586  v16u8 horiz0, horiz1, horiz2, horiz3;
3587  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
3588  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3589  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3590  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3591  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
3592  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3593  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3594  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3595 
      /* Each two-row macro call fills an even-numbered horizN; splati_d(.., 1)
       * copies its upper 64-bit lane into the following odd-numbered one
       * (presumably the second filtered row -- confirm against the macro). */
3596  LD_UB2(src, src_stride, inp0, inp1);
3597  src += (2 * src_stride);
3598  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3599  mask2, mask3, const20,
3600  const6, const3);
3601  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3602  LD_UB2(src, src_stride, inp2, inp3);
3603  src += (2 * src_stride);
3604  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3605  mask2, mask3, const20,
3606  const6, const3);
3607  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3608  LD_UB2(src, src_stride, inp0, inp1);
3609  src += (2 * src_stride);
3610  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3611  mask2, mask3, const20,
3612  const6, const3);
3613  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
3614  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3615  horiz1, horiz2, horiz3, horiz4,
3616  horiz1, horiz0, horiz0, horiz1,
3617  horiz2, horiz3, horiz4, horiz5,
3618  const20, const6, const3);
      /* avg0: low lane from horiz1, high lane from horiz2. */
3619  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3620  res0 = __msa_ave_u_b(avg0, res0);
3621  LD_UB2(src, src_stride, inp2, inp3);
3622  src += (2 * src_stride);
3623  ST_D2(res0, 0, 1, dst, dst_stride);
3624  dst += 2 * dst_stride;
3625 
3626  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3627  mask2, mask3, const20,
3628  const6, const3);
3629  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3630  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3631  horiz3, horiz4, horiz5, horiz6,
3632  horiz3, horiz2, horiz1, horiz0,
3633  horiz4, horiz5, horiz6, horiz7,
3634  const20, const6, const3);
      /* avg1: low lane from horiz3, high lane from horiz4. */
3635  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3636  res1 = __msa_ave_u_b(avg1, res1);
      /* Ninth input row, filtered with the single-row macro variant. */
3637  inp0 = LD_UB(src);
3638  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3639  mask2, mask3, const20,
3640  const6, const3);
3641  ST_D2(res1, 0, 1, dst, dst_stride);
3642  dst += 2 * dst_stride;
3643 
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
3644  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3645  horiz5, horiz6, horiz7, horiz8,
3646  horiz5, horiz4, horiz3, horiz2,
3647  horiz6, horiz7, horiz8, horiz8,
3648  const20, const6, const3);
3649  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3650  res0 = __msa_ave_u_b(avg0, res0);
3651  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3652  horiz7, horiz8, horiz8, horiz7,
3653  horiz7, horiz6, horiz5, horiz4,
3654  horiz8, horiz8, horiz7, horiz6,
3655  const20, const6, const3);
3656  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3657  res1 = __msa_ave_u_b(avg1, res1);
3658  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3659 }
3660 
3662  int32_t src_stride,
3663  uint8_t *dst,
3664  int32_t dst_stride)
3665 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- helper not visible, confirm. */
3666  uint8_t buff[272];
3667 
3668  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3669  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3670 }
3671 
3673  int32_t src_stride,
3674  uint8_t *dst,
3675  int32_t dst_stride)
3676 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. Each horizontally filtered
       * row pair (no-round macro) is combined by a truncating average
       * (__msa_ave_u_b) with the input rows after SLDI_B2_UB / __msa_sldi_b
       * with a count of 1, giving horiz0..horiz8. The vertical no-round
       * macro results are then truncating-averaged with pairs of those rows
       * and stored.
       */
3677  v16u8 inp0, inp1, inp2, inp3;
3678  v16u8 res0, res1, avg0, avg1;
3679  v16u8 horiz0, horiz1, horiz2, horiz3;
3680  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
3681  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3682  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3683  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3684  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
3685  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3686  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3687  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3688 
3689  LD_UB2(src, src_stride, inp0, inp1);
3690  src += (2 * src_stride);
3691  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3692  mask2, mask3, const20,
3693  const6, const3);
      /* Presumably slides both input rows by one byte (macro not visible);
       * insve_d then copies lane 0 of inp1 into lane 1 of inp0 so one vector
       * holds both rows for the average. */
3694  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3695 
3696  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3697  horiz0 = __msa_ave_u_b(inp0, res0);
3698  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3699  LD_UB2(src, src_stride, inp2, inp3);
3700  src += (2 * src_stride);
3701  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3702  mask2, mask3, const20,
3703  const6, const3);
3704  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3705 
3706  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3707  horiz2 = __msa_ave_u_b(inp2, res1);
3708  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3709  LD_UB2(src, src_stride, inp0, inp1);
3710  src += (2 * src_stride);
3711  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3712  mask2, mask3, const20,
3713  const6, const3);
3714 
3715  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3716  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3717  horiz4 = __msa_ave_u_b(inp0, res0);
3718  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
3719  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3720  horiz1, horiz2, horiz3, horiz4,
3721  horiz1, horiz0, horiz0, horiz1,
3722  horiz2, horiz3, horiz4, horiz5,
3723  const20, const6, const3);
      /* avg0: low lane from horiz1, high lane from horiz2. */
3724  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3725  res0 = __msa_ave_u_b(avg0, res0);
3726  ST_D2(res0, 0, 1, dst, dst_stride);
3727  dst += (2 * dst_stride);
3728 
3729  LD_UB2(src, src_stride, inp2, inp3);
3730  src += (2 * src_stride);
3731  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3732  mask2, mask3, const20,
3733  const6, const3);
3734  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3735 
3736  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3737  horiz6 = __msa_ave_u_b(inp2, res1);
3738  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3739  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3740  horiz3, horiz4, horiz5, horiz6,
3741  horiz3, horiz2, horiz1, horiz0,
3742  horiz4, horiz5, horiz6, horiz7,
3743  const20, const6, const3);
3744  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3745  res1 = __msa_ave_u_b(avg1, res1);
3746  ST_D2(res1, 0, 1, dst, dst_stride);
3747  dst += (2 * dst_stride);
3748 
      /* Ninth input row: single-row filter, then averaged with the same
       * vector slid by one byte (sldi_b of inp0 with itself). */
3749  inp0 = LD_UB(src);
3750  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3751  mask2, mask3, const20,
3752  const6, const3);
3753  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3754  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
3755  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3756  horiz5, horiz6, horiz7, horiz8,
3757  horiz5, horiz4, horiz3, horiz2,
3758  horiz6, horiz7, horiz8, horiz8,
3759  const20, const6, const3);
3760  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3761  horiz7, horiz8, horiz8, horiz7,
3762  horiz7, horiz6, horiz5, horiz4,
3763  horiz8, horiz8, horiz7, horiz6,
3764  const20, const6, const3);
3765  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3766  res0 = __msa_ave_u_b(avg0, res0);
3767  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3768  res1 = __msa_ave_u_b(avg1, res1);
3769  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3770 }
3771 
3773  int32_t src_stride,
3774  uint8_t *dst,
3775  int32_t dst_stride,
3776  int32_t height)
3777 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Horizontal pass over 16-byte rows. Every row is loaded at src and at
       * src + 1, run through APPLY_HORIZ_QPEL_FILTER, combined with the
       * load taken at src by a rounding average (__msa_aver_u_b) and stored
       * to dst. The loop covers (height >> 2) groups of four rows and one
       * more row is processed after it, so (height >> 2) * 4 + 1 rows are
       * written.
       */
      /* 8-bit counter: (height >> 2) is truncated to uint8_t on assignment. */
3778  uint8_t loop_count;
3779  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3780  v16u8 res;
      /* Byte-reversal index table passed as the macro's mask argument. */
3781  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3782  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3783  v16u8 const3 = (v16u8) __msa_ldi_b(3);
      /* 20 is kept in 16-bit lanes: the macro multiplies v8u16 sums by it. */
3784  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3785 
3786  for (loop_count = (height >> 2); loop_count--;) {
      /* Even-numbered inpN come from src, odd-numbered from src + 1. */
3787  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3788  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3789  src += (4 * src_stride);
3790  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3791  const20, const6, const3);
3792  res = __msa_aver_u_b(inp0, res);
3793  ST_UB(res, dst);
3794  dst += dst_stride;
3795 
3796  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3797  const20, const6, const3);
3798  res = __msa_aver_u_b(inp2, res);
3799  ST_UB(res, dst);
3800  dst += dst_stride;
3801 
3802  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3803  const20, const6, const3);
3804  res = __msa_aver_u_b(inp4, res);
3805  ST_UB(res, dst);
3806  dst += dst_stride;
3807 
3808  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3809  const20, const6, const3);
3810  res = __msa_aver_u_b(inp6, res);
3811  ST_UB(res, dst);
3812  dst += dst_stride;
3813  }
3814 
      /* One extra row after the loop; stride argument 1, so presumably
       * inp0 = src and inp1 = src + 1. */
3815  LD_UB2(src, 1, inp0, inp1);
3816  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3817  res = __msa_aver_u_b(inp0, res);
3818  ST_UB(res, dst);
3819 }
3820 
3822  int32_t src_stride,
3823  uint8_t *dst,
3824  int32_t dst_stride)
3825 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
3826  uint8_t buff[272];
3827 
3828  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3829  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3830 }
3831 
3833  int32_t src_stride,
3834  uint8_t *dst,
3835  int32_t dst_stride)
3836 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. Each horizontally filtered
       * row pair is combined by a rounding average (__msa_aver_u_b) with the
       * two input rows packed into one vector, giving horiz0..horiz8. The
       * vertical macro results are then rounding-averaged with pairs of
       * those rows and stored.
       */
3837  v16u8 inp0, inp1, inp2, inp3;
3838  v16u8 res0, res1, avg0, avg1;
3839  v16u8 horiz0, horiz1, horiz2, horiz3;
3840  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
3841  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3842  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3843  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3844  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
3845  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3846  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3847  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3848 
3849  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3850  src += (4 * src_stride);
3851  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3852  const20, const6, const3);
3853  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3854  const20, const6, const3);
      /* ilvr_d packs the low lanes of two input rows (inp0 low, inp1 high)
       * so they can be averaged with the two-row filter result. */
3855  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3856  horiz0 = __msa_aver_u_b(inp0, res0);
3857  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3858  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3859  horiz2 = __msa_aver_u_b(inp2, res1);
3860  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3861  LD_UB2(src, src_stride, inp0, inp1);
3862  src += (2 * src_stride);
3863  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3864  const20, const6, const3);
3865  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3866  horiz4 = __msa_aver_u_b(inp0, res0);
3867  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
3868  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3869  horiz1, horiz2, horiz3, horiz4,
3870  horiz1, horiz0, horiz0, horiz1,
3871  horiz2, horiz3, horiz4, horiz5,
3872  const20, const6, const3);
      /* avg0: low lane from horiz0, high lane from horiz1. */
3873  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3874  res0 = __msa_aver_u_b(avg0, res0);
3875  ST_D2(res0, 0, 1, dst, dst_stride);
3876  dst += (2 * dst_stride);
3877 
3878  LD_UB2(src, src_stride, inp2, inp3);
3879  src += (2 * src_stride);
3880  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3881  const20, const6, const3);
3882  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3883  horiz6 = __msa_aver_u_b(inp2, res1);
3884  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3885  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3886  horiz3, horiz4, horiz5, horiz6,
3887  horiz3, horiz2, horiz1, horiz0,
3888  horiz4, horiz5, horiz6, horiz7,
3889  const20, const6, const3);
3890  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3891  res1 = __msa_aver_u_b(avg1, res1);
3892 
      /* Ninth input row: single-row filter averaged with the row itself. */
3893  inp0 = LD_UB(src);
3894  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3895  const20, const6, const3);
3896  horiz8 = __msa_aver_u_b(inp0, res0);
3897  ST_D2(res1, 0, 1, dst, dst_stride);
3898  dst += 2 * dst_stride;
3899 
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
3900  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3901  horiz5, horiz6, horiz7, horiz8,
3902  horiz5, horiz4, horiz3, horiz2,
3903  horiz6, horiz7, horiz8, horiz8,
3904  const20, const6, const3);
3905  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3906  res0 = __msa_aver_u_b(avg0, res0);
3907  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3908  horiz7, horiz8, horiz8, horiz7,
3909  horiz7, horiz6, horiz5, horiz4,
3910  horiz8, horiz8, horiz7, horiz6,
3911  const20, const6, const3);
3912  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3913  res1 = __msa_aver_u_b(avg1, res1);
3914  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3915 }
3916 
3918  int32_t src_stride,
3919  uint8_t *dst,
3920  int32_t dst_stride,
3921  int32_t height)
3922 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Horizontal pass over 16-byte rows. Every row is loaded at src and at
       * src + 1, run through APPLY_HORIZ_QPEL_FILTER and the macro result is
       * stored to dst unchanged. The loop covers (height >> 2) groups of
       * four rows and one more row is processed after it, so
       * (height >> 2) * 4 + 1 rows are written.
       */
      /* 8-bit counter: (height >> 2) is truncated to uint8_t on assignment. */
3923  uint8_t loop_count;
3924  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3925  v16u8 res;
      /* Byte-reversal index table passed as the macro's mask argument. */
3926  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3927  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3928  v16u8 const3 = (v16u8) __msa_ldi_b(3);
      /* 20 is kept in 16-bit lanes: the macro multiplies v8u16 sums by it. */
3929  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3930 
3931  for (loop_count = (height >> 2); loop_count--;) {
      /* Even-numbered inpN come from src, odd-numbered from src + 1. */
3932  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3933  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3934  src += (4 * src_stride);
3935  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3936  const20, const6, const3);
3937  ST_UB(res, dst);
3938  dst += dst_stride;
3939 
3940  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3941  const20, const6, const3);
3942  ST_UB(res, dst);
3943  dst += dst_stride;
3944 
3945  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3946  const20, const6, const3);
3947  ST_UB(res, dst);
3948  dst += dst_stride;
3949 
3950  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3951  const20, const6, const3);
3952  ST_UB(res, dst);
3953  dst += dst_stride;
3954  }
3955 
      /* One extra row after the loop; stride argument 1, so presumably
       * inp0 = src and inp1 = src + 1. */
3956  LD_UB2(src, 1, inp0, inp1);
3957  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3958  ST_UB(res, dst);
3959 }
3960 
3962  int32_t src_stride,
3963  uint8_t *dst,
3964  int32_t dst_stride)
3965 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
3966  uint8_t buff[272];
3967 
3968  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3969  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3970 }
3971 
3973  int32_t src_stride,
3974  uint8_t *dst,
3975  int32_t dst_stride)
3976 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. horiz0..horiz8 receive nine
       * horizontally filtered rows; the vertical macro is invoked four times
       * and each result is combined by a rounding average (__msa_aver_u_b)
       * with a pair of filtered rows before being stored.
       */
3977  v16u8 inp0, inp1, inp2, inp3;
3978  v16u8 res0, res1, avg0, avg1;
3979  v16u8 horiz0, horiz1, horiz2, horiz3;
3980  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
3981  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3982  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3983  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3984  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
3985  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3986  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3987  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3988 
      /* Each two-row macro call fills an even-numbered horizN; splati_d(.., 1)
       * copies its upper 64-bit lane into the following odd-numbered one
       * (presumably the second filtered row -- confirm against the macro). */
3989  LD_UB2(src, src_stride, inp0, inp1);
3990  src += (2 * src_stride);
3991  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
3992  mask0, mask1, mask2, mask3,
3993  const20, const6, const3);
3994  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3995  LD_UB2(src, src_stride, inp2, inp3);
3996  src += (2 * src_stride);
3997  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
3998  mask0, mask1, mask2, mask3,
3999  const20, const6, const3);
4000  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4001  LD_UB2(src, src_stride, inp0, inp1);
4002  src += (2 * src_stride);
4003  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4004  mask0, mask1, mask2, mask3,
4005  const20, const6, const3);
4006  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
4007  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4008  horiz1, horiz2, horiz3, horiz4,
4009  horiz1, horiz0, horiz0, horiz1,
4010  horiz2, horiz3, horiz4, horiz5,
4011  const20, const6, const3);
      /* avg0: low lane from horiz0, high lane from horiz1. */
4012  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4013  res0 = __msa_aver_u_b(avg0, res0);
4014  ST_D2(res0, 0, 1, dst, dst_stride);
4015  dst += (2 * dst_stride);
4016 
4017  LD_UB2(src, src_stride, inp2, inp3);
4018  src += (2 * src_stride);
4019  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4020  mask0, mask1, mask2, mask3,
4021  const20, const6, const3);
4022  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4023  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4024  horiz3, horiz4, horiz5, horiz6,
4025  horiz3, horiz2, horiz1, horiz0,
4026  horiz4, horiz5, horiz6, horiz7,
4027  const20, const6, const3);
      /* Ninth input row, filtered with the single-row macro variant. */
4028  inp0 = LD_UB(src);
4029  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4030  mask0, mask1, mask2, mask3,
4031  const20, const6, const3);
4032  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4033  res1 = __msa_aver_u_b(avg1, res1);
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
4034  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4035  horiz5, horiz6, horiz7, horiz8,
4036  horiz5, horiz4, horiz3, horiz2,
4037  horiz6, horiz7, horiz8, horiz8,
4038  const20, const6, const3);
4039  ST_D2(res1, 0, 1, dst, dst_stride);
4040  dst += 2 * dst_stride;
4041 
4042  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4043  res0 = __msa_aver_u_b(avg0, res0);
4044  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4045  horiz7, horiz8, horiz8, horiz7,
4046  horiz7, horiz6, horiz5, horiz4,
4047  horiz8, horiz8, horiz7, horiz6,
4048  const20, const6, const3);
4049  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4050  res1 = __msa_aver_u_b(avg1, res1);
4051  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4052 }
4053 
4055  int32_t src_stride,
4056  uint8_t *dst,
4057  int32_t dst_stride,
4058  int32_t height)
4059 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Horizontal pass over 16-byte rows. Every row is loaded at src and at
       * src + 1, run through APPLY_HORIZ_QPEL_FILTER, combined with the
       * load taken at src + 1 by a rounding average (__msa_aver_u_b) and
       * stored to dst. The loop covers (height >> 2) groups of four rows and
       * one more row is processed after it, so (height >> 2) * 4 + 1 rows
       * are written.
       */
      /* 8-bit counter: (height >> 2) is truncated to uint8_t on assignment. */
4060  uint8_t loop_count;
4061  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4062  v16u8 res;
      /* Byte-reversal index table passed as the macro's mask argument. */
4063  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
4064  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4065  v16u8 const3 = (v16u8) __msa_ldi_b(3);
      /* 20 is kept in 16-bit lanes: the macro multiplies v8u16 sums by it. */
4066  v8u16 const20 = (v8u16) __msa_ldi_h(20);
4067 
4068  for (loop_count = (height >> 2); loop_count--;) {
      /* Even-numbered inpN come from src, odd-numbered from src + 1. */
4069  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4070  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4071  src += (4 * src_stride);
4072  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4073  const20, const6, const3);
4074  res = __msa_aver_u_b(res, inp1);
4075  ST_UB(res, dst);
4076  dst += dst_stride;
4077 
4078  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4079  const20, const6, const3);
4080  res = __msa_aver_u_b(res, inp3);
4081  ST_UB(res, dst);
4082  dst += dst_stride;
4083 
4084  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4085  const20, const6, const3);
4086  res = __msa_aver_u_b(res, inp5);
4087  ST_UB(res, dst);
4088  dst += dst_stride;
4089 
4090  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4091  const20, const6, const3);
4092  res = __msa_aver_u_b(res, inp7);
4093  ST_UB(res, dst);
4094  dst += dst_stride;
4095  }
4096 
      /* One extra row after the loop; stride argument 1, so presumably
       * inp0 = src and inp1 = src + 1. */
4097  LD_UB2(src, 1, inp0, inp1);
4098  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4099  res = __msa_aver_u_b(inp1, res);
4100  ST_UB(res, dst);
4101 }
4102 
4104  int32_t src_stride,
4105  uint8_t *dst,
4106  int32_t dst_stride)
4107 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
4108  uint8_t buff[272];
4109 
4110  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4111  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4112 }
4113 
4115  int32_t src_stride,
4116  uint8_t *dst,
4117  int32_t dst_stride)
4118 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. Each horizontally filtered
       * row pair is combined by a rounding average (__msa_aver_u_b) with the
       * input rows after SLDI_B2_UB / __msa_sldi_b with a count of 1, giving
       * horiz0..horiz8. The vertical macro results are then
       * rounding-averaged with pairs of those rows and stored.
       */
4119  v16u8 inp0, inp1, inp2, inp3;
4120  v16u8 res0, res1, avg0, avg1;
4121  v16u8 horiz0, horiz1, horiz2, horiz3;
4122  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
4123  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4124  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4125  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4126  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
4127  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4128  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4129  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4130 
4131  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4132  src += (4 * src_stride);
4133  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4134  const20, const6, const3);
4135  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4136  const20, const6, const3);
      /* Presumably slides both input rows by one byte (macro not visible);
       * insve_d then copies lane 0 of inp1 into lane 1 of inp0 so one vector
       * holds both rows for the average. */
4137  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4138 
4139  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4140  horiz0 = __msa_aver_u_b(inp0, res0);
4141  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4142  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4143 
4144  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4145  horiz2 = __msa_aver_u_b(inp2, res1);
4146  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4147  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4148  src += (4 * src_stride);
4149  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4150  const20, const6, const3);
4151  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4152  const20, const6, const3);
4153  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4154 
4155  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4156  horiz4 = __msa_aver_u_b(inp0, res0);
4157  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4158  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4159 
4160  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4161  horiz6 = __msa_aver_u_b(inp2, res1);
4162  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
4163  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4164  horiz1, horiz2, horiz3, horiz4,
4165  horiz1, horiz0, horiz0, horiz1,
4166  horiz2, horiz3, horiz4, horiz5,
4167  const20, const6, const3);
      /* avg0: low lane from horiz0, high lane from horiz1. */
4168  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4169  res0 = __msa_aver_u_b(avg0, res0);
4170  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4171  horiz3, horiz4, horiz5, horiz6,
4172  horiz3, horiz2, horiz1, horiz0,
4173  horiz4, horiz5, horiz6, horiz7,
4174  const20, const6, const3);
4175  ST_D2(res0, 0, 1, dst, dst_stride);
4176  dst += 2 * dst_stride;
4177 
      /* Ninth input row: single-row filter, then averaged with the same
       * vector slid by one byte (sldi_b of inp0 with itself). */
4178  inp0 = LD_UB(src);
4179  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4180  const20, const6, const3);
4181  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4182  res1 = __msa_aver_u_b(avg1, res1);
4183  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4184  horiz8 = __msa_aver_u_b(inp0, res0);
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
4185  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4186  horiz5, horiz6, horiz7, horiz8,
4187  horiz5, horiz4, horiz3, horiz2,
4188  horiz6, horiz7, horiz8, horiz8,
4189  const20, const6, const3);
4190  ST_D2(res1, 0, 1, dst, dst_stride);
4191  dst += 2 * dst_stride;
4192 
4193  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4194  res0 = __msa_aver_u_b(avg0, res0);
4195  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4196  horiz7, horiz8, horiz8, horiz7,
4197  horiz7, horiz6, horiz5, horiz4,
4198  horiz8, horiz8, horiz7, horiz6,
4199  const20, const6, const3);
4200  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4201  res1 = __msa_aver_u_b(avg1, res1);
4202  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4203 }
4204 
4206  int32_t src_stride,
4207  uint8_t *dst,
4208  int32_t dst_stride)
4209 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
4210  uint8_t buff[272];
4211 
4212  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4213  vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4214 }
4215 
4217  int32_t src_stride,
4218  uint8_t *dst,
4219  int32_t dst_stride)
4220 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. Each horizontally filtered
       * row pair is combined by a rounding average (__msa_aver_u_b) with the
       * two input rows packed into one vector, giving horiz0..horiz8. The
       * vertical macro results are stored without a further average.
       */
4221  v16u8 inp0, inp1, inp2, inp3;
4222  v16u8 res0, res1;
4223  v16u8 horiz0, horiz1, horiz2, horiz3;
4224  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
4225  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4226  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4227  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4228  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
4229  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4230  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4231  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4232 
4233  LD_UB2(src, src_stride, inp0, inp1);
4234  src += (2 * src_stride);
4235  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4236  const20, const6, const3);
      /* ilvr_d packs the low lanes of two input rows (inp0 low, inp1 high)
       * so they can be averaged with the two-row filter result. */
4237  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4238  horiz0 = __msa_aver_u_b(inp0, res0);
4239  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4240 
4241  LD_UB2(src, src_stride, inp2, inp3);
4242  src += (2 * src_stride);
4243  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4244  const20, const6, const3);
4245  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4246  horiz2 = __msa_aver_u_b(inp2, res1);
4247  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4248  LD_UB2(src, src_stride, inp0, inp1);
4249  src += (2 * src_stride);
4250  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4251  const20, const6, const3);
4252  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4253  horiz4 = __msa_aver_u_b(inp0, res0);
4254  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
4255  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4256  horiz1, horiz2, horiz3, horiz4,
4257  horiz1, horiz0, horiz0, horiz1,
4258  horiz2, horiz3, horiz4, horiz5,
4259  const20, const6, const3);
4260  ST_D2(res0, 0, 1, dst, dst_stride);
4261  dst += (2 * dst_stride);
4262 
4263  LD_UB2(src, src_stride, inp2, inp3);
4264  src += (2 * src_stride);
4265  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4266  const20, const6, const3);
4267  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4268  horiz6 = __msa_aver_u_b(inp2, res1);
4269  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4270  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4271  horiz3, horiz4, horiz5, horiz6,
4272  horiz3, horiz2, horiz1, horiz0,
4273  horiz4, horiz5, horiz6, horiz7,
4274  const20, const6, const3);
      /* Ninth input row: single-row filter averaged with the row itself. */
4275  inp0 = LD_UB(src);
4276  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4277  const20, const6, const3);
4278  horiz8 = __msa_aver_u_b(inp0, res0);
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
4279  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4280  horiz5, horiz6, horiz7, horiz8,
4281  horiz5, horiz4, horiz3, horiz2,
4282  horiz6, horiz7, horiz8, horiz8,
4283  const20, const6, const3);
4284  ST_D2(res1, 0, 1, dst, dst_stride);
4285  dst += 2 * dst_stride;
4286 
4287  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4288  horiz7, horiz8, horiz8, horiz7,
4289  horiz7, horiz6, horiz5, horiz4,
4290  horiz8, horiz8, horiz7, horiz6,
4291  const20, const6, const3);
4292  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4293 }
4294 
/*
 * Two-pass 16x16 wrapper: hv_mc_qpel_aver_horiz_16x16_msa() writes into the
 * stack buffer `buff` (stride 16, height argument 16), then
 * vert_mc_qpel_16x16_msa() reads `buff` and writes to dst with dst_stride.
 */
4295 static void hv_mc_qpel_16x16_msa(const uint8_t *src,
4296  int32_t src_stride,
4297  uint8_t *dst,
4298  int32_t dst_stride)
4299 {
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
4300  uint8_t buff[272];
4301 
4302  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4303  vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4304 }
4305 
/*
 * 8x8 two-pass kernel kept in registers. horiz0..horiz8 receive nine
 * horizontally filtered rows (APPLY_HORIZ_QPEL_FILTER_8BYTE and its _1ROW
 * variant); APPLY_VERT_QPEL_FILTER_8BYTE is then invoked four times and each
 * result is stored to dst with no additional averaging step.
 */
4306 static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
4307  uint8_t *dst, int32_t dst_stride)
4308 {
4309  v16u8 inp0, inp1, inp2, inp3;
4310  v16u8 res0, res1;
4311  v16u8 horiz0, horiz1, horiz2, horiz3;
4312  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
4313  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4314  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4315  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4316  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
4317  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4318  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4319  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4320 
      /* Each two-row macro call fills an even-numbered horizN; splati_d(.., 1)
       * copies its upper 64-bit lane into the following odd-numbered one
       * (presumably the second filtered row -- confirm against the macro). */
4321  LD_UB2(src, src_stride, inp0, inp1);
4322  src += (2 * src_stride);
4323  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4324  mask0, mask1, mask2, mask3,
4325  const20, const6, const3);
4326  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4327  LD_UB2(src, src_stride, inp2, inp3);
4328  src += (2 * src_stride);
4329  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4330  mask0, mask1, mask2, mask3,
4331  const20, const6, const3);
4332  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4333  LD_UB2(src, src_stride, inp0, inp1);
4334  src += (2 * src_stride);
4335  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4336  mask0, mask1, mask2, mask3,
4337  const20, const6, const3);
4338  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* First vertical call: horiz0/horiz1 are passed repeatedly where rows
       * above horiz0 would otherwise be needed. */
4339  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4340  horiz1, horiz2, horiz3, horiz4,
4341  horiz1, horiz0, horiz0, horiz1,
4342  horiz2, horiz3, horiz4, horiz5,
4343  const20, const6, const3);
4344  ST_D2(res0, 0, 1, dst, dst_stride);
4345  dst += (2 * dst_stride);
4346 
4347  LD_UB2(src, src_stride, inp2, inp3);
4348  src += (2 * src_stride);
4349  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4350  mask0, mask1, mask2, mask3,
4351  const20, const6, const3);
4352  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4353  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4354  horiz3, horiz4, horiz5, horiz6,
4355  horiz3, horiz2, horiz1, horiz0,
4356  horiz4, horiz5, horiz6, horiz7,
4357  const20, const6, const3);
      /* Ninth input row, filtered with the single-row macro variant. */
4358  inp0 = LD_UB(src);
4359  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4360  mask0, mask1, mask2, mask3,
4361  const20, const6, const3);
4362  ST_D2(res1, 0, 1, dst, dst_stride);
4363  dst += 2 * dst_stride;
4364 
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
4365  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4366  horiz5, horiz6, horiz7, horiz8,
4367  horiz5, horiz4, horiz3, horiz2,
4368  horiz6, horiz7, horiz8, horiz8,
4369  const20, const6, const3);
4370  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4371  horiz7, horiz8, horiz8, horiz7,
4372  horiz7, horiz6, horiz5, horiz4,
4373  horiz8, horiz8, horiz7, horiz6,
4374  const20, const6, const3);
4375  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4376 }
4377 
4379  int32_t src_stride,
4380  uint8_t *dst,
4381  int32_t dst_stride)
4382 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * Two-pass 16x16 wrapper: the first call writes into the stack buffer
       * `buff` (stride 16), the second call reads `buff` and writes to dst.
       */
      /* 272 = 17 * 16 bytes; presumably room for one row more than the
       * height argument (16) passed below -- confirm against the helper. */
4383  uint8_t buff[272];
4384 
4385  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4386  vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4387 }
4388 
4390  int32_t src_stride,
4391  uint8_t *dst,
4392  int32_t dst_stride)
4393 {
      /*
       * NOTE(review): the line carrying this function's name and first
       * parameter is absent from this listing; the body uses `src`, so a
       * source pointer presumably precedes src_stride -- confirm.
       *
       * 8x8 two-pass kernel kept in registers. Each horizontally filtered
       * row pair is combined by a rounding average (__msa_aver_u_b) with the
       * input rows after SLDI_B2_UB / __msa_sldi_b with a count of 1, giving
       * horiz0..horiz8. The vertical macro results are stored without a
       * further average.
       */
4394  v16u8 inp0, inp1, inp2, inp3;
4395  v16u8 res0, res1;
4396  v16u8 horiz0, horiz1, horiz2, horiz3;
4397  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte-index tables handed to the horizontal filter macros (macro
       * bodies are not visible in this chunk). */
4398  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4399  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4400  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4401  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constants 20, 6 and 3 replicated into every byte lane; passed as the
       * trailing arguments of the filter macros. */
4402  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4403  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4404  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4405 
4406  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4407  src += (4 * src_stride);
4408 
4409  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4410  const20, const6, const3);
4411  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4412  const20, const6, const3);
      /* Presumably slides both input rows by one byte (macro not visible);
       * insve_d then copies lane 0 of inp1 into lane 1 of inp0 so one vector
       * holds both rows for the average. */
4413  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4414 
4415  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4416  horiz0 = __msa_aver_u_b(inp0, res0);
4417  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4418  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4419 
4420  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4421  horiz2 = __msa_aver_u_b(inp2, res1);
4422  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4423  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4424  src += (4 * src_stride);
4425  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4426  const20, const6, const3);
4427  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4428  const20, const6, const3);
4429  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4430 
4431  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4432  horiz4 = __msa_aver_u_b(inp0, res0);
4433  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4434  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4435 
4436  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4437  horiz6 = __msa_aver_u_b(inp2, res1);
4438  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Ninth input row: single-row filter, then averaged with the same
       * vector slid by one byte (sldi_b of inp0 with itself). */
4439  inp0 = LD_UB(src);
4440  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4441  const20, const6, const3);
4442  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4443  horiz8 = __msa_aver_u_b(inp0, res0);
      /* First two vertical calls: horiz0/horiz1 are passed repeatedly where
       * rows above horiz0 would otherwise be needed. */
4444  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4445  horiz1, horiz2, horiz3, horiz4,
4446  horiz1, horiz0, horiz0, horiz1,
4447  horiz2, horiz3, horiz4, horiz5,
4448  const20, const6, const3);
4449  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4450  horiz3, horiz4, horiz5, horiz6,
4451  horiz3, horiz2, horiz1, horiz0,
4452  horiz4, horiz5, horiz6, horiz7,
4453  const20, const6, const3);
4454  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4455  dst += (4 * dst_stride);
4456 
      /* Last two vertical calls: horiz7/horiz8 are passed repeatedly where
       * rows below horiz8 would otherwise be needed. */
4457  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4458  horiz5, horiz6, horiz7, horiz8,
4459  horiz5, horiz4, horiz3, horiz2,
4460  horiz6, horiz7, horiz8, horiz8,
4461  const20, const6, const3);
4462  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4463  horiz7, horiz8, horiz8, horiz7,
4464  horiz7, horiz6, horiz5, horiz4,
4465  horiz8, horiz8, horiz7, horiz6,
4466  const20, const6, const3);
4467  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4468 }
4469 
4471  int32_t src_stride,
4472  uint8_t *dst,
4473  int32_t dst_stride)
4474 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4470).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4475  uint8_t buff[272];
4476 
     /* Horizontal pass ("src0" helper variant), scratch stride 16, height 16. */
4477  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("aver_src1" helper variant) from scratch into dst. */
4478  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4479 }
4480 
4482  int32_t src_stride,
4483  uint8_t *dst,
4484  int32_t dst_stride)
4485 {
     /*
      * 8-wide two-pass variant. Nine horizontally filtered rows
      * (horiz0..horiz8) are built, each averaged with the unshifted source
      * row. Each vertical filter result is then averaged with the
      * intermediate rows one below (horiz1/2, horiz3/4, horiz5/6, horiz7/8)
      * before being stored.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4481).
      */
4486  v16u8 inp0, inp1, inp2, inp3;
4487  v16u8 res0, res1, avg0, avg1;
4488  v16u8 horiz0, horiz1, horiz2, horiz3;
4489  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
4490  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4491  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4492  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4493  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4494  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4495  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4496  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4497 
     /* Source rows 0-3. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
4498  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4499  src += (4 * src_stride);
4500 
4501  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4502  const20, const6, const3);
4503  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4504  const20, const6, const3);
     /* Pack the low halves of both source rows into one vector and average
      * it byte-wise with the filtered result. */
4505  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4506  horiz0 = __msa_aver_u_b(inp0, res0);
4507  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4508  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4509  horiz2 = __msa_aver_u_b(inp2, res1);
4510  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
     /* Source rows 4-5. */
4511  LD_UB2(src, src_stride, inp0, inp1);
4512  src += (2 * src_stride);
4513 
4514  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4515  const20, const6, const3);
4516  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4517  horiz4 = __msa_aver_u_b(inp0, res0);
4518  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* First vertical filter call, needs horiz0..horiz5 only. */
4519  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4520  horiz1, horiz2, horiz3, horiz4,
4521  horiz1, horiz0, horiz0, horiz1,
4522  horiz2, horiz3, horiz4, horiz5,
4523  const20, const6, const3);
     /* Average with intermediate rows 1 and 2 (low halves packed together). */
4524  avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4525  res0 = __msa_aver_u_b(avg0, res0);
4526  ST_D2(res0, 0, 1, dst, dst_stride);
4527  dst += (2 * dst_stride);
4528 
     /* Source rows 6-7. */
4529  LD_UB2(src, src_stride, inp2, inp3);
4530  src += (2 * src_stride);
4531  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4532  const20, const6, const3);
4533  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4534  horiz6 = __msa_aver_u_b(inp2, res1);
4535  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
     /* Ninth source row (row 8), handled by the single-row filter macro. */
4536  inp0 = LD_UB(src);
4537  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4538  const20, const6, const3);
4539  horiz8 = __msa_aver_u_b(inp0, res0);
4540  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4541  horiz3, horiz4, horiz5, horiz6,
4542  horiz3, horiz2, horiz1, horiz0,
4543  horiz4, horiz5, horiz6, horiz7,
4544  const20, const6, const3);
     /* Average with intermediate rows 3 and 4. */
4545  avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4546  res1 = __msa_aver_u_b(avg1, res1);
4547  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4548  horiz5, horiz6, horiz7, horiz8,
4549  horiz5, horiz4, horiz3, horiz2,
4550  horiz6, horiz7, horiz8, horiz8,
4551  const20, const6, const3);
4552  ST_D2(res1, 0, 1, dst, dst_stride);
4553  dst += 2 * dst_stride;
4554 
     /* Average with intermediate rows 5 and 6. */
4555  avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4556  res0 = __msa_aver_u_b(avg0, res0);
4557  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4558  horiz7, horiz8, horiz8, horiz7,
4559  horiz7, horiz6, horiz5, horiz4,
4560  horiz8, horiz8, horiz7, horiz6,
4561  const20, const6, const3);
     /* Average with intermediate rows 7 and 8. */
4562  avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4563  res1 = __msa_aver_u_b(avg1, res1);
     /* Remaining output rows in a single store call. */
4564  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4565 }
4566 
4568  int32_t src_stride,
4569  uint8_t *dst,
4570  int32_t dst_stride)
4571 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4567).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4572  uint8_t buff[272];
4573 
     /* Horizontal pass (helper without a srcN suffix), scratch stride 16. */
4574  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("aver_src1" helper variant) from scratch into dst. */
4575  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4576 }
4577 
4579  int32_t src_stride,
4580  uint8_t *dst,
4581  int32_t dst_stride)
4582 {
     /*
      * 8-wide two-pass variant. Nine horizontally filtered rows
      * (horiz0..horiz8) are built with no source averaging in the horizontal
      * step. Each vertical filter result is then averaged with the
      * intermediate rows one below (horiz1/2, horiz3/4, horiz5/6, horiz7/8)
      * before being stored.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4578).
      */
4583  v16u8 inp0, inp1, inp2, inp3;
4584  v16u8 res0, res1, avg0, avg1;
4585  v16u8 horiz0, horiz1, horiz2, horiz3;
4586  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
4587  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4588  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4589  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4590  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4591  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4592  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4593  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4594 
     /* Source rows 0-1. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
4595  LD_UB2(src, src_stride, inp0, inp1);
4596  src += (2 * src_stride);
4597  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4598  mask0, mask1, mask2, mask3,
4599  const20, const6, const3);
4600  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
     /* Source rows 2-3. */
4601  LD_UB2(src, src_stride, inp2, inp3);
4602  src += (2 * src_stride);
4603  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4604  mask0, mask1, mask2, mask3,
4605  const20, const6, const3);
4606  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
     /* Source rows 4-5. */
4607  LD_UB2(src, src_stride, inp0, inp1);
4608  src += (2 * src_stride);
4609  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4610  mask0, mask1, mask2, mask3,
4611  const20, const6, const3);
4612  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* First vertical filter call, needs horiz0..horiz5 only. */
4614  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4615  horiz1, horiz2, horiz3, horiz4,
4616  horiz1, horiz0, horiz0, horiz1,
4617  horiz2, horiz3, horiz4, horiz5,
4618  const20, const6, const3);
     /* Average with intermediate rows 1 and 2 (low halves packed together). */
4619  avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4620  res0 = __msa_aver_u_b(avg0, res0);
4621  ST_D2(res0, 0, 1, dst, dst_stride);
4622  dst += (2 * dst_stride);
4623 
     /* Source rows 6-7. */
4624  LD_UB2(src, src_stride, inp2, inp3);
4625  src += (2 * src_stride);
4626  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4627  mask0, mask1, mask2, mask3,
4628  const20, const6, const3);
4629  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4630  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4631  horiz3, horiz4, horiz5, horiz6,
4632  horiz3, horiz2, horiz1, horiz0,
4633  horiz4, horiz5, horiz6, horiz7,
4634  const20, const6, const3);
     /* Ninth source row (row 8), handled by the single-row filter macro. */
4635  inp0 = LD_UB(src);
4636  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4637  mask0, mask1, mask2, mask3,
4638  const20, const6, const3);
     /* Average with intermediate rows 3 and 4. */
4639  avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4640  res1 = __msa_aver_u_b(avg1, res1);
4641  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4642  horiz5, horiz6, horiz7, horiz8,
4643  horiz5, horiz4, horiz3, horiz2,
4644  horiz6, horiz7, horiz8, horiz8,
4645  const20, const6, const3);
4646  ST_D2(res1, 0, 1, dst, dst_stride);
4647  dst += 2 * dst_stride;
     /* Average with intermediate rows 5 and 6. */
4648  avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4649  res0 = __msa_aver_u_b(avg0, res0);
4650 
4651  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4652  horiz7, horiz8, horiz8, horiz7,
4653  horiz7, horiz6, horiz5, horiz4,
4654  horiz8, horiz8, horiz7, horiz6,
4655  const20, const6, const3);
     /* Average with intermediate rows 7 and 8. */
4656  avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4657  res1 = __msa_aver_u_b(avg1, res1);
     /* Remaining output rows in a single store call. */
4658  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4659 }
4660 
4662  int32_t src_stride,
4663  uint8_t *dst,
4664  int32_t dst_stride)
4665 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4661).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4666  uint8_t buff[272];
4667 
     /* Horizontal pass ("src1" helper variant), scratch stride 16, height 16. */
4668  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("aver_src1" helper variant) from scratch into dst. */
4669  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4670 }
4671 
4673  int32_t src_stride,
4674  uint8_t *dst, int32_t dst_stride)
4675 {
     /*
      * 8-wide two-pass variant. Nine horizontally filtered rows
      * (horiz0..horiz8) are built, each averaged with the source row taken
      * one byte further right. Each vertical filter result is then averaged
      * with the intermediate rows one below (horiz1/2, horiz3/4, horiz5/6,
      * horiz7/8) before being stored.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4672).
      */
4676  v16u8 inp0, inp1, inp2, inp3;
4677  v16u8 res0, res1, avg0, avg1;
4678  v16u8 horiz0, horiz1, horiz2, horiz3;
4679  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
4680  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4681  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4682  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4683  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4684  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4685  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4686  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4687 
     /* Source rows 0-3. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
4688  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4689  src += (4 * src_stride);
4690  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4691  mask0, mask1, mask2, mask3,
4692  const20, const6, const3);
     /* Shift both rows by one byte (presumably lane i then holds src[i + 1]),
      * then pack their low halves into one vector to match res0's layout. */
4693  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4694 
4695  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4696  horiz0 = __msa_aver_u_b(inp0, res0);
4697  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4698  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4699  const20, const6, const3);
4700  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4701 
4702  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4703  horiz2 = __msa_aver_u_b(inp2, res1);
4704  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
     /* Source rows 4-5. */
4705  LD_UB2(src, src_stride, inp0, inp1);
4706  src += (2 * src_stride);
4707  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4708  const20, const6, const3);
4709  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4710 
4711  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4712  horiz4 = __msa_aver_u_b(inp0, res0);
4713  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* First vertical filter call, needs horiz0..horiz5 only. */
4714  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4715  horiz1, horiz2, horiz3, horiz4,
4716  horiz1, horiz0, horiz0, horiz1,
4717  horiz2, horiz3, horiz4, horiz5,
4718  const20, const6, const3);
     /* Average with intermediate rows 1 and 2 (low halves packed together). */
4719  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4720  res0 = __msa_aver_u_b(avg0, res0);
     /* Source rows 6-7 are loaded before the first store. */
4721  LD_UB2(src, src_stride, inp2, inp3);
4722  src += (2 * src_stride);
4723  ST_D2(res0, 0, 1, dst, dst_stride);
4724  dst += 2 * dst_stride;
4725 
4726  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4727  const20, const6, const3);
4728  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4729 
4730  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4731  horiz6 = __msa_aver_u_b(inp2, res1);
4732  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4733  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4734  horiz3, horiz4, horiz5, horiz6,
4735  horiz3, horiz2, horiz1, horiz0,
4736  horiz4, horiz5, horiz6, horiz7,
4737  const20, const6, const3);
     /* Average with intermediate rows 3 and 4. */
4738  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4739  res1 = __msa_aver_u_b(avg1, res1);
     /* Ninth source row (row 8), handled by the single-row filter macro. */
4740  inp0 = LD_UB(src);
4741  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4742  const20, const6, const3);
4743  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4744  horiz8 = __msa_aver_u_b(inp0, res0);
4745  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4746  horiz5, horiz6, horiz7, horiz8,
4747  horiz5, horiz4, horiz3, horiz2,
4748  horiz6, horiz7, horiz8, horiz8,
4749  const20, const6, const3);
4750  ST_D2(res1, 0, 1, dst, dst_stride);
4751  dst += 2 * dst_stride;
4752 
     /* Average with intermediate rows 5 and 6. */
4753  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4754  res0 = __msa_aver_u_b(avg0, res0);
4755  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4756  horiz7, horiz8, horiz8, horiz7,
4757  horiz7, horiz6, horiz5, horiz4,
4758  horiz8, horiz8, horiz7, horiz6,
4759  const20, const6, const3);
     /* Average with intermediate rows 7 and 8. */
4760  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4761  res1 = __msa_aver_u_b(avg1, res1);
     /* Remaining output rows in a single store call. */
4762  ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4763 }
4764 
4766  int32_t src_stride,
4767  uint8_t *dst,
4768  int32_t dst_stride)
4769 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4765).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4770  uint8_t buff[272];
4771 
     /* Horizontal pass ("src0" helper variant), scratch stride 16, height 16. */
4772  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("avg_dst_aver_src0" helper variant) into dst. */
4773  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4774 }
4775 
4777  int32_t src_stride,
4778  uint8_t *dst,
4779  int32_t dst_stride)
4780 {
     /*
      * 8-wide two-pass variant that blends into the existing destination.
      * Nine horizontally filtered rows (horiz0..horiz8) are built, each
      * averaged with the unshifted source row. Each vertical filter result
      * is averaged first with the co-located intermediate rows (horiz0/1,
      * horiz2/3, horiz4/5, horiz6/7) and then with the rows already in dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4776).
      */
4781  v16u8 inp0, inp1, inp2, inp3;
4782  v16u8 res0, res1, avg0, avg1;
4783  v16u8 horiz0, horiz1, horiz2, horiz3;
4784  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4785  v16u8 dst0, dst1;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
4786  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4787  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4788  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4789  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4790  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4791  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4792  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4793 
     /* Source rows 0-1. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
4794  LD_UB2(src, src_stride, inp0, inp1);
4795  src += (2 * src_stride);
4796  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4797  const20, const6, const3);
     /* Source rows 2-3 are loaded while rows 0-1 are still being combined. */
4798  LD_UB2(src, src_stride, inp2, inp3);
4799  src += (2 * src_stride);
     /* Pack the low halves of both source rows and average with the filter
      * output. */
4800  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4801  horiz0 = __msa_aver_u_b(inp0, res0);
4802  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4803  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4804  const20, const6, const3);
     /* Source rows 4-5. */
4805  LD_UB2(src, src_stride, inp0, inp1);
4806  src += (2 * src_stride);
4807  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4808  horiz2 = __msa_aver_u_b(inp2, res1);
4809  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4810  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4811  const20, const6, const3);
4812  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4813  horiz4 = __msa_aver_u_b(inp0, res0);
4814  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* Read the destination rows before they are overwritten below. */
4815  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 0 and 1 packed into one vector. */
4816  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4817  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4818  horiz1, horiz2, horiz3, horiz4,
4819  horiz1, horiz0, horiz0, horiz1,
4820  horiz2, horiz3, horiz4, horiz5,
4821  const20, const6, const3);
4822  res0 = __msa_aver_u_b(avg0, res0);
     /* Second average: blend with the previous destination contents. */
4823  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4824  res0 = __msa_aver_u_b(avg0, res0);
4825  ST_D2(res0, 0, 1, dst, dst_stride);
4826  dst += (2 * dst_stride);
4827 
     /* Source rows 6-7. */
4828  LD_UB2(src, src_stride, inp2, inp3);
4829  src += (2 * src_stride);
4830  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4831  const20, const6, const3);
4832  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4833  horiz6 = __msa_aver_u_b(inp2, res1);
4834  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4835  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 2 and 3. */
4836  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4837  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4838  horiz3, horiz4, horiz5, horiz6,
4839  horiz3, horiz2, horiz1, horiz0,
4840  horiz4, horiz5, horiz6, horiz7,
4841  const20, const6, const3);
4842  res1 = __msa_aver_u_b(avg1, res1);
4843  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4844  res1 = __msa_aver_u_b(avg1, res1);
4845  ST_D2(res1, 0, 1, dst, dst_stride);
4846  dst += (2 * dst_stride);
4847 
     /* Ninth source row (row 8), handled by the single-row filter macro. */
4848  inp0 = LD_UB(src);
4849  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4850  const20, const6, const3);
4851  horiz8 = __msa_aver_u_b(inp0, res0);
4852  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 4 and 5. */
4853  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4854  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4855  horiz5, horiz6, horiz7, horiz8,
4856  horiz5, horiz4, horiz3, horiz2,
4857  horiz6, horiz7, horiz8, horiz8,
4858  const20, const6, const3);
4859  res0 = __msa_aver_u_b(avg0, res0);
4860  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4861  res0 = __msa_aver_u_b(avg0, res0);
4862  ST_D2(res0, 0, 1, dst, dst_stride);
4863  dst += (2 * dst_stride);
4864 
4865  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 6 and 7. */
4866  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4867  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4868  horiz7, horiz8, horiz8, horiz7,
4869  horiz7, horiz6, horiz5, horiz4,
4870  horiz8, horiz8, horiz7, horiz6,
4871  const20, const6, const3);
4872  res1 = __msa_aver_u_b(avg1, res1);
4873  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4874  res1 = __msa_aver_u_b(avg1, res1);
4875  ST_D2(res1, 0, 1, dst, dst_stride);
4876 }
4877 
4879  int32_t src_stride,
4880  uint8_t *dst,
4881  int32_t dst_stride)
4882 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4878).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4883  uint8_t buff[272];
4884 
     /* Horizontal pass (helper without a srcN suffix), scratch stride 16. */
4885  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("avg_dst_aver_src0" helper variant) into dst. */
4886  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4887 }
4888 
4890  int32_t src_stride,
4891  uint8_t *dst,
4892  int32_t dst_stride)
4893 {
     /*
      * 8-wide two-pass variant that blends into the existing destination.
      * Nine horizontally filtered rows (horiz0..horiz8) are built with no
      * source averaging in the horizontal step. Each vertical filter result
      * is averaged first with the co-located intermediate rows (horiz0/1,
      * horiz2/3, horiz4/5, horiz6/7) and then with the rows already in dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4889).
      */
4894  v16u8 inp0, inp1, inp2, inp3;
4895  v16u8 res0, res1, avg0, avg1;
4896  v16u8 horiz0, horiz1, horiz2, horiz3;
4897  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4898  v16u8 dst0, dst1;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
4899  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4900  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4901  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4902  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4903  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4904  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4905  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4906 
     /* Source rows 0-1. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
4907  LD_UB2(src, src_stride, inp0, inp1);
4908  src += (2 * src_stride);
4909  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4910  mask0, mask1, mask2, mask3,
4911  const20, const6, const3);
     /* Source rows 2-3. */
4912  LD_UB2(src, src_stride, inp2, inp3);
4913  src += (2 * src_stride);
4914  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4915  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4916  mask0, mask1, mask2, mask3,
4917  const20, const6, const3);
     /* Source rows 4-5. */
4918  LD_UB2(src, src_stride, inp0, inp1);
4919  src += (2 * src_stride);
4920  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4921  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4922  mask0, mask1, mask2, mask3,
4923  const20, const6, const3);
4924  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* Read the destination rows before they are overwritten below. */
4925  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 0 and 1 packed into one vector. */
4926  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4927  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4928  horiz1, horiz2, horiz3, horiz4,
4929  horiz1, horiz0, horiz0, horiz1,
4930  horiz2, horiz3, horiz4, horiz5,
4931  const20, const6, const3);
4932  res0 = __msa_aver_u_b(avg0, res0);
     /* Second average: blend with the previous destination contents. */
4933  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4934  res0 = __msa_aver_u_b(avg0, res0);
4935  ST_D2(res0, 0, 1, dst, dst_stride);
4936  dst += (2 * dst_stride);
4937 
     /* Source rows 6-7. */
4938  LD_UB2(src, src_stride, inp2, inp3);
4939  src += (2 * src_stride);
4940  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4941  mask0, mask1, mask2, mask3,
4942  const20, const6, const3);
4943  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4944  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 2 and 3. */
4945  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4946  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4947  horiz3, horiz4, horiz5, horiz6,
4948  horiz3, horiz2, horiz1, horiz0,
4949  horiz4, horiz5, horiz6, horiz7,
4950  const20, const6, const3);
4951  res1 = __msa_aver_u_b(avg1, res1);
4952  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4953  res1 = __msa_aver_u_b(avg1, res1);
4954  ST_D2(res1, 0, 1, dst, dst_stride);
4955  dst += (2 * dst_stride);
4956 
     /* Ninth source row (row 8), handled by the single-row filter macro. */
4957  inp0 = LD_UB(src);
4958  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4959  mask0, mask1, mask2, mask3,
4960  const20, const6, const3);
4961  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 4 and 5. */
4962  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4963  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4964  horiz5, horiz6, horiz7, horiz8,
4965  horiz5, horiz4, horiz3, horiz2,
4966  horiz6, horiz7, horiz8, horiz8,
4967  const20, const6, const3);
4968  res0 = __msa_aver_u_b(avg0, res0);
4969  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4970  res0 = __msa_aver_u_b(avg0, res0);
4971  ST_D2(res0, 0, 1, dst, dst_stride);
4972  dst += (2 * dst_stride);
4973 
4974  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 6 and 7. */
4975  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4976  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4977  horiz7, horiz8, horiz8, horiz7,
4978  horiz7, horiz6, horiz5, horiz4,
4979  horiz8, horiz8, horiz7, horiz6,
4980  const20, const6, const3);
4981  res1 = __msa_aver_u_b(avg1, res1);
4982  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4983  res1 = __msa_aver_u_b(avg1, res1);
4984  ST_D2(res1, 0, 1, dst, dst_stride);
4985 }
4986 
4988  int32_t src_stride,
4989  uint8_t *dst,
4990  int32_t dst_stride)
4991 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4987).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
4992  uint8_t buff[272];
4993 
     /* Horizontal pass ("src1" helper variant), scratch stride 16, height 16. */
4994  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("avg_dst_aver_src0" helper variant) into dst. */
4995  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4996 }
4997 
4999  int32_t src_stride,
5000  uint8_t *dst,
5001  int32_t dst_stride)
5002 {
     /*
      * 8-wide two-pass variant that blends into the existing destination.
      * Nine horizontally filtered rows (horiz0..horiz8) are built, each
      * averaged with the source row taken one byte further right. Each
      * vertical filter result is averaged first with the co-located
      * intermediate rows (horiz0/1, horiz2/3, horiz4/5, horiz6/7) and then
      * with the rows already in dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 4998).
      */
5003  v16u8 inp0, inp1, inp2, inp3;
5004  v16u8 res0, res1, avg0, avg1;
5005  v16u8 horiz0, horiz1, horiz2, horiz3;
5006  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5007  v16u8 dst0, dst1;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
5008  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5009  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5010  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5011  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5012  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5013  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5014  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5015 
     /* Source rows 0-1. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
5016  LD_UB2(src, src_stride, inp0, inp1);
5017  src += (2 * src_stride);
5018  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5019  const20, const6, const3);
5020 
     /* Source rows 2-3. */
5021  LD_UB2(src, src_stride, inp2, inp3);
5022  src += (2 * src_stride);
     /* Shift both rows by one byte (presumably lane i then holds src[i + 1]),
      * then pack their low halves into one vector to match res0's layout. */
5023  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5024 
5025  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5026  horiz0 = __msa_aver_u_b(inp0, res0);
5027  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5028  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5029  const20, const6, const3);
     /* Source rows 4-5. */
5030  LD_UB2(src, src_stride, inp0, inp1);
5031  src += (2 * src_stride);
5032  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5033 
5034  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5035  horiz2 = __msa_aver_u_b(inp2, res1);
5036  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5037  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5038  const20, const6, const3);
5039 
5040  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5041 
5042  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5043  horiz4 = __msa_aver_u_b(inp0, res0);
5044  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* Read the destination rows before they are overwritten below. */
5045  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 0 and 1 packed into one vector. */
5046  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5047  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5048  horiz1, horiz2, horiz3, horiz4,
5049  horiz1, horiz0, horiz0, horiz1,
5050  horiz2, horiz3, horiz4, horiz5,
5051  const20, const6, const3);
5052  res0 = __msa_aver_u_b(avg0, res0);
     /* Second average: blend with the previous destination contents. */
5053  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5054  res0 = __msa_aver_u_b(avg0, res0);
5055  ST_D2(res0, 0, 1, dst, dst_stride);
5056  dst += (2 * dst_stride);
5057 
     /* Source rows 6-7. */
5058  LD_UB2(src, src_stride, inp2, inp3);
5059  src += (2 * src_stride);
5060  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5061  const20, const6, const3);
5062 
5063  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5064 
5065  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5066  horiz6 = __msa_aver_u_b(inp2, res1);
5067  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5068  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 2 and 3. */
5069  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5070  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5071  horiz3, horiz4, horiz5, horiz6,
5072  horiz3, horiz2, horiz1, horiz0,
5073  horiz4, horiz5, horiz6, horiz7,
5074  const20, const6, const3);
5075  res1 = __msa_aver_u_b(avg1, res1);
5076  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5077  res1 = __msa_aver_u_b(avg1, res1);
5078  ST_D2(res1, 0, 1, dst, dst_stride);
5079  dst += (2 * dst_stride);
5080 
     /* Ninth source row (row 8), handled by the single-row filter macro. */
5081  inp0 = LD_UB(src);
5082  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5083  const20, const6, const3);
5084  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5085  horiz8 = __msa_aver_u_b(inp0, res0);
5086  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 4 and 5. */
5087  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5088  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5089  horiz5, horiz6, horiz7, horiz8,
5090  horiz5, horiz4, horiz3, horiz2,
5091  horiz6, horiz7, horiz8, horiz8,
5092  const20, const6, const3);
5093  res0 = __msa_aver_u_b(avg0, res0);
5094  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5095  res0 = __msa_aver_u_b(avg0, res0);
5096  ST_D2(res0, 0, 1, dst, dst_stride);
5097  dst += (2 * dst_stride);
5098 
5099  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Intermediate rows 6 and 7. */
5100  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5101  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5102  horiz7, horiz8, horiz8, horiz7,
5103  horiz7, horiz6, horiz5, horiz4,
5104  horiz8, horiz8, horiz7, horiz6,
5105  const20, const6, const3);
5106  res1 = __msa_aver_u_b(avg1, res1);
5107  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5108  res1 = __msa_aver_u_b(avg1, res1);
5109  ST_D2(res1, 0, 1, dst, dst_stride);
5110 }
5111 
5113  int32_t src_stride,
5114  uint8_t *dst,
5115  int32_t dst_stride)
5116 {
     /*
      * Two-pass 16x16 path: a horizontal pass writes into a local scratch
      * buffer, then a vertical pass reads that buffer and writes dst.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 5112).
      */
     /* 272 = 17 * 16 bytes at the stride of 16 passed below; presumably 16
      * rows plus one extra row needed by the vertical pass -- confirm in the
      * horizontal helper. */
5117  uint8_t buff[272];
5118 
     /* Horizontal pass ("src0" helper variant), scratch stride 16, height 16. */
5119  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
     /* Vertical pass ("avg_dst" helper variant) from scratch into dst. */
5120  vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
5121 }
5122 
5124  int32_t src_stride,
5125  uint8_t *dst,
5126  int32_t dst_stride)
5127 {
     /*
      * 8-wide two-pass variant that blends into the existing destination.
      * Nine horizontally filtered rows (horiz0..horiz8) are built, each
      * averaged with the unshifted source row. The vertical filter results
      * are averaged only with the rows already in dst before being stored.
      * NOTE(review): the line carrying the function name and the `src`
      * parameter is absent from this listing (embedded numbering skips 5123).
      */
5128  v16u8 inp0, inp1, inp2, inp3;
5129  v16u8 res0, res1, avg0, avg1;
5130  v16u8 horiz0, horiz1, horiz2, horiz3;
5131  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5132  v16u8 dst0, dst1;
     /* Shuffle masks and the 20/6/3 byte constants are passed unchanged to
      * the filter macros below. */
5133  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5134  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5135  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5136  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5137  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5138  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5139  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5140 
     /* Source rows 0-1. Each filter call takes two rows; the result vector
      * is treated as one row per 64-bit half (see the splati_d calls). */
5141  LD_UB2(src, src_stride, inp0, inp1);
5142  src += (2 * src_stride);
5143  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5144  const20, const6, const3);
     /* Source rows 2-3. */
5145  LD_UB2(src, src_stride, inp2, inp3);
5146  src += (2 * src_stride);
     /* Pack the low halves of both source rows and average with the filter
      * output. */
5147  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5148  horiz0 = __msa_aver_u_b(inp0, res0);
5149  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5150  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5151  const20, const6, const3);
     /* Source rows 4-5. */
5152  LD_UB2(src, src_stride, inp0, inp1);
5153  src += (2 * src_stride);
5154  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5155  horiz2 = __msa_aver_u_b(inp2, res1);
5156  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5157  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5158  const20, const6, const3);
5159  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5160  horiz4 = __msa_aver_u_b(inp0, res0);
5161  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
     /* Read the destination rows before they are overwritten below. */
5162  LD_UB2(dst, dst_stride, dst0, dst1);
5163  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5164  horiz1, horiz2, horiz3, horiz4,
5165  horiz1, horiz0, horiz0, horiz1,
5166  horiz2, horiz3, horiz4, horiz5,
5167  const20, const6, const3);
     /* Blend the vertical result with the previous destination contents. */
5168  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5169  res0 = __msa_aver_u_b(avg0, res0);
5170  ST_D2(res0, 0, 1, dst, dst_stride);
5171  dst += (2 * dst_stride);
5172 
     /* Source rows 6-7. */
5173  LD_UB2(src, src_stride, inp2, inp3);
5174  src += (2 * src_stride);
5175  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5176  const20, const6, const3);
5177  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5178  horiz6 = __msa_aver_u_b(inp2, res1);
5179  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5180  LD_UB2(dst, dst_stride, dst0, dst1);
5181  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5182  horiz3, horiz4, horiz5, horiz6,
5183  horiz3, horiz2, horiz1, horiz0,
5184  horiz4, horiz5, horiz6, horiz7,
5185  const20, const6, const3);
5186  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5187  res1 = __msa_aver_u_b(avg1, res1);
5188  ST_D2(res1, 0, 1, dst, dst_stride);
5189  dst += (2 * dst_stride);
5190 
     /* Ninth source row (row 8), handled by the single-row filter macro. */
5191  inp0 = LD_UB(src);
5192  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5193  const20, const6, const3);
5194  horiz8 = __msa_aver_u_b(inp0, res0);
5195  LD_UB2(dst, dst_stride, dst0, dst1);
5196  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5197  horiz5, horiz6, horiz7, horiz8,
5198  horiz5, horiz4, horiz3, horiz2,
5199  horiz6, horiz7, horiz8, horiz8,
5200  const20, const6, const3);
5201  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5202  res0 = __msa_aver_u_b(avg0, res0);
5203  ST_D2(res0, 0, 1, dst, dst_stride);
5204  dst += (2 * dst_stride);
5205 
5206  LD_UB2(dst, dst_stride, dst0, dst1);
5207  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5208  horiz7, horiz8, horiz8, horiz7,
5209  horiz7, horiz6, horiz5, horiz4,
5210  horiz8, horiz8, horiz7, horiz6,
5211  const20, const6, const3);
5212  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5213  res1 = __msa_aver_u_b(avg1, res1);
5214  ST_D2(res1, 0, 1, dst, dst_stride);
5215 }
5216 
/*
 * 16x16 two-pass (horizontal, then vertical) qpel path whose result is
 * combined with dst by the vertical "avg_dst" helper.
 *
 * The horizontal helper writes into a local 272-byte scratch area using a
 * row pitch of 16; the vertical helper then reads that area with the same
 * pitch and writes to dst.
 * NOTE(review): 272 == 17 * 16, so the horizontal helper presumably emits
 * 17 rows for the vertical taps -- confirm against the helper.
 */
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    uint8_t buff[272];

    /* pass 1: horizontal filter, src -> scratch */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);

    /* pass 2: vertical filter, scratch -> dst */
    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
}
5226 
/*
 * 8x8 two-pass (horizontal, then vertical) qpel path, averaged into dst.
 *
 * Nine source rows are read (four row pairs plus one single row) and each is
 * run through the horizontal filter macros with the coefficient vectors 20,
 * 6 and 3. The two-row macro returns both rows in one vector; the upper
 * doubleword is copied out with __msa_splati_d(.., 1), so the odd rows live
 * in horiz1, horiz3, horiz5 and horiz7.
 *
 * The vertical filter macro is then invoked four times. Each result is
 * averaged (__msa_aver_u_b) with two dst rows packed by __msa_ilvr_d and
 * written back as two doublewords with ST_D2.
 *
 * NOTE(review): the tap layout is defined by the APPLY_*_8BYTE macros, which
 * are not visible here; the repeated/reversed horizN arguments in the first
 * and last calls look like edge mirroring -- confirm against the macros.
 */
5227 static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
5228  uint8_t *dst, int32_t dst_stride)
5229 {
5230  v16u8 inp0, inp1, inp2, inp3;
5231  v16u8 res0, res1, avg0, avg1;
5232  v16u8 horiz0, horiz1, horiz2, horiz3;
5233  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5234  v16u8 dst0, dst1;
 /* byte-shuffle patterns consumed by the horizontal filter macros */
5235  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5236  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5237  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5238  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
 /* filter coefficients replicated into every byte lane */
5239  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5240  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5241  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5242 
 /* horizontal pass over source rows 0..8 */
5243  LD_UB2(src, src_stride, inp0, inp1);
5244  src += (2 * src_stride);
5245  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5246  mask0, mask1, mask2, mask3,
5247  const20, const6, const3);
5248  LD_UB2(src, src_stride, inp2, inp3);
5249  src += (2 * src_stride);
5250  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5251  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5252  mask0, mask1, mask2, mask3,
5253  const20, const6, const3);
5254  LD_UB2(src, src_stride, inp0, inp1);
5255  src += (2 * src_stride);
5256  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5257  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5258  mask0, mask1, mask2, mask3,
5259  const20, const6, const3);
5260  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5261  LD_UB2(src, src_stride, inp2, inp3);
5262  src += (2 * src_stride);
5263  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5264  mask0, mask1, mask2, mask3,
5265  const20, const6, const3);
5266  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
 /* ninth row is filtered on its own */
5267  inp0 = LD_UB(src);
5268  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5269  mask0, mask1, mask2, mask3,
5270  const20, const6, const3);
 /* vertical pass, output rows 0-1, averaged with the current dst rows */
5271  LD_UB2(dst, dst_stride, dst0, dst1);
5272  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5273  horiz1, horiz2, horiz3, horiz4,
5274  horiz1, horiz0, horiz0, horiz1,
5275  horiz2, horiz3, horiz4, horiz5,
5276  const20, const6, const3);
5277  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5278  res0 = __msa_aver_u_b(avg0, res0);
5279  ST_D2(res0, 0, 1, dst, dst_stride);
5280  dst += (2 * dst_stride);
5281 
 /* output rows 2-3 */
5282  LD_UB2(dst, dst_stride, dst0, dst1);
5283  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5284  horiz3, horiz4, horiz5, horiz6,
5285  horiz3, horiz2, horiz1, horiz0,
5286  horiz4, horiz5, horiz6, horiz7,
5287  const20, const6, const3);
5288  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5289  res1 = __msa_aver_u_b(avg1, res1);
5290  ST_D2(res1, 0, 1, dst, dst_stride);
5291  dst += (2 * dst_stride);
5292 
 /* output rows 4-5 */
5293  LD_UB2(dst, dst_stride, dst0, dst1);
5294  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5295  horiz5, horiz6, horiz7, horiz8,
5296  horiz5, horiz4, horiz3, horiz2,
5297  horiz6, horiz7, horiz8, horiz8,
5298  const20, const6, const3);
5299  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5300  res0 = __msa_aver_u_b(avg0, res0);
5301  ST_D2(res0, 0, 1, dst, dst_stride);
5302  dst += (2 * dst_stride);
5303 
 /* output rows 6-7 */
5304  LD_UB2(dst, dst_stride, dst0, dst1);
5305  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5306  horiz7, horiz8, horiz8, horiz7,
5307  horiz7, horiz6, horiz5, horiz4,
5308  horiz8, horiz8, horiz7, horiz6,
5309  const20, const6, const3);
5310  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5311  res1 = __msa_aver_u_b(avg1, res1);
5312  ST_D2(res1, 0, 1, dst, dst_stride);
5313 }
5314 
5316  int32_t src_stride,
5317  uint8_t *dst,
5318  int32_t dst_stride)
5319 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * Two-pass 16x16 path: the "aver_horiz_src1" helper fills a local 272-byte
  * scratch area (row pitch 16), then the vertical "avg_dst" helper reads it
  * with the same pitch and writes to dst. */
5320  uint8_t buff[272];
5321 
5322  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5323  vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
5324 }
5325 
5327  int32_t src_stride,
5328  uint8_t *dst,
5329  int32_t dst_stride)
5330 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * 8x8 two-pass qpel path whose result is averaged into dst. Every
  * horizontal filter result is first averaged (__msa_aver_u_b) with the
  * source rows after SLDI_B2_UB(..., 1, ...) -- presumably the rows slid by
  * one byte, i.e. the right-hand neighbour pixels; confirm against the
  * macro. The vertical filter output is then averaged with the existing dst
  * rows and stored two rows at a time with ST_D2. */
5331  v16u8 inp0, inp1, inp2, inp3;
5332  v16u8 res0, res1, avg0, avg1;
5333  v16u8 horiz0, horiz1, horiz2, horiz3;
5334  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5335  v16u8 dst0, dst1;
5336  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5337  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5338  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5339  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5340  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5341  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5342  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5343 
 /* rows 0-1: filter, then average with the slid source rows */
5344  LD_UB2(src, src_stride, inp0, inp1);
5345  src += (2 * src_stride);
5346  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5347  const20, const6, const3);
5348  LD_UB2(src, src_stride, inp2, inp3);
5349  src += (2 * src_stride);
5350  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5351 
 /* pack both rows into one vector so the layout matches res0 */
5352  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5353  horiz0 = __msa_aver_u_b(inp0, res0);
5354  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
 /* rows 2-3 */
5355  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5356  const20, const6, const3);
5357  LD_UB2(src, src_stride, inp0, inp1);
5358  src += (2 * src_stride);
5359  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5360 
5361  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5362  horiz2 = __msa_aver_u_b(inp2, res1);
5363  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
 /* rows 4-5 */
5364  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5365  const20, const6, const3);
5366 
5367  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5368 
5369  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5370  horiz4 = __msa_aver_u_b(inp0, res0);
5371  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
 /* vertical pass, output rows 0-1, averaged with dst */
5372  LD_UB2(dst, dst_stride, dst0, dst1);
5373  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5374  horiz1, horiz2, horiz3, horiz4,
5375  horiz1, horiz0, horiz0, horiz1,
5376  horiz2, horiz3, horiz4, horiz5,
5377  const20, const6, const3);
5378  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5379  res0 = __msa_aver_u_b(avg0, res0);
5380  ST_D2(res0, 0, 1, dst, dst_stride);
5381  dst += (2 * dst_stride);
5382 
 /* rows 6-7 of the horizontal pass, then output rows 2-3 */
5383  LD_UB2(src, src_stride, inp2, inp3);
5384  src += (2 * src_stride);
5385  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5386  const20, const6, const3);
5387 
5388  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5389 
5390  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5391  horiz6 = __msa_aver_u_b(inp2, res1);
5392  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5393  LD_UB2(dst, dst_stride, dst0, dst1);
5394  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5395  horiz3, horiz4, horiz5, horiz6,
5396  horiz3, horiz2, horiz1, horiz0,
5397  horiz4, horiz5, horiz6, horiz7,
5398  const20, const6, const3);
5399  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5400  res1 = __msa_aver_u_b(avg1, res1);
5401  ST_D2(res1, 0, 1, dst, dst_stride);
5402  dst += (2 * dst_stride);
5403 
 /* row 8 of the horizontal pass (single row), then output rows 4-5 */
5404  inp0 = LD_UB(src);
5405  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5406  const20, const6, const3);
5407  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5408  horiz8 = __msa_aver_u_b(inp0, res0);
5409  LD_UB2(dst, dst_stride, dst0, dst1);
5410  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5411  horiz5, horiz6, horiz7, horiz8,
5412  horiz5, horiz4, horiz3, horiz2,
5413  horiz6, horiz7, horiz8, horiz8,
5414  const20, const6, const3);
5415  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5416  res0 = __msa_aver_u_b(avg0, res0);
5417  ST_D2(res0, 0, 1, dst, dst_stride);
5418  dst += (2 * dst_stride);
5419 
 /* output rows 6-7 */
5420  LD_UB2(dst, dst_stride, dst0, dst1);
5421  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5422  horiz7, horiz8, horiz8, horiz7,
5423  horiz7, horiz6, horiz5, horiz4,
5424  horiz8, horiz8, horiz7, horiz6,
5425  const20, const6, const3);
5426  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427  res1 = __msa_aver_u_b(avg1, res1);
5428  ST_D2(res1, 0, 1, dst, dst_stride);
5429 }
5430 
5432  int32_t src_stride,
5433  uint8_t *dst,
5434  int32_t dst_stride)
5435 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * Two-pass 16x16 path: the "aver_horiz_src0" helper fills a local 272-byte
  * scratch area (row pitch 16), then the vertical "avg_dst_aver_src1" helper
  * reads it with the same pitch and writes to dst. */
5436  uint8_t buff[272];
5437 
5438  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
5439  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5440 }
5441 
5443  int32_t src_stride,
5444  uint8_t *dst,
5445  int32_t dst_stride)
5446 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * 8x8 two-pass qpel path whose result is averaged into dst. Each horizontal
  * filter result is averaged (__msa_aver_u_b) with the unshifted source rows
  * to form horizN. Each vertical filter result is then averaged twice: first
  * with a pair of horizN rows one row below the output rows (horiz1/horiz2
  * for outputs 0-1, and so on), then with the existing dst rows. */
5447  v16u8 inp0, inp1, inp2, inp3;
5448  v16u8 res0, res1, avg0, avg1;
5449  v16u8 horiz0, horiz1, horiz2, horiz3;
5450  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5451  v16u8 dst0, dst1;
5452  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5453  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5454  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5455  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5456  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5457  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5458  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5459 
 /* horizontal pass, rows 0-1 */
5460  LD_UB2(src, src_stride, inp0, inp1);
5461  src += (2 * src_stride);
5462 
5463  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5464  const20, const6, const3);
5465  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5466  horiz0 = __msa_aver_u_b(inp0, res0);
5467  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
 /* rows 2-3 */
5468  LD_UB2(src, src_stride, inp2, inp3);
5469  src += (2 * src_stride);
5470  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5471  const20, const6, const3);
5472  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5473  horiz2 = __msa_aver_u_b(inp2, res1);
5474  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
 /* rows 4-5; dst rows 0-1 are fetched ahead of their use below */
5475  LD_UB2(dst, dst_stride, dst0, dst1);
5476  LD_UB2(src, src_stride, inp0, inp1);
5477  src += (2 * src_stride);
5478  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5479  const20, const6, const3);
5480  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5481  horiz4 = __msa_aver_u_b(inp0, res0);
5482  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
 /* vertical pass, output rows 0-1 */
5483  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5484  horiz1, horiz2, horiz3, horiz4,
5485  horiz1, horiz0, horiz0, horiz1,
5486  horiz2, horiz3, horiz4, horiz5,
5487  const20, const6, const3);
 /* average with horizontal rows 1-2, then with dst */
5488  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5489  res0 = __msa_aver_u_b(avg0, res0);
5490  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5491  res0 = __msa_aver_u_b(avg0, res0);
5492  ST_D2(res0, 0, 1, dst, dst_stride);
5493  dst += (2 * dst_stride);
5494 
 /* horizontal rows 6-7, then output rows 2-3 */
5495  LD_UB2(dst, dst_stride, dst0, dst1);
5496  LD_UB2(src, src_stride, inp2, inp3);
5497  src += (2 * src_stride);
5498  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5499  const20, const6, const3);
5500  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5501  horiz6 = __msa_aver_u_b(inp2, res1);
5502  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5503  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5504  horiz3, horiz4, horiz5, horiz6,
5505  horiz3, horiz2, horiz1, horiz0,
5506  horiz4, horiz5, horiz6, horiz7,
5507  const20, const6, const3);
5508  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5509  res1 = __msa_aver_u_b(avg1, res1);
5510  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5511  res1 = __msa_aver_u_b(avg1, res1);
5512  ST_D2(res1, 0, 1, dst, dst_stride);
5513  dst += (2 * dst_stride);
5514 
 /* horizontal row 8 (single row), then output rows 4-5 and 6-7 */
5515  inp0 = LD_UB(src);
5516  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5517  const20, const6, const3);
5518  horiz8 = __msa_aver_u_b(inp0, res0);
5519  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5520  horiz5, horiz6, horiz7, horiz8,
5521  horiz5, horiz4, horiz3, horiz2,
5522  horiz6, horiz7, horiz8, horiz8,
5523  const20, const6, const3);
5524  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5525  horiz7, horiz8, horiz8, horiz7,
5526  horiz7, horiz6, horiz5, horiz4,
5527  horiz8, horiz8, horiz7, horiz6,
5528  const20, const6, const3);
5529  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5530  res0 = __msa_aver_u_b(avg0, res0);
5531  LD_UB2(dst, dst_stride, dst0, dst1);
5532  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5533  res0 = __msa_aver_u_b(avg0, res0);
5534  ST_D2(res0, 0, 1, dst, dst_stride);
5535  dst += (2 * dst_stride);
5536 
5537  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5538  res1 = __msa_aver_u_b(avg1, res1);
5539  LD_UB2(dst, dst_stride, dst0, dst1);
5540  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5541  res1 = __msa_aver_u_b(avg1, res1);
5542  ST_D2(res1, 0, 1, dst, dst_stride);
5543 }
5544 
5546  int32_t src_stride,
5547  uint8_t *dst,
5548  int32_t dst_stride)
5549 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * Two-pass 16x16 path: the "aver_horiz" helper fills a local 272-byte
  * scratch area (row pitch 16), then the vertical "avg_dst_aver_src1" helper
  * reads it with the same pitch and writes to dst. */
5550  uint8_t buff[272];
5551 
5552  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
5553  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5554 }
5555 
5557  int32_t src_stride,
5558  uint8_t *dst,
5559  int32_t dst_stride)
5560 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * 8x8 two-pass qpel path whose result is averaged into dst. The horizontal
  * filter output is used as is (no averaging with the source). Each vertical
  * filter result is averaged first with a pair of horizN rows one row below
  * the output rows, then with the existing dst rows, and stored two rows at
  * a time with ST_D2. */
5561  v16u8 inp0, inp1, inp2, inp3;
5562  v16u8 res0, res1, avg0, avg1;
5563  v16u8 horiz0, horiz1, horiz2, horiz3;
5564  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5565  v16u8 dst0, dst1;
5566  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5567  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5568  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5569  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5570  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5571  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5572  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5573 
 /* horizontal pass, rows 0-5 */
5574  LD_UB2(src, src_stride, inp0, inp1);
5575  src += (2 * src_stride);
5576  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5577  mask0, mask1, mask2, mask3,
5578  const20, const6, const3);
5579  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5580  LD_UB2(src, src_stride, inp2, inp3);
5581  src += (2 * src_stride);
5582  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5583  mask0, mask1, mask2, mask3,
5584  const20, const6, const3);
5585  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5586  LD_UB2(dst, dst_stride, dst0, dst1);
5587  LD_UB2(src, src_stride, inp0, inp1);
5588  src += (2 * src_stride);
5589  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5590  mask0, mask1, mask2, mask3,
5591  const20, const6, const3);
5592  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
 /* vertical pass, output rows 0-1 */
5593  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5594  horiz1, horiz2, horiz3, horiz4,
5595  horiz1, horiz0, horiz0, horiz1,
5596  horiz2, horiz3, horiz4, horiz5,
5597  const20, const6, const3);
 /* average with horizontal rows 1-2, then with dst */
5598  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5599  res0 = __msa_aver_u_b(avg0, res0);
5600  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5601  res0 = __msa_aver_u_b(avg0, res0);
5602  ST_D2(res0, 0, 1, dst, dst_stride);
5603  dst += (2 * dst_stride);
5604 
 /* horizontal rows 6-7, then output rows 2-3 */
5605  LD_UB2(dst, dst_stride, dst0, dst1);
5606  LD_UB2(src, src_stride, inp2, inp3);
5607  src += (2 * src_stride);
5608  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5609  mask0, mask1, mask2, mask3,
5610  const20, const6, const3);
5611  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5612  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5613  horiz3, horiz4, horiz5, horiz6,
5614  horiz3, horiz2, horiz1, horiz0,
5615  horiz4, horiz5, horiz6, horiz7,
5616  const20, const6, const3);
5617  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5618  res1 = __msa_aver_u_b(avg1, res1);
5619  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5620  res1 = __msa_aver_u_b(avg1, res1);
5621  ST_D2(res1, 0, 1, dst, dst_stride);
5622  dst += (2 * dst_stride);
5623 
 /* horizontal row 8 (single row), then output rows 4-5 and 6-7 */
5624  inp0 = LD_UB(src);
5625  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5626  mask0, mask1, mask2, mask3,
5627  const20, const6, const3);
5628  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5629  horiz6, horiz7, horiz8, horiz5, horiz4,
5630  horiz3, horiz2, horiz6, horiz7, horiz8,
5631  horiz8, const20, const6, const3);
5632  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5633  horiz8, horiz8, horiz7, horiz7, horiz6,
5634  horiz5, horiz4, horiz8, horiz8, horiz7,
5635  horiz6, const20, const6, const3);
5636  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5637  res0 = __msa_aver_u_b(avg0, res0);
5638  LD_UB2(dst, dst_stride, dst0, dst1);
5639  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5640  res0 = __msa_aver_u_b(avg0, res0);
5641  ST_D2(res0, 0, 1, dst, dst_stride);
5642  dst += (2 * dst_stride);
5643 
5644  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5645  res1 = __msa_aver_u_b(avg1, res1);
5646  LD_UB2(dst, dst_stride, dst0, dst1);
5647  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5648  res1 = __msa_aver_u_b(avg1, res1);
5649  ST_D2(res1, 0, 1, dst, dst_stride);
5650 }
5651 
5653  int32_t src_stride,
5654  uint8_t *dst,
5655  int32_t dst_stride)
5656 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * Two-pass 16x16 path: the "aver_horiz_src1" helper fills a local 272-byte
  * scratch area (row pitch 16), then the vertical "avg_dst_aver_src1" helper
  * reads it with the same pitch and writes to dst. */
5657  uint8_t buff[272];
5658 
5659  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5660  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5661 }
5662 
5664  int32_t src_stride,
5665  uint8_t *dst,
5666  int32_t dst_stride)
5667 {
 /* NOTE(review): the declarator line (return type, function name, first
  * parameter) is missing from this listing.
  *
  * 8x8 two-pass qpel path whose result is averaged into dst. Each horizontal
  * filter result is averaged with the source rows after SLDI_B2_UB(..., 1,
  * ...) -- presumably the rows slid by one byte; confirm against the macro.
  * Each vertical filter result is then averaged first with a pair of horizN
  * rows one row below the output rows, then with the existing dst rows. */
5668  v16u8 inp0, inp1, inp2, inp3;
5669  v16u8 res0, res1, avg0, avg1;
5670  v16u8 horiz0, horiz1, horiz2, horiz3;
5671  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5672  v16u8 dst0, dst1;
5673  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5674  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5675  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5676  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5677  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5678  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5679  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5680 
 /* horizontal pass, rows 0-1 */
5681  LD_UB2(src, src_stride, inp0, inp1);
5682  src += (2 * src_stride);
5683  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5684  const20, const6, const3);
5685  LD_UB2(src, src_stride, inp2, inp3);
5686  src += (2 * src_stride);
5687  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5688 
5689  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5690  horiz0 = __msa_aver_u_b(inp0, res0);
5691  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
 /* rows 2-3 */
5692  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5693  const20, const6, const3);
5694  LD_UB2(src, src_stride, inp0, inp1);
5695  src += (2 * src_stride);
5696  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5697 
5698  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5699  horiz2 = __msa_aver_u_b(inp2, res1);
5700  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
 /* rows 4-5 */
5701  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5702  const20, const6, const3);
5703  SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5704 
5705  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5706  horiz4 = __msa_aver_u_b(inp0, res0);
5707  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
 /* vertical pass, output rows 0-1: average with horizontal rows 1-2, then dst */
5708  LD_UB2(dst, dst_stride, dst0, dst1);
5709  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5710  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
5711  horiz2, horiz3, horiz4, horiz1, horiz0,
5712  horiz0, horiz1, horiz2, horiz3, horiz4,
5713  horiz5, const20, const6, const3);
5714  res0 = __msa_aver_u_b(avg0, res0);
5715  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5716  res0 = __msa_aver_u_b(avg0, res0);
5717  ST_D2(res0, 0, 1, dst, dst_stride);
5718  dst += (2 * dst_stride);
5719 
 /* horizontal rows 6-7, then output rows 2-3 */
5720  LD_UB2(src, src_stride, inp2, inp3);
5721  src += (2 * src_stride);
5722  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5723  const20, const6, const3);
5724  SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5725 
5726  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5727  horiz6 = __msa_aver_u_b(inp2, res1);
5728  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5729  LD_UB2(dst, dst_stride, dst0, dst1);
5730  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5731  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
5732  horiz4, horiz5, horiz6, horiz3, horiz2,
5733  horiz1, horiz0, horiz4, horiz5, horiz6,
5734  horiz7, const20, const6, const3);
5735  res1 = __msa_aver_u_b(avg1, res1);
5736  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5737  res1 = __msa_aver_u_b(avg1, res1);
5738  ST_D2(res1, 0, 1, dst, dst_stride);
5739  dst += (2 * dst_stride);
5740 
 /* horizontal row 8 (single row), then output rows 4-5 */
5741  inp0 = LD_UB(src);
5742  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5743  const20, const6, const3);
5744  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5745  horiz8 = __msa_aver_u_b(inp0, res0);
5746  LD_UB2(dst, dst_stride, dst0, dst1);
5747  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5748  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5749  horiz6, horiz7, horiz8, horiz5, horiz4,
5750  horiz3, horiz2, horiz6, horiz7, horiz8,
5751  horiz8, const20, const6, const3);
5752  res0 = __msa_aver_u_b(avg0, res0);
5753  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5754  res0 = __msa_aver_u_b(avg0, res0);
5755  ST_D2(res0, 0, 1, dst, dst_stride);
5756  dst += (2 * dst_stride);
5757 
 /* output rows 6-7 */
5758  LD_UB2(dst, dst_stride, dst0, dst1);
5759  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5760  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5761  horiz8, horiz8, horiz7, horiz7, horiz6,
5762  horiz5, horiz4, horiz8, horiz8, horiz7,
5763  horiz6, const20, const6, const3);
5764  res1 = __msa_aver_u_b(avg1, res1);
5765  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5766  res1 = __msa_aver_u_b(avg1, res1);
5767  ST_D2(res1, 0, 1, dst, dst_stride);
5768 }
5769 
/*
 * Copy an 8x8 byte block from src to dst.
 *
 * Rows are moved as 64-bit words, two rows per iteration, four iterations
 * in total. Both rows of a pair are read before either is written.
 */
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride)
{
    int32_t pairs_left = 4;

    while (pairs_left--) {
        uint64_t upper_row, lower_row;

        upper_row = LD(src);
        src += src_stride;
        lower_row = LD(src);
        src += src_stride;

        SD(upper_row, dst);
        dst += dst_stride;
        SD(lower_row, dst);
        dst += dst_stride;
    }
}
5788 
5789 static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
5790  uint8_t *dst, int32_t dst_stride)
5791 {
5792  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5793  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
5794 
5795  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5796  src += (8 * src_stride);
5797  LD_UB8(src, src_stride,
5798  src8, src9, src10, src11, src12, src13, src14, src15);
5799 
5800  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5801  dst += (8 * dst_stride);
5802  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
5803  dst, dst_stride);
5804 }
5805 
5806 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
5807  uint8_t *dst, int32_t dst_stride,
5808  int32_t height)
5809 {
5810  int32_t cnt;
5811  uint64_t out0, out1, out2, out3;
5812  v16u8 src0, src1, src2, src3;
5813  v16u8 dst0, dst1, dst2, dst3;
5814 
5815  for (cnt = (height / 4); cnt--;) {
5816  LD_UB4(src, src_stride, src0, src1, src2, src3);
5817  src += (4 * src_stride);
5818  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
5819 
5820  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5821  dst0, dst1, dst2, dst3);
5822 
5823  out0 = __msa_copy_u_d((v2i64) dst0, 0);
5824  out1 = __msa_copy_u_d((v2i64) dst1, 0);
5825  out2 = __msa_copy_u_d((v2i64) dst2, 0);
5826  out3 = __msa_copy_u_d((v2i64) dst3, 0);
5827  SD4(out0, out1, out2, out3, dst, dst_stride);
5828  dst += (4 * dst_stride);
5829  }
5830 }
5831 
5832 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
5833  uint8_t *dst, int32_t dst_stride,
5834  int32_t height)
5835 {
5836  int32_t cnt;
5837  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5838  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5839 
5840  for (cnt = (height / 8); cnt--;) {
5841  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5842  src += (8 * src_stride);
5843  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5844 
5845  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5846  dst0, dst1, dst2, dst3);
5847  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5848  dst4, dst5, dst6, dst7);
5849  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5850  dst += (8 * dst_stride);
5851  }
5852 }
5853 
/*
 * Exported 16x16 block copy.
 * Adapts the (dest, src, stride) argument order to the static helper and
 * uses the one stride for both source and destination.
 */
void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_16x16_msa(src, stride, dest, stride);
}
5858 
/*
 * Exported 8x8 block copy.
 * Adapts the (dest, src, stride) argument order to the static helper and
 * uses the one stride for both source and destination.
 */
void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_8x8_msa(src, stride, dest, stride);
}
5863 
5865  const uint8_t *src,
5866  ptrdiff_t stride)
5867 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 8 is presumably the row count --
  * confirm against the helper. */
5868  horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
5869 }
5870 
5872  const uint8_t *src,
5873  ptrdiff_t stride)
5874 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 16 is presumably the row count --
  * confirm against the helper. */
5875  horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
5876 }
5877 
5879  ptrdiff_t stride)
5880 {
 /* NOTE(review): the declarator line (return type, name, leading parameters)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 8 is presumably the row count --
  * confirm against the helper. */
5881  horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
5882 }
5883 
5885  const uint8_t *src, ptrdiff_t stride)
5886 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 16 is presumably the row count --
  * confirm against the helper. */
5887  horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
5888 }
5889 
5891  const uint8_t *src,
5892  ptrdiff_t stride)
5893 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 8 is presumably the row count --
  * confirm against the helper. */
5894  horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
5895 }
5896 
5898  const uint8_t *src,
5899  ptrdiff_t stride)
5900 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static helper with stride
  * in both stride positions; the final 16 is presumably the row count --
  * confirm against the helper. */
5901  horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
5902 }
5903 
5905  const uint8_t *src,
5906  ptrdiff_t stride)
5907 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 8 is presumably the row count
  * -- confirm against the helper. */
5908  horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
5909 }
5910 
5912  const uint8_t *src,
5913  ptrdiff_t stride)
5914 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 16 is presumably the row count
  * -- confirm against the helper. */
5915  horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
5916 }
5917 
5919  const uint8_t *src, ptrdiff_t stride)
5920 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 8 is presumably the row count
  * -- confirm against the helper. */
5921  horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
5922 }
5923 
5925  const uint8_t *src, ptrdiff_t stride)
5926 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 16 is presumably the row count
  * -- confirm against the helper. */
5927  horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
5928 }
5929 
5931  const uint8_t *src,
5932  ptrdiff_t stride)
5933 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 8 is presumably the row count
  * -- confirm against the helper. */
5934  horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
5935 }
5936 
5938  const uint8_t *src,
5939  ptrdiff_t stride)
5940 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" helper with
  * stride in both stride positions; the final 16 is presumably the row count
  * -- confirm against the helper. */
5941  horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
5942 }
5943 
/*
 * Exported 8-wide average of src into dest over 8 rows.
 * Uses the one stride for both source and destination.
 */
void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width8_msa(src, stride, dest, stride, 8);
}
5948 
/*
 * Exported 16-wide average of src into dest over 16 rows.
 * Uses the one stride for both source and destination.
 */
void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width16_msa(src, stride, dest, stride, 16);
}
5953 
5955  const uint8_t *src,
5956  ptrdiff_t stride)
5957 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 8 is presumably the row
  * count -- confirm against the helper. */
5958  horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
5959 }
5960 
5962  const uint8_t *src,
5963  ptrdiff_t stride)
5964 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 16 is presumably the row
  * count -- confirm against the helper. */
5965  horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
5966 }
5967 
5969  const uint8_t *src, ptrdiff_t stride)
5970 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 8 is presumably the row
  * count -- confirm against the helper. */
5971  horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
5972 }
5973 
5975  const uint8_t *src, ptrdiff_t stride)
5976 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 16 is presumably the row
  * count -- confirm against the helper. */
5977  horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
5978 }
5979 
5981  const uint8_t *src,
5982  ptrdiff_t stride)
5983 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 8 is presumably the row
  * count -- confirm against the helper. */
5984  horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
5985 }
5986 
5988  const uint8_t *src,
5989  ptrdiff_t stride)
5990 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" helper
  * with stride in both stride positions; the final 16 is presumably the row
  * count -- confirm against the helper. */
5991  horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
5992 }
5993 
5994 
5996  const uint8_t *src, ptrdiff_t stride)
5997 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
5998  vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
5999 }
6000 
6002  const uint8_t *src, ptrdiff_t stride)
6003 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
6004  vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
6005 }
6006 
6008  ptrdiff_t stride)
6009 {
 /* NOTE(review): the declarator line (return type, name, leading parameters)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
6010  vert_mc_qpel_8x8_msa(src, stride, dest, stride);
6011 }
6012 
6014  ptrdiff_t stride)
6015 {
 /* NOTE(review): the declarator line (return type, name, leading parameters)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
6016  vert_mc_qpel_16x16_msa(src, stride, dest, stride);
6017 }
6018 
6020  const uint8_t *src, ptrdiff_t stride)
6021 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
6022  vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
6023 }
6024 
6026  const uint8_t *src, ptrdiff_t stride)
6027 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static vertical helper,
  * passing stride in both stride positions. */
6028  vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
6029 }
6030 
6032  const uint8_t *src,
6033  ptrdiff_t stride)
6034 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6035  vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
6036 }
6037 
6039  const uint8_t *src,
6040  ptrdiff_t stride)
6041 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6042  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
6043 }
6044 
6046  const uint8_t *src, ptrdiff_t stride)
6047 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6048  vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
6049 }
6050 
6052  const uint8_t *src, ptrdiff_t stride)
6053 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6054  vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
6055 }
6056 
6058  const uint8_t *src,
6059  ptrdiff_t stride)
6060 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6061  vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
6062 }
6063 
6065  const uint8_t *src,
6066  ptrdiff_t stride)
6067 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "no_rnd" vertical
  * helper, passing stride in both stride positions. */
6068  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
6069 }
6070 
6072  const uint8_t *src,
6073  ptrdiff_t stride)
6074 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6075  vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
6076 }
6077 
6079  const uint8_t *src,
6080  ptrdiff_t stride)
6081 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6082  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
6083 }
6084 
6086  const uint8_t *src, ptrdiff_t stride)
6087 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6088  vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
6089 }
6090 
6092  const uint8_t *src, ptrdiff_t stride)
6093 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6094  vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
6095 }
6096 
6098  const uint8_t *src,
6099  ptrdiff_t stride)
6100 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6101  vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
6102 }
6103 
6105  const uint8_t *src,
6106  ptrdiff_t stride)
6107 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static "avg_dst" vertical
  * helper, passing stride in both stride positions. */
6108  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
6109 }
6110 
6111 /* HV cases */
6113  const uint8_t *src,
6114  ptrdiff_t stride)
6115 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6116  hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
6117 }
6118 
6120  const uint8_t *src, ptrdiff_t stride)
6121 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6122  hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
6123 }
6124 
6126  const uint8_t *src, ptrdiff_t stride)
6127 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6128  hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
6129 }
6130 
6132  const uint8_t *src, ptrdiff_t stride)
6133 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6134  hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
6135 }
6136 
6138  const uint8_t *src,
6139  ptrdiff_t stride)
6140 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6141  hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
6142 }
6143 
6145  const uint8_t *src, ptrdiff_t stride)
6146 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6147  hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
6148 }
6149 
6151  const uint8_t *src, ptrdiff_t stride)
6152 {
 /* NOTE(review): the declarator line (return type, name, leading parameter)
  * is missing from this listing. Forwards to the static two-pass (hv)
  * helper, passing stride in both stride positions. */
6153  hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
6154 }
6155