hevc_mc_uni_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

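/* Shuffle-control masks for the MSA VSHF.B instruction.  The first 16 bytes
 * produce the overlapping byte pairs consumed by the 8-pixel-wide filters;
 * the second and third rows (indices >= 16 select bytes from the second
 * source operand) pack two 4-pixel-wide rows into a single vector. */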
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

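/* 8-tap horizontal filter for four 4-pixel-wide rows, processed in pairs
 * (src0/src1, src2/src3).  Each pair of taps is gathered with a byte shuffle
 * and accumulated with signed dot-product instructions.  As an illustrative
 * scalar sketch (not the literal implementation), each output sample is
 * effectively:
 *
 *     sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += src[x + k] * filter[k];   // src already offset by -3
 *     dst[x] = av_clip_uint8((sum + 32) >> 6);
 *
 * with the rounding shift and clip applied later by the callers via
 * SRARI/SAT/PCKEV. */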
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

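/* Same 8-tap horizontal filter for four 8-pixel-wide rows; each row keeps its
 * own v8i16 accumulator.  The taps are applied in 0, 2, 1, 3 order, presumably
 * so independent shuffle results can overlap with the accumulate chain. */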
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

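/* 4-tap variants of the two macros above: identical structure with only two
 * tap pairs (filt0, filt1). */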
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,            \
                                   mask0, mask1, filt0, filt1,        \
                                   out0, out1)                        \
{                                                                     \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                             \
                                                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);           \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

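/* Plain block copies (unfiltered motion compensation).  copy_width8_msa moves
 * an 8-byte-wide column of `height` rows with 64-bit scalar loads/stores; the
 * special cases keep the common HEVC heights (2, 6, multiples of 4 and 8) on
 * short code paths. */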
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

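/* 12-byte-wide copy; ST12x8_UB stores 8 + 4 bytes per row.  The function
 * always copies 16 rows (two 8-row batches), so the height argument is
 * effectively unused. */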
static void copy_width12_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

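/* 16-byte-wide copy, one full vector per row. */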
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

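/* 24-byte-wide copy: a 16-byte vector plus an 8-byte scalar store per row.
 * The loop count is hard-coded to 4 x 8 = 32 rows. */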
static void copy_width24_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

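/* 32-byte-wide copy, two vectors per row, four rows per iteration. */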
static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

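/* 48-byte-wide copy, three vectors per row. */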
static void copy_width48_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

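/* 64-byte-wide copy, four vectors per row (loaded and stored at a fixed
 * 16-byte stride within the row). */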
static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

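/* Horizontal 8-tap luma interpolation, 4-pixel-wide block, height 4.
 * Sources are biased with xori 128 so the unsigned pixels can use signed dot
 * products; PCKEV_XORI128_UB packs the results and removes the bias.  The
 * 4x8 and 4x16 variants below unroll the same core two and four times. */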
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

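/* Dispatch on height for the 4-pixel-wide horizontal case. */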
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

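/* 8-pixel-wide horizontal 8-tap filter, four rows per iteration.  The loop
 * body is HORIZ_8TAP_8WID_4VECS_FILT expanded inline. */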
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

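/* 12-pixel-wide horizontal filter: an 8-wide pass (mask00..mask3) plus a
 * packed 4-wide pass (mask0, mask4..mask6) for the remaining columns.
 * loop_cnt = 4 with four rows per iteration, i.e. height 16. */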
static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

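/* 16-pixel-wide horizontal filter; each row is split into two 8-wide halves
 * loaded from src and src + 8. */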
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

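/* 24-pixel-wide horizontal filter: 16 columns via the single-vector
 * mask0..mask3 shuffles plus 8 columns spanning the vector boundary via the
 * two-operand mask4..mask7 shuffles.  loop_cnt = 16 with two rows per
 * iteration, i.e. height 32. */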
static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

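/* 32-pixel-wide horizontal filter, two rows per iteration, four 8-wide
 * vectors per row. */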
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

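/* 48-pixel-wide horizontal filter, one row per iteration (loop_cnt = 64, so
 * height is fixed at 64); VSHF_B3_SB/DOTP_SB3_SH process three 8-wide groups
 * at a time, and mask4..mask7 handle the samples spanning vector boundaries. */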
static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

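/* 64-pixel-wide horizontal filter, one row per iteration, eight 8-wide
 * vectors per row. */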
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

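/* Vertical 8-tap filter, 4-pixel-wide block.  Source rows are interleaved
 * with ILVR_B/ILVR_D so that each dot product covers two taps of two packed
 * output rows; eight output rows are produced per iteration and the trailing
 * seven input rows are carried over in packed form. */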
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

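/* Vertical 8-tap filter, 8-pixel-wide block, four rows per iteration; the
 * seven-row sliding window is carried between iterations in the srcNM_r
 * interleaved registers. */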
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

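/* Vertical 8-tap filter, 12-pixel-wide block: filters a full 16-wide column
 * (right and left byte interleaves) and stores 8 + 4 bytes per row.
 * loop_cnt = 4 with four rows per iteration, i.e. height 16. */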
static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

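/* Vertical 8-tap filter, 16-pixel-wide block, four rows per iteration. */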
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

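/* Vertical 8-tap filter for any multiple-of-16 width: the 16-wide core above
 * repeated across width / 16 column strips. */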
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

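/* Width 24/32/48/64 vertical cases expressed through the multi-strip core;
 * the 24-wide case adds an 8-wide pass for the tail columns. */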
static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

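/* 2-D (horizontal then vertical) 8-tap filter, 4-pixel-wide block.  Each
 * source row is filtered horizontally into 16-bit intermediates (two 4-wide
 * rows packed per vector), then the vertical 8-tap runs on the interleaved
 * intermediates at 32-bit precision before the final round and clip. */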
1335 static void hevc_hv_uni_8t_4w_msa(const uint8_t *src,
1336  int32_t src_stride,
1337  uint8_t *dst,
1338  int32_t dst_stride,
1339  const int8_t *filter_x,
1340  const int8_t *filter_y,
1341  int32_t height)
1342 {
1343  uint32_t loop_cnt;
1344  v16u8 out0, out1;
1345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346  v16i8 src9, src10, src11, src12, src13, src14;
1347  v8i16 filt0, filt1, filt2, filt3;
1348  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349  v16i8 mask1, mask2, mask3;
1350  v8i16 filter_vec;
1351  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1357  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1358 
1359  src -= ((3 * src_stride) + 3);
1360  filter_vec = LD_SH(filter_x);
1361  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1362 
1363  filter_vec = LD_SH(filter_y);
1364  UNPCK_R_SB_SH(filter_vec, filter_vec);
1365 
1366  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1367 
1368  mask1 = mask0 + 2;
1369  mask2 = mask0 + 4;
1370  mask3 = mask0 + 6;
1371 
1372  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373  src += (7 * src_stride);
1374  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1375 
1376  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379  vec8, vec9, vec10, vec11);
1380  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381  vec12, vec13, vec14, vec15);
1382 
1383  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1384  filt3);
1385  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1386  filt3);
1387  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1388  filt3);
1389  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1390  filt3);
1391 
1392  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1393  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1394  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1395 
1396  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1397 
1398  for (loop_cnt = height >> 3; loop_cnt--;) {
1399  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1400  src14);
1401  src += (8 * src_stride);
1402  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1403 
1404  VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405  vec0, vec1, vec2, vec3);
1406  VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407  vec4, vec5, vec6, vec7);
1408  VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409  vec8, vec9, vec10, vec11);
1410  VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411  vec12, vec13, vec14, vec15);
1412 
1413  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1414  filt3);
1415  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1416  filt3);
1417  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1418  filt2, filt3);
1419  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1420  filt2, filt3);
1421 
1422  dst76_r = __msa_ilvr_h(dst117, dst66);
1423  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1424  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1425  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1426  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1428 
1429  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430  filt_h1, filt_h2, filt_h3);
1431  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432  filt_h1, filt_h2, filt_h3);
1433  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434  filt_h1, filt_h2, filt_h3);
1435  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436  filt_h1, filt_h2, filt_h3);
1437  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438  filt_h1, filt_h2, filt_h3);
1439  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440  filt_h1, filt_h2, filt_h3);
1441  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442  filt_h1, filt_h2, filt_h3);
1443  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1444  filt_h0, filt_h1, filt_h2, filt_h3);
1445 
1446  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1448  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1449  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1454  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1455  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1456  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457  dst += (8 * dst_stride);
1458 
1459  dst10_r = dst98_r;
1460  dst32_r = dst1110_r;
1461  dst54_r = dst1312_r;
1462  dst21_r = dst109_r;
1463  dst43_r = dst1211_r;
1464  dst65_r = dst1413_r;
1465  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1466  }
1467 }
1468 
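/* HV 8-tap worker for widths that are multiples of 8.  The outer loop
 * walks the width in 8-column strips; the inner loop emits 2 rows per
 * iteration.  The horizontal 8-tap produces 16-bit intermediates, the
 * vertical 8-tap runs on their right/left interleaves, and the 32-bit
 * sums are scaled, rounded, saturated and packed back to unsigned 8-bit.
 * Scalar view of one output pixel (illustrative only, not the MSA path):
 *
 *     h[j] = sum_i fx[i] * (p[j][x - 3 + i] - 128);
 *     s    = sum_j fy[j] * h[j];
 *     out  = av_clip_uint8((((s >> 6) + 32) >> 6) + 128);
 */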
1469 static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src,
1470  int32_t src_stride,
1471  uint8_t *dst,
1472  int32_t dst_stride,
1473  const int8_t *filter_x,
1474  const int8_t *filter_y,
1475  int32_t height, int32_t width)
1476 {
1477  uint32_t loop_cnt, cnt;
1478  const uint8_t *src_tmp;
1479  uint8_t *dst_tmp;
1480  v16u8 out;
1481  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1482  v8i16 filt0, filt1, filt2, filt3;
1483  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484  v16i8 mask1, mask2, mask3;
1485  v8i16 filter_vec;
1486  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1494  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1495 
1496  src -= ((3 * src_stride) + 3);
1497 
1498  filter_vec = LD_SH(filter_x);
1499  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500 
1501  filter_vec = LD_SH(filter_y);
1502  UNPCK_R_SB_SH(filter_vec, filter_vec);
1503 
1504  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1505 
1506  mask1 = mask0 + 2;
1507  mask2 = mask0 + 4;
1508  mask3 = mask0 + 6;
1509 
1510  for (cnt = width >> 3; cnt--;) {
1511  src_tmp = src;
1512  dst_tmp = dst;
1513 
1514  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515  src_tmp += (7 * src_stride);
1516  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1517 
1518  /* row 0 row 1 row 2 row 3 */
1519  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520  vec0, vec1, vec2, vec3);
1521  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522  vec4, vec5, vec6, vec7);
1523  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524  vec8, vec9, vec10, vec11);
1525  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526  vec12, vec13, vec14, vec15);
1527  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1528  filt3);
1529  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1530  filt3);
1531  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1532  filt3);
1533  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1534  filt2, filt3);
1535 
1536  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537  vec0, vec1, vec2, vec3);
1538  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539  vec4, vec5, vec6, vec7);
1540  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541  vec8, vec9, vec10, vec11);
1542  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1543  filt3);
1544  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1545  filt3);
1546  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1547  filt3);
1548 
1549  for (loop_cnt = height >> 1; loop_cnt--;) {
1550  LD_SB2(src_tmp, src_stride, src7, src8);
1551  XORI_B2_128_SB(src7, src8);
1552  src_tmp += 2 * src_stride;
1553 
1554  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555  dst10_r, dst32_r, dst54_r, dst21_r);
1556  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557  dst10_l, dst32_l, dst54_l, dst21_l);
1558  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1560 
1561  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562  vec0, vec1, vec2, vec3);
1563  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1564  filt2, filt3);
1565 
1566  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1567  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1568  filt_h0, filt_h1, filt_h2, filt_h3);
1569  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1570  filt_h0, filt_h1, filt_h2, filt_h3);
1571  dst0_r >>= 6;
1572  dst0_l >>= 6;
1573 
1574  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575  vec0, vec1, vec2, vec3);
1576  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1577  filt2, filt3);
1578 
1579  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1580  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1581  filt_h0, filt_h1, filt_h2, filt_h3);
1582  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1583  filt_h0, filt_h1, filt_h2, filt_h3);
1584  dst1_r >>= 6;
1585  dst1_l >>= 6;
1586  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1587  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1588 
1589  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1590  out = PCKEV_XORI128_UB(dst0, dst1);
1591  ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592  dst_tmp += (2 * dst_stride);
1593 
1594  dst0 = dst2;
1595  dst1 = dst3;
1596  dst2 = dst4;
1597  dst3 = dst5;
1598  dst4 = dst6;
1599  dst5 = dst7;
1600  dst6 = dst8;
1601  }
1602 
1603  src += 8;
1604  dst += 8;
1605  }
1606 }
1607 
1608 static void hevc_hv_uni_8t_8w_msa(const uint8_t *src,
1609  int32_t src_stride,
1610  uint8_t *dst,
1611  int32_t dst_stride,
1612  const int8_t *filter_x,
1613  const int8_t *filter_y,
1614  int32_t height)
1615 {
1616  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1617  filter_x, filter_y, height, 8);
1618 }
1619 
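/* 12-wide HV 8-tap: the left 8 columns follow the same flow as the
 * strip worker above; the right 4 columns are then filtered with the
 * paired-source masks (mask4..mask7), two rows per register. */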
1620 static void hevc_hv_uni_8t_12w_msa(const uint8_t *src,
1621  int32_t src_stride,
1622  uint8_t *dst,
1623  int32_t dst_stride,
1624  const int8_t *filter_x,
1625  const int8_t *filter_y,
1626  int32_t height)
1627 {
1628  uint32_t loop_cnt;
1629  const uint8_t *src_tmp;
1630  uint8_t *dst_tmp;
1631  v16u8 out0, out1;
1632  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1633  v16i8 src11, src12, src13, src14;
1634  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1635  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1636  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1637  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1638  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1639  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1640  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1641  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1642  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1643  v8i16 dst1413_r, dst87_l, filter_vec;
1644  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1645  v4i32 dst0_l, dst1_l;
1646 
1647  src -= ((3 * src_stride) + 3);
1648 
1649  filter_vec = LD_SH(filter_x);
1650  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1651 
1652  filter_vec = LD_SH(filter_y);
1653  UNPCK_R_SB_SH(filter_vec, filter_vec);
1654 
1655  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1656 
1657  mask0 = LD_SB(ff_hevc_mask_arr);
1658  mask1 = mask0 + 2;
1659  mask2 = mask0 + 4;
1660  mask3 = mask0 + 6;
1661 
1662  src_tmp = src;
1663  dst_tmp = dst;
1664 
1665  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1666  src_tmp += (7 * src_stride);
1667  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1668 
1669  /* row 0 row 1 row 2 row 3 */
1670  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1671  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1672  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1673  vec11);
1674  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1675  vec15);
1676  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1677  filt3);
1678  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1679  filt3);
1680  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1681  filt3);
1682  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1683  filt2, filt3);
1684 
1685  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1686  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1687  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1688  vec11);
1689  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1690  filt3);
1691  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1692  filt3);
1693  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1694  filt3);
1695 
1696  for (loop_cnt = 8; loop_cnt--;) {
1697  LD_SB2(src_tmp, src_stride, src7, src8);
1698  XORI_B2_128_SB(src7, src8);
1699  src_tmp += 2 * src_stride;
1700 
1701  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1702  dst32_r, dst54_r, dst21_r);
1703  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1704  dst32_l, dst54_l, dst21_l);
1705  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1706  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1707 
1708  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1709  vec3);
1710  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1711  filt3);
1712 
1713  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1714  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1715  filt_h0, filt_h1, filt_h2, filt_h3);
1716  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1717  filt_h0, filt_h1, filt_h2, filt_h3);
1718  dst0_r >>= 6;
1719  dst0_l >>= 6;
1720 
1721  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1722  vec3);
1723  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1724  filt3);
1725 
1726  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1727  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1728  filt_h0, filt_h1, filt_h2, filt_h3);
1729  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1730  filt_h0, filt_h1, filt_h2, filt_h3);
1731  dst1_r >>= 6;
1732  dst1_l >>= 6;
1733  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1734  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1735 
1736  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1737  out0 = PCKEV_XORI128_UB(dst0, dst1);
1738  ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1739  dst_tmp += (2 * dst_stride);
1740 
1741  dst0 = dst2;
1742  dst1 = dst3;
1743  dst2 = dst4;
1744  dst3 = dst5;
1745  dst4 = dst6;
1746  dst5 = dst7;
1747  dst6 = dst8;
1748  }
1749 
1750  src += 8;
1751  dst += 8;
1752 
1753  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1754  mask5 = mask4 + 2;
1755  mask6 = mask4 + 4;
1756  mask7 = mask4 + 6;
1757 
1758  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1759  src += (7 * src_stride);
1760  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1761 
1762  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1763  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1764  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1765  vec11);
1766  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1767  vec15);
1768 
1769  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1770  filt3);
1771  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1772  filt3);
1773  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1774  filt3);
1775  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1776  filt3);
1777 
1778  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1779  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1780  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1781 
1782  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1783 
1784  for (loop_cnt = 2; loop_cnt--;) {
1785  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1786  src14);
1787  src += (8 * src_stride);
1788  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1789 
1790  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1791  vec3);
1792  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1793  vec7);
1794  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1795  vec11);
1796  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1797  vec14, vec15);
1798 
1799  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1800  filt3);
1801  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1802  filt3);
1803  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1804  filt2, filt3);
1805  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1806  filt2, filt3);
1807 
1808  dst76_r = __msa_ilvr_h(dst117, dst66);
1809  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1810  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1811  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1812  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1813  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1814 
1815  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1816  filt_h1, filt_h2, filt_h3);
1817  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1818  filt_h1, filt_h2, filt_h3);
1819  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1820  filt_h1, filt_h2, filt_h3);
1821  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1822  filt_h1, filt_h2, filt_h3);
1823  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1824  filt_h1, filt_h2, filt_h3);
1825  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1826  filt_h1, filt_h2, filt_h3);
1827  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1828  filt_h1, filt_h2, filt_h3);
1829  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1830  filt_h0, filt_h1, filt_h2, filt_h3);
1831 
1832  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1833  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1834  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1835  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1836  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1837  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1838  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1839  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1840  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1841  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1842  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1843  dst += (8 * dst_stride);
1844 
1845  dst10_r = dst98_r;
1846  dst32_r = dst1110_r;
1847  dst54_r = dst1312_r;
1848  dst21_r = dst109_r;
1849  dst43_r = dst1211_r;
1850  dst65_r = dst1413_r;
1851  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1852  }
1853 }
1854 
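/* The remaining HV 8-tap widths simply delegate to the 8-column-strip
 * worker. */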
1855 static void hevc_hv_uni_8t_16w_msa(const uint8_t *src,
1856  int32_t src_stride,
1857  uint8_t *dst,
1858  int32_t dst_stride,
1859  const int8_t *filter_x,
1860  const int8_t *filter_y,
1861  int32_t height)
1862 {
1863  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1864  filter_x, filter_y, height, 16);
1865 }
1866 
1867 static void hevc_hv_uni_8t_24w_msa(const uint8_t *src,
1868  int32_t src_stride,
1869  uint8_t *dst,
1870  int32_t dst_stride,
1871  const int8_t *filter_x,
1872  const int8_t *filter_y,
1873  int32_t height)
1874 {
1875  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1876  filter_x, filter_y, height, 24);
1877 }
1878 
1879 static void hevc_hv_uni_8t_32w_msa(const uint8_t *src,
1880  int32_t src_stride,
1881  uint8_t *dst,
1882  int32_t dst_stride,
1883  const int8_t *filter_x,
1884  const int8_t *filter_y,
1885  int32_t height)
1886 {
1887  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1888  filter_x, filter_y, height, 32);
1889 }
1890 
1891 static void hevc_hv_uni_8t_48w_msa(const uint8_t *src,
1892  int32_t src_stride,
1893  uint8_t *dst,
1894  int32_t dst_stride,
1895  const int8_t *filter_x,
1896  const int8_t *filter_y,
1897  int32_t height)
1898 {
1899  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1900  filter_x, filter_y, height, 48);
1901 }
1902 
1903 static void hevc_hv_uni_8t_64w_msa(const uint8_t *src,
1904  int32_t src_stride,
1905  uint8_t *dst,
1906  int32_t dst_stride,
1907  const int8_t *filter_x,
1908  const int8_t *filter_y,
1909  int32_t height)
1910 {
1911  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1912  filter_x, filter_y, height, 64);
1913 }
1914 
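/* common_hz_4t_*: 4-tap horizontal (EPEL-style) filters for the
 * uni-prediction path.  Sources are biased to signed range (XOR 128),
 * dot-multiplied against the taps, rounded by 6 bits, saturated and
 * packed back to unsigned bytes.  Scalar view of one output pixel
 * (illustrative only, not the MSA path):
 *
 *     sum    = f[0] * p[x - 1] + f[1] * p[x]
 *            + f[2] * p[x + 1] + f[3] * p[x + 2];
 *     out[x] = av_clip_uint8((sum + 32) >> 6);
 */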
1915 static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
1916  uint8_t *dst, int32_t dst_stride,
1917  const int8_t *filter)
1918 {
1919  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1920  v16u8 out;
1921  v8i16 filt, res0;
1922 
1923  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1924  src -= 1;
1925 
1926  /* rearranging filter */
1927  filt = LD_SH(filter);
1928  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1929 
1930  mask1 = mask0 + 2;
1931 
1932  LD_SB2(src, src_stride, src0, src1);
1933  XORI_B2_128_SB(src0, src1);
1934  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1935  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1936  res0 = __msa_srari_h(res0, 6);
1937  res0 = __msa_sat_s_h(res0, 7);
1938  out = PCKEV_XORI128_UB(res0, res0);
1939  ST_W2(out, 0, 1, dst, dst_stride);
1940 }
1941 
1942 static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride,
1943  uint8_t *dst, int32_t dst_stride,
1944  const int8_t *filter)
1945 {
1946  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1947  v8i16 filt, out0, out1;
1948  v16u8 out;
1949 
1950  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1951  src -= 1;
1952 
1953  /* rearranging filter */
1954  filt = LD_SH(filter);
1955  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1956 
1957  mask1 = mask0 + 2;
1958 
1959  LD_SB4(src, src_stride, src0, src1, src2, src3);
1960  XORI_B4_128_SB(src0, src1, src2, src3);
1961  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1962  filt0, filt1, out0, out1);
1963  SRARI_H2_SH(out0, out1, 6);
1964  SAT_SH2_SH(out0, out1, 7);
1965  out = PCKEV_XORI128_UB(out0, out1);
1966  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1967 }
1968 
1969 static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride,
1970  uint8_t *dst, int32_t dst_stride,
1971  const int8_t *filter)
1972 {
1973  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1974  v16u8 out;
1975  v8i16 filt, out0, out1, out2, out3;
1976 
1977  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1978  src -= 1;
1979 
1980  /* rearranging filter */
1981  filt = LD_SH(filter);
1982  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1983 
1984  mask1 = mask0 + 2;
1985 
1986  LD_SB4(src, src_stride, src0, src1, src2, src3);
1987  src += (4 * src_stride);
1988 
1989  XORI_B4_128_SB(src0, src1, src2, src3);
1990  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1991  filt0, filt1, out0, out1);
1992  LD_SB4(src, src_stride, src0, src1, src2, src3);
1993  XORI_B4_128_SB(src0, src1, src2, src3);
1994  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1995  filt0, filt1, out2, out3);
1996  SRARI_H4_SH(out0, out1, out2, out3, 6);
1997  SAT_SH4_SH(out0, out1, out2, out3, 7);
1998  out = PCKEV_XORI128_UB(out0, out1);
1999  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2000  out = PCKEV_XORI128_UB(out2, out3);
2001  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2002 }
2003 
2004 static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride,
2005  uint8_t *dst, int32_t dst_stride,
2006  const int8_t *filter)
2007 {
2008  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2009  v16i8 filt0, filt1, mask0, mask1;
2010  v16u8 out;
2011  v8i16 filt, out0, out1, out2, out3;
2012 
2013  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2014  src -= 1;
2015 
2016  /* rearranging filter */
2017  filt = LD_SH(filter);
2018  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2019 
2020  mask1 = mask0 + 2;
2021 
2022  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2023  src += (8 * src_stride);
2024  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2025  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2026  filt0, filt1, out0, out1);
2027  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2028  filt0, filt1, out2, out3);
2029  SRARI_H4_SH(out0, out1, out2, out3, 6);
2030  SAT_SH4_SH(out0, out1, out2, out3, 7);
2031  out = PCKEV_XORI128_UB(out0, out1);
2032  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2033  out = PCKEV_XORI128_UB(out2, out3);
2034  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2035  dst += (8 * dst_stride);
2036 
2037  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2038  src += (8 * src_stride);
2039  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2040  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2041  filt0, filt1, out0, out1);
2042  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2043  filt0, filt1, out2, out3);
2044  SRARI_H4_SH(out0, out1, out2, out3, 6);
2045  SAT_SH4_SH(out0, out1, out2, out3, 7);
2046  out = PCKEV_XORI128_UB(out0, out1);
2047  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2048  out = PCKEV_XORI128_UB(out2, out3);
2049  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2050 }
2051 
2052 static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2053  uint8_t *dst, int32_t dst_stride,
2054  const int8_t *filter, int32_t height)
2055 {
2056  if (2 == height) {
2057  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2058  } else if (4 == height) {
2059  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2060  } else if (8 == height) {
2061  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2062  } else if (16 == height) {
2063  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2064  }
2065 }
2066 
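/* 6-wide output has no native store width: each row is written as one
 * 32-bit word (ST_W2) plus one 16-bit halfword at offset 4 (ST_H2). */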
2067 static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2068  uint8_t *dst, int32_t dst_stride,
2069  const int8_t *filter, int32_t height)
2070 {
2071  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2072  v16u8 out4, out5;
2073  v8i16 filt, out0, out1, out2, out3;
2074 
2075  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2076  src -= 1;
2077 
2078  /* rearranging filter */
2079  filt = LD_SH(filter);
2080  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2081 
2082  mask1 = mask0 + 2;
2083 
2084  LD_SB4(src, src_stride, src0, src1, src2, src3);
2085  src += (4 * src_stride);
2086 
2087  XORI_B4_128_SB(src0, src1, src2, src3);
2088  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2089  filt1, out0, out1, out2, out3);
2090  SRARI_H4_SH(out0, out1, out2, out3, 6);
2091  SAT_SH4_SH(out0, out1, out2, out3, 7);
2092  out4 = PCKEV_XORI128_UB(out0, out1);
2093  out5 = PCKEV_XORI128_UB(out2, out3);
2094  ST_W2(out4, 0, 2, dst, dst_stride);
2095  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2096  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2097  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2098  dst += (4 * dst_stride);
2099 
2100  LD_SB4(src, src_stride, src0, src1, src2, src3);
2101  src += (4 * src_stride);
2102 
2103  XORI_B4_128_SB(src0, src1, src2, src3);
2104  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2105  filt1, out0, out1, out2, out3);
2106  SRARI_H4_SH(out0, out1, out2, out3, 6);
2107  SAT_SH4_SH(out0, out1, out2, out3, 7);
2108  out4 = PCKEV_XORI128_UB(out0, out1);
2109  out5 = PCKEV_XORI128_UB(out2, out3);
2110  ST_W2(out4, 0, 2, dst, dst_stride);
2111  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2112  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2113  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2114 }
2115 
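/* 8-wide, 2 rows per iteration; used for the heights (2 and 6) that
 * are not multiples of 4. */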
2116 static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride,
2117  uint8_t *dst, int32_t dst_stride,
2118  const int8_t *filter, int32_t height)
2119 {
2120  uint32_t loop_cnt;
2121  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2122  v16u8 out;
2123  v8i16 filt, vec0, vec1, vec2, vec3;
2124 
2125  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2126  src -= 1;
2127 
2128  filt = LD_SH(filter);
2129  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2130 
2131  mask1 = mask0 + 2;
2132 
2133  for (loop_cnt = (height >> 1); loop_cnt--;) {
2134  LD_SB2(src, src_stride, src0, src1);
2135  src += (2 * src_stride);
2136  src += (2 * src_stride);
2137  XORI_B2_128_SB(src0, src1);
2138  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2139  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2140  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2141  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2142  SRARI_H2_SH(vec0, vec1, 6);
2143  SAT_SH2_SH(vec0, vec1, 7);
2144  out = PCKEV_XORI128_UB(vec0, vec1);
2145  ST_D2(out, 0, 1, dst, dst_stride);
2146  dst += (2 * dst_stride);
2147  }
2148 }
2149 
2150 static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2151  uint8_t *dst, int32_t dst_stride,
2152  const int8_t *filter, int32_t height)
2153 {
2154  uint32_t loop_cnt;
2155  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2156  v16u8 tmp0, tmp1;
2157  v8i16 filt, out0, out1, out2, out3;
2158 
2159  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2160  src -= 1;
2161 
2162  /* rearranging filter */
2163  filt = LD_SH(filter);
2164  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2165 
2166  mask1 = mask0 + 2;
2167 
2168  for (loop_cnt = (height >> 2); loop_cnt--;) {
2169  LD_SB4(src, src_stride, src0, src1, src2, src3);
2170  src += (4 * src_stride);
2171 
2172  XORI_B4_128_SB(src0, src1, src2, src3);
2173  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2174  filt1, out0, out1, out2, out3);
2175  SRARI_H4_SH(out0, out1, out2, out3, 6);
2176  SAT_SH4_SH(out0, out1, out2, out3, 7);
2177  tmp0 = PCKEV_XORI128_UB(out0, out1);
2178  tmp1 = PCKEV_XORI128_UB(out2, out3);
2179  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2180  dst += (4 * dst_stride);
2181  }
2182 }
2183 
2184 static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2185  uint8_t *dst, int32_t dst_stride,
2186  const int8_t *filter, int32_t height)
2187 {
2188  if ((2 == height) || (6 == height)) {
2189  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2190  height);
2191  } else {
2192  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2193  height);
2194  }
2195 }
2196 
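/* 12-wide horizontal: mask2/mask3 (ff_hevc_mask_arr + 32) gather
 * columns 8..11 of two rows into a single register (stored at dst + 8),
 * while mask0/mask1 handle the left 8 columns row by row. */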
2197 static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2198  uint8_t *dst, int32_t dst_stride,
2199  const int8_t *filter, int32_t height)
2200 {
2201  uint32_t loop_cnt;
2202  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2203  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2204  v16i8 vec10, vec11;
2205  v16u8 tmp0, tmp1;
2206  v8i16 filt, out0, out1, out2, out3, out4, out5;
2207 
2208  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2209  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2210 
2211  src -= 1;
2212 
2213  /* rearranging filter */
2214  filt = LD_SH(filter);
2215  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2216 
2217  mask1 = mask0 + 2;
2218  mask3 = mask2 + 2;
2219 
2220  for (loop_cnt = 4; loop_cnt--;) {
2221  LD_SB4(src, src_stride, src0, src1, src2, src3);
2222  src += (4 * src_stride);
2223 
2224  XORI_B4_128_SB(src0, src1, src2, src3);
2225  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2226  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2227  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2228  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2229  SRARI_H2_SH(out0, out1, 6);
2230  SAT_SH2_SH(out0, out1, 7);
2231  tmp0 = PCKEV_XORI128_UB(out0, out1);
2232  ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2233 
2234  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2235  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2236  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2237  out2, out3, out4, out5);
2238  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2239  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2240  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2241  out2, out3, out4, out5);
2242  SRARI_H4_SH(out2, out3, out4, out5, 6);
2243  SAT_SH4_SH(out2, out3, out4, out5, 7);
2244  tmp0 = PCKEV_XORI128_UB(out2, out3);
2245  tmp1 = PCKEV_XORI128_UB(out4, out5);
2246  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2247  dst += (4 * dst_stride);
2248  }
2249 }
2250 
2251 static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2252  uint8_t *dst, int32_t dst_stride,
2253  const int8_t *filter, int32_t height)
2254 {
2255  uint32_t loop_cnt;
2256  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2257  v16i8 filt0, filt1, mask0, mask1;
2258  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2259  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2260  v16u8 out;
2261 
2262  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2263  src -= 1;
2264 
2265  /* rearranging filter */
2266  filt = LD_SH(filter);
2267  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2268 
2269  mask1 = mask0 + 2;
2270 
2271  for (loop_cnt = (height >> 2); loop_cnt--;) {
2272  LD_SB4(src, src_stride, src0, src2, src4, src6);
2273  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2274  src += (4 * src_stride);
2275 
2276  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2277 
2278  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2279  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2280  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2281  out0, out1, out2, out3);
2282  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2283  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2284  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2285  out0, out1, out2, out3);
2286  SRARI_H4_SH(out0, out1, out2, out3, 6);
2287  SAT_SH4_SH(out0, out1, out2, out3, 7);
2288  out = PCKEV_XORI128_UB(out0, out1);
2289  ST_UB(out, dst);
2290  dst += dst_stride;
2291  out = PCKEV_XORI128_UB(out2, out3);
2292  ST_UB(out, dst);
2293  dst += dst_stride;
2294 
2295  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2296  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2297  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2298  out4, out5, out6, out7);
2299  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2300  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2301  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2302  out4, out5, out6, out7);
2303  SRARI_H4_SH(out4, out5, out6, out7, 6);
2304  SAT_SH4_SH(out4, out5, out6, out7, 7);
2305  out = PCKEV_XORI128_UB(out4, out5);
2306  ST_UB(out, dst);
2307  dst += dst_stride;
2308  out = PCKEV_XORI128_UB(out6, out7);
2309  ST_UB(out, dst);
2310  dst += dst_stride;
2311  }
2312 }
2313 
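/* 24-wide horizontal: output columns 8..15 need source bytes that
 * straddle the two 16-byte loads, so mask00/mask11 (mask0 + 8 / + 10)
 * shuffle across the register pair; the trailing 8 columns are
 * filtered from src1/src3/src5/src7 alone and stored via dst1. */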
2314 static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2315  uint8_t *dst, int32_t dst_stride,
2316  const int8_t *filter, int32_t height)
2317 {
2318  uint8_t *dst1 = dst + 16;
2319  uint32_t loop_cnt;
2320  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2321  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2322  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2323  v8i16 filt, out0, out1, out2, out3;
2324  v16u8 tmp0, tmp1;
2325 
2326  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2327  src -= 1;
2328 
2329  /* rearranging filter */
2330  filt = LD_SH(filter);
2331  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2332 
2333  mask1 = mask0 + 2;
2334  mask00 = mask0 + 8;
2335  mask11 = mask0 + 10;
2336 
2337  for (loop_cnt = 8; loop_cnt--;) {
2338  LD_SB4(src, src_stride, src0, src2, src4, src6);
2339  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2340  src += (4 * src_stride);
2341 
2342  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2343  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2344  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2345  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2346  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2347  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2348  out0, out1, out2, out3);
2349  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2350  out0, out1, out2, out3);
2351  SRARI_H4_SH(out0, out1, out2, out3, 6);
2352  SAT_SH4_SH(out0, out1, out2, out3, 7);
2353  tmp0 = PCKEV_XORI128_UB(out0, out1);
2354  ST_UB(tmp0, dst);
2355  dst += dst_stride;
2356  tmp0 = PCKEV_XORI128_UB(out2, out3);
2357  ST_UB(tmp0, dst);
2358  dst += dst_stride;
2359 
2360  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2361  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2362  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2363  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2364  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2365  out0, out1, out2, out3);
2366  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2367  out0, out1, out2, out3);
2368  SRARI_H4_SH(out0, out1, out2, out3, 6);
2369  SAT_SH4_SH(out0, out1, out2, out3, 7);
2370  tmp0 = PCKEV_XORI128_UB(out0, out1);
2371  ST_UB(tmp0, dst);
2372  dst += dst_stride;
2373  tmp0 = PCKEV_XORI128_UB(out2, out3);
2374  ST_UB(tmp0, dst);
2375  dst += dst_stride;
2376 
2377  /* 8 width */
2378  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2379  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2380  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2381  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2382 
2383  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2384  out0, out1, out2, out3);
2385  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2386  out0, out1, out2, out3);
2387 
2388  SRARI_H4_SH(out0, out1, out2, out3, 6);
2389  SAT_SH4_SH(out0, out1, out2, out3, 7);
2390  tmp0 = PCKEV_XORI128_UB(out0, out1);
2391  tmp1 = PCKEV_XORI128_UB(out2, out3);
2392  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2393  dst1 += (4 * dst_stride);
2394  }
2395 }
2396 
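/* 32-wide horizontal: two rows per iteration, four 16-byte loads per
 * row; each load already holds the 11 source bytes (8 columns + 3
 * extra taps) that its 8 output columns require. */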
2397 static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride,
2398  uint8_t *dst, int32_t dst_stride,
2399  const int8_t *filter, int32_t height)
2400 {
2401  uint32_t loop_cnt;
2402  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2403  v16i8 filt0, filt1, mask0, mask1;
2404  v16u8 out;
2405  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2406  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2407 
2408  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2409  src -= 1;
2410 
2411  /* rearranging filter */
2412  filt = LD_SH(filter);
2413  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2414 
2415  mask1 = mask0 + 2;
2416 
2417  for (loop_cnt = (height >> 1); loop_cnt--;) {
2418  src0 = LD_SB(src);
2419  src1 = LD_SB(src + 8);
2420  src2 = LD_SB(src + 16);
2421  src3 = LD_SB(src + 24);
2422  src += src_stride;
2423  src4 = LD_SB(src);
2424  src5 = LD_SB(src + 8);
2425  src6 = LD_SB(src + 16);
2426  src7 = LD_SB(src + 24);
2427  src += src_stride;
2428 
2429  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2430 
2431  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2432  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2433  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2434  out0, out1, out2, out3);
2435  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2436  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2437  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2438  out0, out1, out2, out3);
2439 
2440  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2441  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2442  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2443  out4, out5, out6, out7);
2444  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2445  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2446  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2447  out4, out5, out6, out7);
2448  SRARI_H4_SH(out0, out1, out2, out3, 6);
2449  SRARI_H4_SH(out4, out5, out6, out7, 6);
2450  SAT_SH4_SH(out0, out1, out2, out3, 7);
2451  SAT_SH4_SH(out4, out5, out6, out7, 7);
2452  out = PCKEV_XORI128_UB(out0, out1);
2453  ST_UB(out, dst);
2454  out = PCKEV_XORI128_UB(out2, out3);
2455  ST_UB(out, dst + 16);
2456  dst += dst_stride;
2457  out = PCKEV_XORI128_UB(out4, out5);
2458  ST_UB(out, dst);
2459  out = PCKEV_XORI128_UB(out6, out7);
2460  ST_UB(out, dst + 16);
2461  dst += dst_stride;
2462  }
2463 }
2464 
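/* common_vt_4t_*: 4-tap vertical filters.  Consecutive rows are
 * byte-interleaved (ILVR/ILVL) so one dot-product-accumulate pair
 * applies two taps of a column at once; for 4-wide blocks two
 * interleaved row pairs share a register via __msa_ilvr_d. */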
2465 static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
2466  uint8_t *dst, int32_t dst_stride,
2467  const int8_t *filter)
2468 {
2469  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2470  v16i8 src2110, src4332, filt0, filt1;
2471  v16u8 out;
2472  v8i16 filt, out10;
2473 
2474  src -= src_stride;
2475 
2476  filt = LD_SH(filter);
2477  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2478 
2479  LD_SB3(src, src_stride, src0, src1, src2);
2480  src += (3 * src_stride);
2481 
2482  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2483  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2484  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2485  LD_SB2(src, src_stride, src3, src4);
2486  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2487  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2488  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2489  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2490  out10 = __msa_srari_h(out10, 6);
2491  out10 = __msa_sat_s_h(out10, 7);
2492  out = PCKEV_XORI128_UB(out10, out10);
2493  ST_W2(out, 0, 1, dst, dst_stride);
2494 }
2495 
2496 static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride,
2497  uint8_t *dst, int32_t dst_stride,
2498  const int8_t *filter, int32_t height)
2499 {
2500  uint32_t loop_cnt;
2501  v16i8 src0, src1, src2, src3, src4, src5;
2502  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2503  v16i8 src2110, src4332, filt0, filt1;
2504  v8i16 filt, out10, out32;
2505  v16u8 out;
2506 
2507  src -= src_stride;
2508 
2509  filt = LD_SH(filter);
2510  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2511 
2512  LD_SB3(src, src_stride, src0, src1, src2);
2513  src += (3 * src_stride);
2514 
2515  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2516 
2517  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2518  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2519 
2520  for (loop_cnt = (height >> 2); loop_cnt--;) {
2521  LD_SB3(src, src_stride, src3, src4, src5);
2522  src += (3 * src_stride);
2523  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2524  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2525  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2526  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2527 
2528  src2 = LD_SB(src);
2529  src += (src_stride);
2530  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2531  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2532  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2533  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2534  SRARI_H2_SH(out10, out32, 6);
2535  SAT_SH2_SH(out10, out32, 7);
2536  out = PCKEV_XORI128_UB(out10, out32);
2537  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2538  dst += (4 * dst_stride);
2539  }
2540 }
2541 
2542 static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2543  uint8_t *dst, int32_t dst_stride,
2544  const int8_t *filter, int32_t height)
2545 {
2546  if (2 == height) {
2547  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2548  } else {
2549  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2550  height);
2551  }
2552 }
2553 
2554 static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2555  uint8_t *dst, int32_t dst_stride,
2556  const int8_t *filter, int32_t height)
2557 {
2558  v16u8 out0, out1;
2559  v16i8 src0, src1, src2, src3, src4, src5, src6;
2560  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2561  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2562 
2563  src -= src_stride;
2564 
2565  filter_vec = LD_SH(filter);
2566  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2567 
2568  LD_SB3(src, src_stride, src0, src1, src2);
2569  src += (3 * src_stride);
2570  XORI_B3_128_SB(src0, src1, src2);
2571  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2572 
2573  LD_SB2(src, src_stride, src3, src4);
2574  src += (2 * src_stride);
2575  XORI_B2_128_SB(src3, src4);
2576  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2577 
2578  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2579  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2580 
2581  LD_SB2(src, src_stride, src5, src6);
2582  src += (2 * src_stride);
2583  XORI_B2_128_SB(src5, src6);
2584  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2585 
2586  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2587  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2588 
2589  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2590  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2591  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2592  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2593  ST_W2(out0, 0, 2, dst, dst_stride);
2594  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2595  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2596  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2597  dst += (4 * dst_stride);
2598 
2599  LD_SB2(src, src_stride, src3, src4);
2600  src += (2 * src_stride);
2601  XORI_B2_128_SB(src3, src4);
2602  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2603 
2604  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2605  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2606 
2607  LD_SB2(src, src_stride, src5, src6);
2608  src += (2 * src_stride);
2609  XORI_B2_128_SB(src5, src6);
2610  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2611 
2612  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2613  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2614 
2615  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2616  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2617  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2618  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2619  ST_W2(out0, 0, 2, dst, dst_stride);
2620  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2621  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2622  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2623 }
2624 
2625 static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride,
2626  uint8_t *dst, int32_t dst_stride,
2627  const int8_t *filter)
2628 {
2629  v16i8 src0, src1, src2, src3, src4;
2630  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2631  v16u8 out;
2632 
2633  src -= src_stride;
2634 
2635  /* rearranging filter_y */
2636  filt = LD_SH(filter);
2637  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2638 
2639  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2640  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2641  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2642  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2643  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2644  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2645  SRARI_H2_SH(tmp0, tmp1, 6);
2646  SAT_SH2_SH(tmp0, tmp1, 7);
2647  out = PCKEV_XORI128_UB(tmp0, tmp1);
2648  ST_D2(out, 0, 1, dst, dst_stride);
2649 }
2650 
2651 static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride,
2652  uint8_t *dst, int32_t dst_stride,
2653  const int8_t *filter)
2654 {
2655  uint32_t loop_cnt;
2656  uint64_t out0, out1, out2;
2657  v16i8 src0, src1, src2, src3, src4, src5;
2658  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2659  v8i16 filt, filt0, filt1;
2660 
2661  src -= src_stride;
2662 
2663  /* rearranging filter_y */
2664  filt = LD_SH(filter);
2665  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2666 
2667  LD_SB3(src, src_stride, src0, src1, src2);
2668  src += (3 * src_stride);
2669  src += (3 * src_stride);
2670  XORI_B3_128_SB(src0, src1, src2);
2671  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2672 
2673  for (loop_cnt = 2; loop_cnt--;) {
2674  LD_SB3(src, src_stride, src3, src4, src5);
2675  src += (3 * src_stride);
2676 
2677  XORI_B3_128_SB(src3, src4, src5);
2678  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2679  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2680  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2681  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2682  SRARI_H2_SH(tmp0, tmp1, 6);
2683  tmp2 = __msa_srari_h(tmp2, 6);
2684  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2685  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2686  XORI_B2_128_SH(tmp0, tmp2);
2687 
2688  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2689  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2690  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2691  SD(out0, dst);
2692  dst += dst_stride;
2693  SD(out1, dst);
2694  dst += dst_stride;
2695  SD(out2, dst);
2696  dst += dst_stride;
2697 
2698  src2 = src5;
2699  vec0 = vec3;
2700  vec2 = vec4;
2701  }
2702 }
2703 
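/* 8-wide, 4 rows per iteration: the two newest interleaves (src98_r,
 * src109_r) and the newest row are carried into the next iteration so
 * every source row is loaded exactly once. */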
2704 static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2705  uint8_t *dst, int32_t dst_stride,
2706  const int8_t *filter, int32_t height)
2707 {
2708  uint32_t loop_cnt;
2709  v16i8 src0, src1, src2, src7, src8, src9, src10;
2710  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2711  v16u8 tmp0, tmp1;
2712  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2713 
2714  src -= src_stride;
2715 
2716  filt = LD_SH(filter);
2717  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2718 
2719  LD_SB3(src, src_stride, src0, src1, src2);
2720  src += (3 * src_stride);
2721  src += (3 * src_stride);
2722  XORI_B3_128_SB(src0, src1, src2);
2723  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2724 
2725  for (loop_cnt = (height >> 2); loop_cnt--;) {
2726  LD_SB4(src, src_stride, src7, src8, src9, src10);
2727  src += (4 * src_stride);
2728 
2729  XORI_B4_128_SB(src7, src8, src9, src10);
2730  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2731  src72_r, src87_r, src98_r, src109_r);
2732  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2733  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2734  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2735  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2736  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2737  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2738  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2739  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2740  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2741  dst += (4 * dst_stride);
2742 
2743  src10_r = src98_r;
2744  src21_r = src109_r;
2745  src2 = src10;
2746  }
2747 }
2748 
2749 static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2750  uint8_t *dst, int32_t dst_stride,
2751  const int8_t *filter, int32_t height)
2752 {
2753  if (2 == height) {
2754  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2755  } else if (6 == height) {
2756  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2757  } else {
2758  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2759  filter, height);
2760  }
2761 }
2762 
2763 static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2764  uint8_t *dst, int32_t dst_stride,
2765  const int8_t *filter, int32_t height)
2766 {
2767  uint32_t loop_cnt;
2768  v16i8 src0, src1, src2, src3, src4, src5, src6;
2769  v16u8 out0, out1;
2770  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2771  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2772  v16i8 src2110, src4332, src6554;
2773  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2774  v8i16 filter_vec;
2775 
2776  src -= (1 * src_stride);
2777 
2778  filter_vec = LD_SH(filter);
2779  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2780 
2781  LD_SB3(src, src_stride, src0, src1, src2);
2782  src += (3 * src_stride);
2783  src += (3 * src_stride);
2784  XORI_B3_128_SB(src0, src1, src2);
2785  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2786  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2787  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2788 
2789  for (loop_cnt = 4; loop_cnt--;) {
2790  LD_SB4(src, src_stride, src3, src4, src5, src6);
2791  src += (4 * src_stride);
2792 
2793  XORI_B4_128_SB(src3, src4, src5, src6);
2794  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2795  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2796  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2797  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2798  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2799  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2800 
2801  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2802  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2803  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2804  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2805  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2806  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2807 
2808  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2809  SRARI_H2_SH(dst0_l, dst1_l, 6);
2810  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2811  SAT_SH2_SH(dst0_l, dst1_l, 7);
2812  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2813  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2814  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2815  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2816  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2817  dst += (4 * dst_stride);
2818 
2819  src2 = src6;
2820  src10_r = src54_r;
2821  src21_r = src65_r;
2822  src2110 = src6554;
2823  }
2824 }
2825 
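/* 16-wide vertical: right and left halves of each row pair are
 * filtered from their ILVR/ILVL interleaves and re-packed per row. */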
2826 static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2827  uint8_t *dst, int32_t dst_stride,
2828  const int8_t *filter, int32_t height)
2829 {
2830  uint32_t loop_cnt;
2831  v16i8 src0, src1, src2, src3, src4, src5, src6;
2832  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2833  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2834  v16u8 tmp0, tmp1, tmp2, tmp3;
2835  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2836 
2837  src -= src_stride;
2838 
2839  filt = LD_SH(filter);
2840  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2841 
2842  LD_SB3(src, src_stride, src0, src1, src2);
2843  src += (3 * src_stride);
2844  src += (3 * src_stride);
2845  XORI_B3_128_SB(src0, src1, src2);
2846  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2847  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2848 
2849  for (loop_cnt = (height >> 2); loop_cnt--;) {
2850  LD_SB4(src, src_stride, src3, src4, src5, src6);
2851  src += (4 * src_stride);
2852 
2853  XORI_B4_128_SB(src3, src4, src5, src6);
2854  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2855  src32_r, src43_r, src54_r, src65_r);
2856  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2857  src32_l, src43_l, src54_l, src65_l);
2858  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2859  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2860  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2861  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2862  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2863  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2864  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2865  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2866  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2867  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2868  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2869  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2870  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2871  out3_r, tmp0, tmp1, tmp2, tmp3);
2872  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2873  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2874  dst += (4 * dst_stride);
2875 
2876  src10_r = src54_r;
2877  src21_r = src65_r;
2878  src10_l = src54_l;
2879  src21_l = src65_l;
2880  src2 = src6;
2881  }
2882 }
2883 
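/* 24-wide vertical: the first 16 columns use right + left interleaves;
 * the extra 8 columns (loaded from src + 16) need only the right
 * interleaves. */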
2884 static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2885  uint8_t *dst, int32_t dst_stride,
2886  const int8_t *filter, int32_t height)
2887 {
2888  uint32_t loop_cnt;
2889  uint64_t out0, out1;
2890  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2891  v16i8 src11, filt0, filt1;
2892  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2893  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2894  v16u8 out;
2895  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2896 
2897  src -= src_stride;
2898 
2899  filt = LD_SH(filter);
2900  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2901 
2902  /* 16 width */
2903  LD_SB3(src, src_stride, src0, src1, src2);
2904  XORI_B3_128_SB(src0, src1, src2);
2905  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2906  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2907 
2908  /* 8 width */
2909  LD_SB3(src + 16, src_stride, src6, src7, src8);
2910  src += (3 * src_stride);
2911  XORI_B3_128_SB(src6, src7, src8);
2912  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2913 
2914  for (loop_cnt = 8; loop_cnt--;) {
2915  /* 16 width */
2916  LD_SB2(src, src_stride, src3, src4);
2917  XORI_B2_128_SB(src3, src4);
2918  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2919  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2920 
2921  /* 8 width */
2922  LD_SB2(src + 16, src_stride, src9, src10);
2923  src += (2 * src_stride);
2924  XORI_B2_128_SB(src9, src10);
2925  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2926 
2927  /* 16 width */
2928  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2929  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2930  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2931  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2932 
2933  /* 8 width */
2934  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2935  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2936 
2937  /* 16 + 8 width */
2938  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2939  SRARI_H2_SH(out0_l, out1_l, 6);
2940  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2941  SAT_SH2_SH(out0_l, out1_l, 7);
2942  out = PCKEV_XORI128_UB(out0_r, out0_l);
2943  ST_UB(out, dst);
2944  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2945  XORI_B2_128_SH(out2_r, out3_r);
2946  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2947  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2948  SD(out0, dst + 16);
2949  dst += dst_stride;
2950  out = PCKEV_XORI128_UB(out1_r, out1_l);
2951  ST_UB(out, dst);
2952  SD(out1, dst + 16);
2953  dst += dst_stride;
2954 
2955  /* 16 width */
2956  LD_SB2(src, src_stride, src5, src2);
2957  XORI_B2_128_SB(src5, src2);
2958  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2959  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2960 
2961  /* 8 width */
2962  LD_SB2(src + 16, src_stride, src11, src8);
2963  src += (2 * src_stride);
2964  XORI_B2_128_SB(src11, src8);
2965  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2966 
2967  /* 16 width */
2968  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2969  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2970  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2971  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2972 
2973  /* 8 width */
2974  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2975  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2976 
2977  /* 16 + 8 width */
2978  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2979  SRARI_H2_SH(out0_l, out1_l, 6);
2980  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2981  SAT_SH2_SH(out0_l, out1_l, 7);
2982  out = PCKEV_XORI128_UB(out0_r, out0_l);
2983  ST_UB(out, dst);
2984  out = PCKEV_XORI128_UB(out2_r, out2_r);
2985  ST_D1(out, 0, dst + 16);
2986  dst += dst_stride;
2987  out = PCKEV_XORI128_UB(out1_r, out1_l);
2988  ST_UB(out, dst);
2989  out = PCKEV_XORI128_UB(out3_r, out3_r);
2990  ST_D1(out, 0, dst + 16);
2991  dst += dst_stride;
2992  }
2993 }
2994 
2995 static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride,
2996  uint8_t *dst, int32_t dst_stride,
2997  const int8_t *filter, int32_t height)
2998 {
2999  uint32_t loop_cnt;
3000  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3001  v16i8 src10_r, src32_r, src76_r, src98_r;
3002  v16i8 src21_r, src43_r, src87_r, src109_r;
3003  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3004  v16i8 src10_l, src32_l, src76_l, src98_l;
3005  v16i8 src21_l, src43_l, src87_l, src109_l;
3006  v8i16 filt;
3007  v16i8 filt0, filt1;
3008  v16u8 out;
3009 
3010  src -= src_stride;
3011 
3012  filt = LD_SH(filter);
3013  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3014 
3015  /* 16 width */
3016  LD_SB3(src, src_stride, src0, src1, src2);
3017  XORI_B3_128_SB(src0, src1, src2);
3018 
3019  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3020  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3021 
3022  /* next 16 width */
3023  LD_SB3(src + 16, src_stride, src6, src7, src8);
3024  src += (3 * src_stride);
3025 
3026  XORI_B3_128_SB(src6, src7, src8);
3027  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3028  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3029 
3030  for (loop_cnt = (height >> 1); loop_cnt--;) {
3031  /* 16 width */
3032  LD_SB2(src, src_stride, src3, src4);
3033  XORI_B2_128_SB(src3, src4);
3034  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3035  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3036 
3037  /* 16 width */
3038  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3039  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3040  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3041  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3042 
3043  /* 16 width */
3044  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3045  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3046  out = PCKEV_XORI128_UB(out0_r, out0_l);
3047  ST_UB(out, dst);
3048  out = PCKEV_XORI128_UB(out1_r, out1_l);
3049  ST_UB(out, dst + dst_stride);
3050 
3051  src10_r = src32_r;
3052  src21_r = src43_r;
3053  src10_l = src32_l;
3054  src21_l = src43_l;
3055  src2 = src4;
3056 
3057  /* next 16 width */
3058  LD_SB2(src + 16, src_stride, src9, src10);
3059  src += (2 * src_stride);
3060  XORI_B2_128_SB(src9, src10);
3061  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3062  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3063 
3064  /* next 16 width */
3065  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3066  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3067  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3068  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3069 
3070  /* next 16 width */
3071  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3072  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3073  out = PCKEV_XORI128_UB(out2_r, out2_l);
3074  ST_UB(out, dst + 16);
3075  out = PCKEV_XORI128_UB(out3_r, out3_l);
3076  ST_UB(out, dst + 16 + dst_stride);
3077 
3078  dst += 2 * dst_stride;
3079 
3080  src76_r = src98_r;
3081  src87_r = src109_r;
3082  src76_l = src98_l;
3083  src87_l = src109_l;
3084  src8 = src10;
3085  }
3086 }
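
/*
 * Note on the loop structure above: the 32-pixel rows are handled as two
 * independent 16-pixel columns (dst and dst + 16). Each iteration loads two
 * new rows per column, emits two output rows, and then slides the window by
 * renaming the interleaved pairs (src10_r = src32_r, ...), so every input
 * row is loaded exactly once.
 */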
3087 
3088 static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src,
3089  int32_t src_stride,
3090  uint8_t *dst,
3091  int32_t dst_stride,
3092  const int8_t *filter_x,
3093  const int8_t *filter_y)
3094 {
3095  v16u8 out;
3096  v16i8 src0, src1, src2, src3, src4;
3097  v8i16 filt0, filt1;
3098  v8i16 filt_h0, filt_h1;
3099  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3100  v16i8 mask1;
3101  v8i16 filter_vec, tmp;
3102  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3103  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3104  v4i32 dst0, dst1;
3105 
3106  src -= (src_stride + 1);
3107 
3108  filter_vec = LD_SH(filter_x);
3109  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3110 
3111  filter_vec = LD_SH(filter_y);
3112  UNPCK_R_SB_SH(filter_vec, filter_vec);
3113 
3114  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3115 
3116  mask1 = mask0 + 2;
3117 
3118  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3119  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120 
3121  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3122  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3123  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3124 
3125  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3126  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3127  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3128 
3129  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3130  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3131 
3132  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3133  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3134  dst0 >>= 6;
3135  dst1 >>= 6;
3136  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3137  tmp = __msa_srari_h(tmp, 6);
3138  tmp = __msa_sat_s_h(tmp, 7);
3139  out = PCKEV_XORI128_UB(tmp, tmp);
3140  ST_W2(out, 0, 1, dst, dst_stride);
3141 }
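
/*
 * Scalar sketch of the separable hv path used by the hevc_hv_uni_4t_*
 * routines, again fenced off with #if 0; the function name and the fixed
 * 32-column scratch layout are this sketch's own. A horizontal 4-tap pass
 * produces 16-bit intermediates for height + 3 rows; a vertical 4-tap pass
 * over those is shifted down by 6, then the uni rounding (+32 >> 6) and the
 * 8-bit clip are applied, matching the >>= 6, srari_h(., 6), sat_s_h(., 7)
 * sequence in the vector code.
 */
#if 0
static void hevc_hv_uni_4t_scalar_ref(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *fx, const int8_t *fy,
                                      int32_t width, int32_t height)
{
    int16_t mid[(32 + 3) * 32]; /* horizontal results; width, height <= 32 */
    const uint8_t *s = src - src_stride - 1; /* taps start one row/col early */
    int32_t x, y;

    for (y = 0; y < height + 3; y++) {
        for (x = 0; x < width; x++)
            mid[y * 32 + x] = fx[0] * s[x + 0] + fx[1] * s[x + 1] +
                              fx[2] * s[x + 2] + fx[3] * s[x + 3];
        s += src_stride;
    }
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int32_t v = (fy[0] * mid[(y + 0) * 32 + x] +
                         fy[1] * mid[(y + 1) * 32 + x] +
                         fy[2] * mid[(y + 2) * 32 + x] +
                         fy[3] * mid[(y + 3) * 32 + x]) >> 6;
            dst[y * dst_stride + x] = av_clip_uint8((v + 32) >> 6);
        }
    }
}
#endif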
3142 
3143 static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src,
3144  int32_t src_stride,
3145  uint8_t *dst,
3146  int32_t dst_stride,
3147  const int8_t *filter_x,
3148  const int8_t *filter_y)
3149 {
3150  v16u8 out;
3151  v16i8 src0, src1, src2, src3, src4, src5, src6;
3152  v8i16 filt0, filt1;
3153  v8i16 filt_h0, filt_h1;
3154  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3155  v16i8 mask1;
3156  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3157  v8i16 filter_vec, tmp0, tmp1;
3158  v8i16 dst30, dst41, dst52, dst63;
3159  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3160  v4i32 dst0, dst1, dst2, dst3;
3161 
3162  src -= (src_stride + 1);
3163 
3164  filter_vec = LD_SH(filter_x);
3165  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3166 
3167  filter_vec = LD_SH(filter_y);
3168  UNPCK_R_SB_SH(filter_vec, filter_vec);
3169 
3170  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3171 
3172  mask1 = mask0 + 2;
3173 
3174  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3175  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3176 
3177  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3178  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3179  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3180  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3181 
3182  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3184  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3185  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3186 
3187  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3188  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3189  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3190  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3191  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3192  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3193  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3194  SRA_4V(dst0, dst1, dst2, dst3, 6);
3195  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3196  SRARI_H2_SH(tmp0, tmp1, 6);
3197  SAT_SH2_SH(tmp0, tmp1, 7);
3198  out = PCKEV_XORI128_UB(tmp0, tmp1);
3199  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3200 }
3201 
3202 static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src,
3203  int32_t src_stride,
3204  uint8_t *dst,
3205  int32_t dst_stride,
3206  const int8_t *filter_x,
3207  const int8_t *filter_y,
3208  int32_t height)
3209 {
3210  uint32_t loop_cnt;
3211  v16u8 out0, out1;
3212  v16i8 src0, src1, src2, src3, src4, src5;
3213  v16i8 src6, src7, src8, src9, src10;
3214  v8i16 filt0, filt1;
3215  v8i16 filt_h0, filt_h1;
3216  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3217  v16i8 mask1;
3218  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3219  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3220  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3221  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3222  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3223  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3224  v8i16 dst98_r, dst109_r;
3225 
3226  src -= (src_stride + 1);
3227 
3228  filter_vec = LD_SH(filter_x);
3229  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3230 
3231  filter_vec = LD_SH(filter_y);
3232  UNPCK_R_SB_SH(filter_vec, filter_vec);
3233 
3234  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3235 
3236  mask1 = mask0 + 2;
3237 
3238  LD_SB3(src, src_stride, src0, src1, src2);
3239  src += (3 * src_stride);
3240 
3241  XORI_B3_128_SB(src0, src1, src2);
3242 
3243  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3244  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3245  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3246  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3247  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3248  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3249 
3250  for (loop_cnt = height >> 3; loop_cnt--;) {
3251  LD_SB8(src, src_stride,
3252  src3, src4, src5, src6, src7, src8, src9, src10);
3253  src += (8 * src_stride);
3254 
3255  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3256 
3257  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3258  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3259  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3260  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3261 
3262  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3263  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3264  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3265  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3266 
3267  dst32_r = __msa_ilvr_h(dst73, dst22);
3268  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3269  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3270  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3271  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3272  dst76_r = __msa_ilvr_h(dst22, dst106);
3273 
3274  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3275  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3276  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3277  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3278  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3279  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3280  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3281  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3282  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3283  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3284  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3285  dst5_r, dst4_r, dst7_r, dst6_r,
3286  tmp0, tmp1, tmp2, tmp3);
3287  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3288  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3289  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3290  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3291  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3292  dst += (8 * dst_stride);
3293 
3294  dst10_r = dst98_r;
3295  dst21_r = dst109_r;
3296  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3297  }
3298 }
3299 
3300 static void hevc_hv_uni_4t_4w_msa(const uint8_t *src,
3301  int32_t src_stride,
3302  uint8_t *dst,
3303  int32_t dst_stride,
3304  const int8_t *filter_x,
3305  const int8_t *filter_y,
3306  int32_t height)
3307 {
3308  if (2 == height) {
3309  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3310  filter_x, filter_y);
3311  } else if (4 == height) {
3312  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3313  filter_x, filter_y);
3314  } else if (0 == (height % 8)) {
3315  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3316  filter_x, filter_y, height);
3317  }
3318 }
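
/*
 * Heights 2 and 4 get straight-line versions above; every other height is
 * expected to be a multiple of 8 and falls through to the x8 loop. Any
 * other value writes nothing, so callers presumably only request block
 * sizes the HEVC spec can produce for 4-pixel-wide chroma.
 */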
3319 
3320 static void hevc_hv_uni_4t_6w_msa(const uint8_t *src,
3321  int32_t src_stride,
3322  uint8_t *dst,
3323  int32_t dst_stride,
3324  const int8_t *filter_x,
3325  const int8_t *filter_y,
3326  int32_t height)
3327 {
3328  v16u8 out0, out1, out2;
3329  v16i8 src0, src1, src2, src3, src4, src5, src6;
3330  v16i8 src7, src8, src9, src10;
3331  v8i16 filt0, filt1;
3332  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3333  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3334  v16i8 mask1;
3335  v8i16 filt_h0, filt_h1, filter_vec;
3336  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3337  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3338  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3339  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3340  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3341  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3342  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3343  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3344  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3345 
3346  src -= (src_stride + 1);
3347 
3348  filter_vec = LD_SH(filter_x);
3349  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3350 
3351  filter_vec = LD_SH(filter_y);
3352  UNPCK_R_SB_SH(filter_vec, filter_vec);
3353 
3354  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3355 
3356  mask1 = mask0 + 2;
3357 
3358  LD_SB3(src, src_stride, src0, src1, src2);
3359  src += (3 * src_stride);
3360 
3361  XORI_B3_128_SB(src0, src1, src2);
3362 
3363  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3364  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3365  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3366 
3367  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3369  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3370 
3371  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3372  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3373 
3374  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3375  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3376 
3377  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3378  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3379  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3380  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3381 
3382  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3383  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3384  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3385  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3386 
3387  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3388  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3389  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3390  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3391 
3392  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3393  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3394  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3395  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3396 
3397  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3398  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3399  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3400  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3401  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3402  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3403  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3404  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3405 
3406  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3407  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3408  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3409 
3410  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3411  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3412  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3413  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3414  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3415  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3416  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3417  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3418  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3419  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3420  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3421  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3422  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3423  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3424  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3425  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3426  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3427  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3428  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3429  SRARI_H2_SH(tmp4, tmp5, 6);
3430  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3431  SAT_SH2_SH(tmp4, tmp5, 7);
3432  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3433  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3434  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3435  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3436  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3437 }
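
/*
 * The 6-wide block above is produced in one shot for a fixed 8 rows:
 * out0/out1 carry the left 4 columns (stored as words via ST_W8) and out2
 * carries columns 4..5 (stored as halfwords via ST_H8), so each row is
 * written as a 4-byte plus a 2-byte store.
 */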
3438 
3439 static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src,
3440  int32_t src_stride,
3441  uint8_t *dst,
3442  int32_t dst_stride,
3443  const int8_t *filter_x,
3444  const int8_t *filter_y)
3445 {
3446  v16u8 out;
3447  v16i8 src0, src1, src2, src3, src4;
3448  v8i16 filt0, filt1;
3449  v8i16 filt_h0, filt_h1, filter_vec;
3450  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3451  v16i8 mask1;
3452  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3453  v8i16 dst0, dst1, dst2, dst3, dst4;
3454  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3455  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3456  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3457  v8i16 out0_r, out1_r;
3458 
3459  src -= (src_stride + 1);
3460 
3461  filter_vec = LD_SH(filter_x);
3462  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3463 
3464  filter_vec = LD_SH(filter_y);
3465  UNPCK_R_SB_SH(filter_vec, filter_vec);
3466 
3467  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3468 
3469  mask1 = mask0 + 2;
3470 
3471  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3472  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3473 
3474  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3475  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3476  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3477  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3478  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3479 
3480  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3482  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3483  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3484  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3485  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3486  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3487  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3488  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3489  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3490  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3491  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3492  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3493  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3494  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3495  SRARI_H2_SH(out0_r, out1_r, 6);
3496  SAT_SH2_SH(out0_r, out1_r, 7);
3497  out = PCKEV_XORI128_UB(out0_r, out1_r);
3498  ST_D2(out, 0, 1, dst, dst_stride);
3499 }
3500 
3501 static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src,
3502  int32_t src_stride,
3503  uint8_t *dst,
3504  int32_t dst_stride,
3505  const int8_t *filter_x,
3506  const int8_t *filter_y,
3507  int32_t width8mult)
3508 {
3509  uint32_t cnt;
3510  v16u8 out0, out1;
3511  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3512  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3513  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3514  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3515  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3516  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3517  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3518 
3519  src -= (src_stride + 1);
3520 
3521  filter_vec = LD_SH(filter_x);
3522  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3523 
3524  filter_vec = LD_SH(filter_y);
3525  UNPCK_R_SB_SH(filter_vec, filter_vec);
3526 
3527  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3528 
3529  mask0 = LD_SB(ff_hevc_mask_arr);
3530  mask1 = mask0 + 2;
3531 
3532  for (cnt = width8mult; cnt--;) {
3533  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3534  src += 8;
3535  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3536 
3537  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3538  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3539  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3540 
3541  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3542  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3543  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3544 
3545  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3546  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3547 
3548  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3549  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3550  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3551  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3552 
3553  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3554  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3555  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3556  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3557 
3558  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3559  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3560  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3561  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3562 
3563  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3564  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3565  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3566  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3567  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3568  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3569  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3570  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3571 
3572  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3573  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3574 
3575  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3576  dst3_r, tmp0, tmp1, tmp2, tmp3);
3577  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3578  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3579  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3580  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3581  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3582  dst += 8;
3583  }
3584 }
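
/*
 * width8mult counts 8-pixel columns: within this file the height-4 callers
 * pass 1 (8-wide) and 2 (16-wide). Each column is fully independent, so the
 * loop simply re-runs the 8x4 kernel with src and dst advanced by 8 bytes.
 */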
3585 
3586 static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src,
3587  int32_t src_stride,
3588  uint8_t *dst,
3589  int32_t dst_stride,
3590  const int8_t *filter_x,
3591  const int8_t *filter_y)
3592 {
3593  v16u8 out0, out1, out2;
3594  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3595  v8i16 filt0, filt1;
3596  v8i16 filt_h0, filt_h1, filter_vec;
3597  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3598  v16i8 mask1;
3599  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3600  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3601  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3602  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3603  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3604  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3605  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3606  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3607  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3608  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3609 
3610  src -= (src_stride + 1);
3611 
3612  filter_vec = LD_SH(filter_x);
3613  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3614 
3615  filter_vec = LD_SH(filter_y);
3616  UNPCK_R_SB_SH(filter_vec, filter_vec);
3617 
3618  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3619 
3620  mask1 = mask0 + 2;
3621 
3622  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3623  src += (5 * src_stride);
3624  LD_SB4(src, src_stride, src5, src6, src7, src8);
3625 
3626  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3627  XORI_B4_128_SB(src5, src6, src7, src8);
3628 
3629  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3630  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3631  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3632  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3633  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3634  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3635  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3636  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3637  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3638 
3639  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3640  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3641  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3642  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3643  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3644  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3645  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3646  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3647  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3648 
3649  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3650  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3651  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3652  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3653  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3654  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3655  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3656  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3657 
3658  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3659  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3660  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3661  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3662  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3663  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3664  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3665  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3666  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3667  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3668  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3669  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3670 
3671  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3672  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3673  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3674  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3675  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3676  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3677  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3678  SRARI_H2_SH(out4_r, out5_r, 6);
3679  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3680  SAT_SH2_SH(out4_r, out5_r, 7);
3681  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3682  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3683  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3684 
3685  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3686  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3687 }
3688 
3689 static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src,
3690  int32_t src_stride,
3691  uint8_t *dst,
3692  int32_t dst_stride,
3693  const int8_t *filter_x,
3694  const int8_t *filter_y,
3695  int32_t height,
3696  int32_t width8mult)
3697 {
3698  uint32_t loop_cnt, cnt;
3699  const uint8_t *src_tmp;
3700  uint8_t *dst_tmp;
3701  v16u8 out0, out1;
3702  v16i8 src0, src1, src2, src3, src4, src5, src6;
3703  v8i16 filt0, filt1;
3704  v8i16 filt_h0, filt_h1, filter_vec;
3705  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3706  v16i8 mask1;
3707  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3708  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3709  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3710  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3711  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3712  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3713  v8i16 out0_r, out1_r, out2_r, out3_r;
3714 
3715  src -= (src_stride + 1);
3716 
3717  filter_vec = LD_SH(filter_x);
3718  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3719 
3720  filter_vec = LD_SH(filter_y);
3721  UNPCK_R_SB_SH(filter_vec, filter_vec);
3722 
3723  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3724 
3725  mask1 = mask0 + 2;
3726 
3727  for (cnt = width8mult; cnt--;) {
3728  src_tmp = src;
3729  dst_tmp = dst;
3730 
3731  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3732  src_tmp += (3 * src_stride);
3733 
3734  XORI_B3_128_SB(src0, src1, src2);
3735 
3736  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3737  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3738  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3739 
3740  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3741  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3742  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3743 
3744  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3745  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3746 
3747  for (loop_cnt = (height >> 2); loop_cnt--;) {
3748  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3749  src_tmp += (4 * src_stride);
3750 
3751  XORI_B4_128_SB(src3, src4, src5, src6);
3752 
3753  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3754  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3755  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3756  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3757 
3758  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3759  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3760  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3761  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3762 
3763  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3764  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3765  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3766  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3767 
3768  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3769  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3770  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3771  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3772  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3773  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3774  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3775  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3776 
3777  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3778  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3779 
3780  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3781  dst2_l, dst2_r, dst3_l, dst3_r,
3782  out0_r, out1_r, out2_r, out3_r);
3783 
3784  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3785  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3786  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3787  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3788  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3789  dst_tmp += (4 * dst_stride);
3790 
3791  dst10_r = dst54_r;
3792  dst10_l = dst54_l;
3793  dst21_r = dst65_r;
3794  dst21_l = dst65_l;
3795  dst2 = dst6;
3796  }
3797 
3798  src += 8;
3799  dst += 8;
3800  }
3801 }
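
/*
 * General 8-column workhorse: the outer loop walks width8mult independent
 * 8-pixel columns (the 8/16/24/32-wide callers pass 1/2/3/4), and the inner
 * loop emits 4 rows per iteration, keeping the last two interleaved row
 * pairs (dst10_*, dst21_*) and the last horizontal result (dst2) live
 * across iterations so only 4 new rows are filtered each pass.
 */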
3802 
3803 static void hevc_hv_uni_4t_8w_msa(const uint8_t *src,
3804  int32_t src_stride,
3805  uint8_t *dst,
3806  int32_t dst_stride,
3807  const int8_t *filter_x,
3808  const int8_t *filter_y,
3809  int32_t height)
3810 {
3811  if (2 == height) {
3812  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3813  filter_x, filter_y);
3814  } else if (4 == height) {
3815  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3816  filter_x, filter_y, 1);
3817  } else if (6 == height) {
3818  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3819  filter_x, filter_y);
3820  } else if (0 == (height % 4)) {
3821  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3822  filter_x, filter_y, height, 1);
3823  }
3824 }
3825 
3826 static void hevc_hv_uni_4t_12w_msa(const uint8_t *src,
3827  int32_t src_stride,
3828  uint8_t *dst,
3829  int32_t dst_stride,
3830  const int8_t *filter_x,
3831  const int8_t *filter_y,
3832  int32_t height)
3833 {
3834  uint32_t loop_cnt;
3835  const uint8_t *src_tmp;
3836  uint8_t *dst_tmp;
3837  v16u8 out0, out1;
3838  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3839  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3840  v16i8 mask0, mask1, mask2, mask3;
3841  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3842  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3843  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3844  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3845  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3846  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3847  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3848  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3849 
3850  src -= (src_stride + 1);
3851 
3852  filter_vec = LD_SH(filter_x);
3853  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3854 
3855  filter_vec = LD_SH(filter_y);
3856  UNPCK_R_SB_SH(filter_vec, filter_vec);
3857 
3858  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3859 
3860  mask0 = LD_SB(ff_hevc_mask_arr);
3861  mask1 = mask0 + 2;
3862 
3863  src_tmp = src;
3864  dst_tmp = dst;
3865 
3866  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3867  src_tmp += (3 * src_stride);
3868 
3869  XORI_B3_128_SB(src0, src1, src2);
3870 
3871  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3872  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3873  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3874 
3875  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3876  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3877  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3878 
3879  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3880  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3881 
3882  for (loop_cnt = 4; loop_cnt--;) {
3883  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3884  src_tmp += (4 * src_stride);
3885  XORI_B4_128_SB(src3, src4, src5, src6);
3886 
3887  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3888  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3889  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3890  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3891 
3892  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3893  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3894  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3895  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3896 
3897  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3898  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3899  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3900  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3901 
3902  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3903  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3904  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3905  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3906  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3907  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3908  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3909  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3910 
3911  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3912  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3913 
3914  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3915  dst3_r, tmp0, tmp1, tmp2, tmp3);
3916  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3917  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3918  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3919  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3920  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3921  dst_tmp += (4 * dst_stride);
3922 
3923  dst10_r = dst54_r;
3924  dst10_l = dst54_l;
3925  dst21_r = dst65_r;
3926  dst21_l = dst65_l;
3927  dsth2 = dsth6;
3928  }
3929 
3930  src += 8;
3931  dst += 8;
3932 
3933  mask2 = LD_SB(ff_hevc_mask_arr + 16);
3934  mask3 = mask2 + 2;
3935 
3936  LD_SB3(src, src_stride, src0, src1, src2);
3937  src += (3 * src_stride);
3938  XORI_B3_128_SB(src0, src1, src2);
3939  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3940  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3941 
3942  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3943  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3944 
3945  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3946  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3947 
3948  for (loop_cnt = 2; loop_cnt--;) {
3949  LD_SB8(src, src_stride,
3950  src3, src4, src5, src6, src7, src8, src9, src10);
3951  src += (8 * src_stride);
3952  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3953  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3954  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3955  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3956  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3957 
3958  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3959  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3960  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3961  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3962 
3963  dst32_r = __msa_ilvr_h(dst73, dst22);
3964  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3965  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3966  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3967  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3968  dst76_r = __msa_ilvr_h(dst22, dst106);
3969 
3970  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3971  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3972  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3973  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3974  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3975  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3976  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3977  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3978  SRA_4V(dst0, dst1, dst2, dst3, 6);
3979  SRA_4V(dst4, dst5, dst6, dst7, 6);
3980  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3981  tmp0, tmp1, tmp2, tmp3);
3982  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3983  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3984  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3985  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3986  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3987  dst += (8 * dst_stride);
3988 
3989  dst10_r = dst98_r;
3990  dst21_r = dst109_r;
3991  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3992  }
3993 }
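
/*
 * The 12-wide case above is split into an 8-wide column handled like
 * hevc_hv_uni_4t_8multx4mult_msa (fixed at 16 rows via loop_cnt = 4), plus
 * a 4-wide column that uses the second mask pair (mask2/mask3), so one
 * shuffle gathers the 4-pixel windows of two different rows at once.
 */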
3994 
3995 static void hevc_hv_uni_4t_16w_msa(const uint8_t *src,
3996  int32_t src_stride,
3997  uint8_t *dst,
3998  int32_t dst_stride,
3999  const int8_t *filter_x,
4000  const int8_t *filter_y,
4001  int32_t height)
4002 {
4003  if (4 == height) {
4004  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4005  filter_y, 2);
4006  } else {
4007  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4008  filter_x, filter_y, height, 2);
4009  }
4010 }
4011 
4012 static void hevc_hv_uni_4t_24w_msa(const uint8_t *src,
4013  int32_t src_stride,
4014  uint8_t *dst,
4015  int32_t dst_stride,
4016  const int8_t *filter_x,
4017  const int8_t *filter_y,
4018  int32_t height)
4019 {
4020  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4021  filter_x, filter_y, height, 3);
4022 }
4023 
4024 static void hevc_hv_uni_4t_32w_msa(const uint8_t *src,
4025  int32_t src_stride,
4026  uint8_t *dst,
4027  int32_t dst_stride,
4028  const int8_t *filter_x,
4029  const int8_t *filter_y,
4030  int32_t height)
4031 {
4032  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4033  filter_x, filter_y, height, 4);
4034 }
4035 
4036 #define UNI_MC_COPY(WIDTH) \
4037 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4038  ptrdiff_t dst_stride, \
4039  const uint8_t *src, \
4040  ptrdiff_t src_stride, \
4041  int height, \
4042  intptr_t mx, \
4043  intptr_t my, \
4044  int width) \
4045 { \
4046  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4047 }
4048 
4049 UNI_MC_COPY(8);
4050 UNI_MC_COPY(12);
4051 UNI_MC_COPY(16);
4052 UNI_MC_COPY(24);
4053 UNI_MC_COPY(32);
4054 UNI_MC_COPY(48);
4055 UNI_MC_COPY(64);
4056 
4057 #undef UNI_MC_COPY
4058 
4059 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4060 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4061  ptrdiff_t dst_stride, \
4062  const uint8_t *src, \
4063  ptrdiff_t src_stride, \
4064  int height, \
4065  intptr_t mx, \
4066  intptr_t my, \
4067  int width) \
4068 { \
4069  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4070  \
4071  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4072  filter, height); \
4073 }
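
/*
 * As a concrete example of the expansions below, UNI_MC(qpel, h, 4, 8, hz, mx)
 * defines ff_hevc_put_hevc_uni_qpel_h4_8_msa(), which picks the 8-tap filter
 * ff_hevc_qpel_filters[mx - 1] and forwards to common_hz_8t_4w_msa().
 */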
4074 
4075 UNI_MC(qpel, h, 4, 8, hz, mx);
4076 UNI_MC(qpel, h, 8, 8, hz, mx);
4077 UNI_MC(qpel, h, 12, 8, hz, mx);
4078 UNI_MC(qpel, h, 16, 8, hz, mx);
4079 UNI_MC(qpel, h, 24, 8, hz, mx);
4080 UNI_MC(qpel, h, 32, 8, hz, mx);
4081 UNI_MC(qpel, h, 48, 8, hz, mx);
4082 UNI_MC(qpel, h, 64, 8, hz, mx);
4083 
4084 UNI_MC(qpel, v, 4, 8, vt, my);
4085 UNI_MC(qpel, v, 8, 8, vt, my);
4086 UNI_MC(qpel, v, 12, 8, vt, my);
4087 UNI_MC(qpel, v, 16, 8, vt, my);
4088 UNI_MC(qpel, v, 24, 8, vt, my);
4089 UNI_MC(qpel, v, 32, 8, vt, my);
4090 UNI_MC(qpel, v, 48, 8, vt, my);
4091 UNI_MC(qpel, v, 64, 8, vt, my);
4092 
4093 UNI_MC(epel, h, 4, 4, hz, mx);
4094 UNI_MC(epel, h, 6, 4, hz, mx);
4095 UNI_MC(epel, h, 8, 4, hz, mx);
4096 UNI_MC(epel, h, 12, 4, hz, mx);
4097 UNI_MC(epel, h, 16, 4, hz, mx);
4098 UNI_MC(epel, h, 24, 4, hz, mx);
4099 UNI_MC(epel, h, 32, 4, hz, mx);
4100 
4101 UNI_MC(epel, v, 4, 4, vt, my);
4102 UNI_MC(epel, v, 6, 4, vt, my);
4103 UNI_MC(epel, v, 8, 4, vt, my);
4104 UNI_MC(epel, v, 12, 4, vt, my);
4105 UNI_MC(epel, v, 16, 4, vt, my);
4106 UNI_MC(epel, v, 24, 4, vt, my);
4107 UNI_MC(epel, v, 32, 4, vt, my);
4108 
4109 #undef UNI_MC
4110 
4111 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4112 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4113  ptrdiff_t dst_stride, \
4114  const uint8_t *src, \
4115  ptrdiff_t src_stride, \
4116  int height, \
4117  intptr_t mx, \
4118  intptr_t my, \
4119  int width) \
4120 { \
4121  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4122  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4123  \
4124  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4125  filter_x, filter_y, height); \
4126 }
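
/*
 * Likewise, UNI_MC_HV(epel, 4, 4) defines ff_hevc_put_hevc_uni_epel_hv4_8_msa(),
 * which selects both the horizontal (mx) and vertical (my) 4-tap filters from
 * ff_hevc_epel_filters[] and forwards to hevc_hv_uni_4t_4w_msa().
 */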
4127 
4128 UNI_MC_HV(qpel, 4, 8);
4129 UNI_MC_HV(qpel, 8, 8);
4130 UNI_MC_HV(qpel, 12, 8);
4131 UNI_MC_HV(qpel, 16, 8);
4132 UNI_MC_HV(qpel, 24, 8);
4133 UNI_MC_HV(qpel, 32, 8);
4134 UNI_MC_HV(qpel, 48, 8);
4135 UNI_MC_HV(qpel, 64, 8);
4136 
4137 UNI_MC_HV(epel, 4, 4);
4138 UNI_MC_HV(epel, 6, 4);
4139 UNI_MC_HV(epel, 8, 4);
4140 UNI_MC_HV(epel, 12, 4);
4141 UNI_MC_HV(epel, 16, 4);
4142 UNI_MC_HV(epel, 24, 4);
4143 UNI_MC_HV(epel, 32, 4);
4144 
4145 #undef UNI_MC_HV
common_vt_8t_16w_msa
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1132
XORI_B2_128_SH
#define XORI_B2_128_SH(...)
Definition: generic_macros_msa.h:1836
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
UNI_MC
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_uni_msa.c:4059
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
common_vt_8t_24w_msa
static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1300
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
hevc_hv_uni_4t_8x2_msa
static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3439
HORIZ_8TAP_4WID_4VECS_FILT
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1)
Definition: hevc_mc_uni_msa.c:34
common_hz_4t_8x4mult_msa
static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2150
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
hevc_hv_uni_4t_8multx4_msa
static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3501
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
out
FILE * out
Definition: movenc.c:54
common_hz_4t_4x4_msa
static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:1942
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
common_vt_4t_32w_msa
static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2995
src1
const pixel * src1
Definition: h264pred_template.c:421
SAT_SH4_SH
#define SAT_SH4_SH(...)
Definition: generic_macros_msa.h:1615
common_hz_8t_4x16_msa
static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:352
VSHF_B3_SB
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, out0, out1, out2)
Definition: vp8_mc_lsx.c:54
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
hevc_hv_uni_4t_4multx8mult_msa
static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3202
common_vt_4t_8x6_msa
static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2651
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
common_hz_4t_16w_msa
static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2251
SAT_SH2_SH
#define SAT_SH2_SH(...)
Definition: generic_macros_msa.h:1601
copy_width32_msa
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:219
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
hevc_hv_uni_8t_12w_msa
static void hevc_hv_uni_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1620
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
common_hz_8t_24w_msa
static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:610
hevc_hv_uni_8t_8multx2mult_msa
static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1469
common_vt_4t_4x2_msa
static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2465
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
PCKEV_B4_UB
#define PCKEV_B4_UB(...)
Definition: generic_macros_msa.h:1739
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
hevc_hv_uni_8t_64w_msa
static void hevc_hv_uni_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1903
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
SAT_SH3_SH
#define SAT_SH3_SH(...)
Definition: generic_macros_msa.h:1608
DOTP_SB2_SH
#define DOTP_SB2_SH(...)
Definition: generic_macros_msa.h:768
common_hz_8t_4w_msa
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:408
hevc_hv_uni_8t_4w_msa
static void hevc_hv_uni_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1335
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
copy_width16_msa
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:160
hevc_hv_uni_8t_48w_msa
static void hevc_hv_uni_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1891
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
hevc_hv_uni_8t_32w_msa
static void hevc_hv_uni_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1879
ST12x8_UB
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:527
common_hz_8t_4x8_msa
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:315
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
aligned
static int aligned(int val)
Definition: dashdec.c:168
copy_width24_msa
static void copy_width24_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:196
common_hz_4t_4w_msa
static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2052
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
copy_width64_msa
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:257
common_vt_4t_8x2_msa
static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2625
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
common_hz_8t_64w_msa
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:827
width
#define width
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_hv_uni_4t_24w_msa
static void hevc_hv_uni_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4012
common_hz_8t_8w_msa
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:421
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
SAT_SW4_SW
#define SAT_SW4_SW(...)
Definition: generic_macros_msa.h:1639
common_vt_4t_24w_msa
static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2884
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
common_hz_4t_6w_msa
static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2067
ILVR_B3_SH
#define ILVR_B3_SH(...)
Definition: generic_macros_msa.h:1351
common_vt_8t_8w_msa
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:975
common_hz_4t_4x16_msa
static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2004
hevc_macros_msa.h
common_hz_4t_8x2mult_msa
static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2116
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
HORIZ_4TAP_4WID_4VECS_FILT
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
Definition: hevc_mc_uni_msa.c:76
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
common_vt_8t_12w_msa
static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1030
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 *3]
Definition: hevc_mc_uni_msa.c:25
hevc_hv_uni_4t_12w_msa
static void hevc_hv_uni_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3826
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_hv_uni_8t_16w_msa
static void hevc_hv_uni_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1855
common_hz_8t_32w_msa
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:680
common_hz_4t_8w_msa
static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2184
common_hz_4t_4x2_msa
static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:1915
common_hz_8t_16w_msa
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:552
common_vt_8t_4w_msa
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:906
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
HORIZ_4TAP_8WID_4VECS_FILT
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:88
hevc_hv_uni_8t_24w_msa
static void hevc_hv_uni_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1867
common_vt_4t_4w_msa
static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2542
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
hevcdsp_mips.h
VSHF_B2_SH
#define VSHF_B2_SH(...)
Definition: generic_macros_msa.h:664
PCKEV_XORI128_UB
#define PCKEV_XORI128_UB(in0, in1)
Definition: generic_macros_msa.h:2751
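Note: PCKEV_XORI128_UB is one half of the xor-with-128 biasing idiom used throughout the file: pixels are XORed with 128 on load so the signed-byte dot-product instructions apply, and the final pack XORs the bias back out. In scalar terms (illustrative helper):

#include <stdint.h>

/* x ^ 0x80 maps unsigned [0,255] onto signed [-128,127] and back:
 * it equals x - 128 (mod 256), so applying it twice is the identity. */
static inline uint8_t unbias(int8_t biased)
{
    return (uint8_t)(biased ^ 0x80); /* undoes the earlier XORI_B*_128 */
}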
common_hz_4t_24w_msa
static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2314
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
hevc_hv_uni_8t_8w_msa
static void hevc_hv_uni_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1608
hevc_hv_uni_4t_16w_msa
static void hevc_hv_uni_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3995
common_vt_8t_16w_mult_msa
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1211
hevc_hv_uni_4t_8x6_msa
static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3586
common_vt_4t_8x4mult_msa
static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2704
hevc_hv_uni_4t_8w_msa
static void hevc_hv_uni_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3803
common_vt_4t_8w_msa
static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2749
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
HORIZ_8TAP_8WID_4VECS_FILT
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:51
copy_width48_msa
static void copy_width48_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:236
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
UNI_MC_COPY
#define UNI_MC_COPY(WIDTH)
Definition: hevc_mc_uni_msa.c:4036
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
hevc_hv_uni_4t_6w_msa
static void hevc_hv_uni_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3320
UNI_MC_HV
#define UNI_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_uni_msa.c:4111
common_vt_4t_6w_msa
static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2554
XORI_B4_128_UB
#define XORI_B4_128_UB(...)
Definition: generic_macros_msa.h:1850
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
SPLATI_H4_SB
#define SPLATI_H4_SB(...)
Definition: generic_macros_msa.h:1673
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
hevc_hv_uni_4t_4w_msa
static void hevc_hv_uni_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3300
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
common_vt_4t_4x4multiple_msa
static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2496
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
copy_width8_msa
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:104
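Note: the copy_widthN helpers are the unfiltered full-pel path; behaviourally they reduce to a strided row copy, which the MSA versions unroll with 16-byte vector loads and stores. A portable sketch, not the MSA implementation:

#include <stdint.h>
#include <string.h>

/* Behavioural model of the copy_widthN family: copy `height` rows of
 * `width` pixels between two strided planes. */
static void copy_width_scalar(const uint8_t *src, int32_t src_stride,
                              uint8_t *dst, int32_t dst_stride,
                              int32_t width, int32_t height)
{
    for (int32_t y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_stride;
        dst += dst_stride;
    }
}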
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_hv_uni_4t_4x4_msa
static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3143
common_hz_4t_12w_msa
static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2197
common_vt_8t_64w_msa
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1327
copy_width12_msa
static void copy_width12_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:146
common_hz_8t_4x4_msa
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:286
common_hz_4t_32w_msa
static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2397
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
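Note: DOTP_SB*_SH and the accumulating DPADD_SB*_SH variants wrap the MSA dotp_s.h / dpadd_s.h instructions: adjacent signed bytes are multiplied pairwise and summed into 16-bit lanes. A scalar model of one 16-byte vector (sketch only):

#include <stdint.h>

/* dotp_s.h per lane: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1];
 * dpadd_s.h is the same with out[i] += ... accumulation. */
static void dotp_sb_sh(const int8_t a[16], const int8_t b[16], int16_t out[8])
{
    for (int i = 0; i < 8; i++)
        out[i] = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
}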
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
SRARI_W4_SW
#define SRARI_W4_SW(...)
Definition: generic_macros_msa.h:2092
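Note: SRARI_W4_SW applies the MSA srari.w rounding shift to four vectors at once; per lane it is the round-to-nearest downscale used to bring 32-bit filter sums back toward pixel range:

#include <stdint.h>

/* Per-lane model of srari.w for shift > 0: arithmetic right shift with
 * rounding, i.e. add half an output LSB before shifting. */
static inline int32_t srari_w(int32_t x, int shift)
{
    return (x + (1 << (shift - 1))) >> shift;
}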
common_vt_8t_48w_msa
static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1319
common_hz_8t_12w_msa
static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:473
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:55
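Note: HEVC_FILT_4TAP and HEVC_FILT_4TAP_SH are the four-tap counterparts used by the epel (chroma) paths; the underlying arithmetic is the same dot product with four taps. Scalar sketch with an illustrative helper name:

#include <stdint.h>

/* Scalar model of the 4-tap filtering the HEVC_FILT_4TAP macros
 * vectorize across MSA lanes. */
static inline int32_t filt_4tap_scalar(const int16_t *s, const int8_t *taps)
{
    int32_t sum = 0;
    for (int k = 0; k < 4; k++)
        sum += s[k] * taps[k];
    return sum;
}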
common_vt_4t_12w_msa
static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2763
common_hz_8t_48w_msa
static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:741
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
common_hz_4t_4x8_msa
static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:1969
hevc_hv_uni_4t_32w_msa
static void hevc_hv_uni_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4024
common_vt_8t_32w_msa
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1311
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
hevc_hv_uni_4t_4x2_msa
static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3088
hevc_hv_uni_4t_8multx4mult_msa
static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3689
SPLATI_H2_SB
#define SPLATI_H2_SB(...)
Definition: generic_macros_msa.h:1655
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
common_vt_4t_16w_msa
static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2826
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278