hevc_mc_uni_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

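/* VSHF_B shuffle-control vectors.  The first 16 bytes gather the eight
 * overlapping byte pairs an 8-wide horizontal filter tap needs; in the
 * two "4 width" rows, indices of 16 and up select bytes from the second
 * source operand of the shuffle, so one shuffle can gather two 4-wide
 * rows from a pair of source registers. */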
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

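/* The HORIZ_*TAP_*WID_*VECS_FILT macros below implement the horizontal
 * FIR: each VSHF_B2_SB gathers the byte pairs for one tap pair,
 * DOTP_SB*_SH starts the signed byte dot product into halfword
 * accumulators, and the DPADD_SB*_SH steps accumulate the remaining
 * tap pairs into the out registers. */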
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,            \
                                   mask0, mask1, filt0, filt1,        \
                                   out0, out1)                        \
{                                                                     \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                             \
                                                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);           \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

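/* Plain copy of an 8-byte-wide block using 64-bit scalar loads/stores.
 * Heights of 2 and 6 are special-cased; other heights are handled
 * 8 or 4 rows per loop iteration. */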
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

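/* Copy a 12-byte-wide block: the two 8-row batches below move exactly
 * 16 rows with 16-byte loads and 12-byte stores; the height argument
 * is not consulted. */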
static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

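/* 8-tap horizontal luma filter for a 4x4 block.  src -= 3 centres the
 * 8-tap window on the current pixel.  Inputs are XORed with 128 so the
 * unsigned pixels land in signed-byte range for the signed dot products;
 * PCKEV_XORI128_UB removes the bias again when the rounded (SRARI >> 6),
 * saturated results are packed back to bytes. */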
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

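/* 8-tap horizontal filter, 8-wide, four rows per iteration.  The loop
 * body is HORIZ_8TAP_8WID_4VECS_FILT expanded inline. */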
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

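/* 12-wide: each group of four rows is handled as an 8-wide column
 * (mask00 set) plus a 4-wide column (mask0 set); the fixed loop count
 * of 4 implies a height of 16 rows. */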
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

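/* 24-wide: mask4..mask7 extend the shuffle window across the boundary
 * between the two 16-byte source vectors of each row; the fixed loop
 * count of 16, at two rows per iteration, implies a height of 32. */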
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

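/* 8-tap vertical filter, 4-wide.  Rows are interleaved pairwise
 * (ILVR_B*) and then two 4-wide row pairs are packed into one vector
 * (ILVR_D*), so each dot product covers two taps of two output rows.
 * The last seven rows of context are carried between iterations. */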
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

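/* 8-tap vertical filter, 8-wide: the same sliding-window scheme,
 * operating on right-half (ILVR) interleaves only. */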
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

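/* Vertical 8-tap for any multiple-of-16 width: the 16-wide kernel above
 * applied per 16-column strip (see the width-specific wrappers below). */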
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

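/* 2-D (horizontal + vertical) 8-tap filter, 4-wide.  The horizontal
 * pass produces 16-bit intermediates (HEVC_FILT_8TAP_SH); the vertical
 * pass filters those with 32-bit accumulation (HEVC_FILT_8TAP), shifts
 * the horizontal stage out (SRA >> 6), then applies the rounding shift
 * (SRARI >> 6), saturates and packs the result. */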
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

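/* 2-D 8-tap filter for widths that are a multiple of 8, processed as
 * 8-column strips, two rows per inner-loop iteration.  Only the first
 * seven horizontal results are computed up front; each loop step adds
 * the two newly loaded rows. */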
1477  int32_t src_stride,
1478  uint8_t *dst,
1479  int32_t dst_stride,
1480  const int8_t *filter_x,
1481  const int8_t *filter_y,
1482  int32_t height, int32_t width)
1483 {
1484  uint32_t loop_cnt, cnt;
1485  uint8_t *src_tmp;
1486  uint8_t *dst_tmp;
1487  v16u8 out;
1488  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1489  v8i16 filt0, filt1, filt2, filt3;
1490  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1491  v16i8 mask1, mask2, mask3;
1492  v8i16 filter_vec;
1493  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1494  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1495  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1496  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1497  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1498  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1499  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1500  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1501  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1502 
1503  src -= ((3 * src_stride) + 3);
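    /* 8-tap HV window: back up 3 rows and 3 columns from the first output pixel */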
1504 
1505  filter_vec = LD_SH(filter_x);
1506  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1507 
1508  filter_vec = LD_SH(filter_y);
1509  UNPCK_R_SB_SH(filter_vec, filter_vec);
1510 
1511  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1512 
1513  mask1 = mask0 + 2;
1514  mask2 = mask0 + 4;
1515  mask3 = mask0 + 6;
1516 
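    /* process the width in 8-column strips, keeping 7 horizontally
       filtered rows as vertical context per strip */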
1517  for (cnt = width >> 3; cnt--;) {
1518  src_tmp = src;
1519  dst_tmp = dst;
1520 
1521  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1522  src_tmp += (7 * src_stride);
1523  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1524 
1525  /* row 0 row 1 row 2 row 3 */
1526  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1527  vec0, vec1, vec2, vec3);
1528  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1529  vec4, vec5, vec6, vec7);
1530  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1531  vec8, vec9, vec10, vec11);
1532  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1533  vec12, vec13, vec14, vec15);
1534  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1535  filt3);
1536  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1537  filt3);
1538  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1539  filt3);
1540  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1541  filt2, filt3);
1542 
1543  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1544  vec0, vec1, vec2, vec3);
1545  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1546  vec4, vec5, vec6, vec7);
1547  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1548  vec8, vec9, vec10, vec11);
1549  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1550  filt3);
1551  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1552  filt3);
1553  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1554  filt3);
1555 
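        /* per pass: 2 new input rows in, 2 output rows out;
           dst0..dst6 carry the sliding vertical window */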
1556  for (loop_cnt = height >> 1; loop_cnt--;) {
1557  LD_SB2(src_tmp, src_stride, src7, src8);
1558  XORI_B2_128_SB(src7, src8);
1559  src_tmp += 2 * src_stride;
1560 
1561  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1562  dst10_r, dst32_r, dst54_r, dst21_r);
1563  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1564  dst10_l, dst32_l, dst54_l, dst21_l);
1565  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1566  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1567 
1568  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1569  vec0, vec1, vec2, vec3);
1570  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1571  filt2, filt3);
1572 
1573  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1574  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1575  filt_h0, filt_h1, filt_h2, filt_h3);
1576  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1577  filt_h0, filt_h1, filt_h2, filt_h3);
1578  dst0_r >>= 6;
1579  dst0_l >>= 6;
1580 
1581  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1582  vec0, vec1, vec2, vec3);
1583  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1584  filt2, filt3);
1585 
1586  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1587  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1588  filt_h0, filt_h1, filt_h2, filt_h3);
1589  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1590  filt_h0, filt_h1, filt_h2, filt_h3);
1591  dst1_r >>= 6;
1592  dst1_l >>= 6;
1593  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1594  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1595 
1596  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1597  out = PCKEV_XORI128_UB(dst0, dst1);
1598  ST8x2_UB(out, dst_tmp, dst_stride);
1599  dst_tmp += (2 * dst_stride);
1600 
1601  dst0 = dst2;
1602  dst1 = dst3;
1603  dst2 = dst4;
1604  dst3 = dst5;
1605  dst4 = dst6;
1606  dst5 = dst7;
1607  dst6 = dst8;
1608  }
1609 
1610  src += 8;
1611  dst += 8;
1612  }
1613 }
1614 
1615 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1616  int32_t src_stride,
1617  uint8_t *dst,
1618  int32_t dst_stride,
1619  const int8_t *filter_x,
1620  const int8_t *filter_y,
1621  int32_t height)
1622 {
1623  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1624  filter_x, filter_y, height, 8);
1625 }
1626 
1627 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1628  int32_t src_stride,
1629  uint8_t *dst,
1630  int32_t dst_stride,
1631  const int8_t *filter_x,
1632  const int8_t *filter_y,
1633  int32_t height)
1634 {
1635  uint32_t loop_cnt;
1636  uint8_t *src_tmp, *dst_tmp;
1637  v16u8 out0, out1;
1638  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1639  v16i8 src11, src12, src13, src14;
1640  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1641  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1642  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1643  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1644  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1645  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1646  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1647  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1648  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1649  v8i16 dst1413_r, dst87_l, filter_vec;
1650  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1651  v4i32 dst0_l, dst1_l;
1652 
1653  src -= ((3 * src_stride) + 3);
1654 
1655  filter_vec = LD_SH(filter_x);
1656  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1657 
1658  filter_vec = LD_SH(filter_y);
1659  UNPCK_R_SB_SH(filter_vec, filter_vec);
1660 
1661  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1662 
1663  mask0 = LD_SB(ff_hevc_mask_arr);
1664  mask1 = mask0 + 2;
1665  mask2 = mask0 + 4;
1666  mask3 = mask0 + 6;
1667 
1668  src_tmp = src;
1669  dst_tmp = dst;
1670 
1671  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1672  src_tmp += (7 * src_stride);
1673  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1674 
1675  /* row 0 row 1 row 2 row 3 */
1676  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1677  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1678  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1679  vec11);
1680  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1681  vec15);
1682  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1683  filt3);
1684  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1685  filt3);
1686  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1687  filt3);
1688  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1689  filt2, filt3);
1690 
1691  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1692  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1693  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1694  vec11);
1695  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1696  filt3);
1697  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1698  filt3);
1699  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1700  filt3);
1701 
1702  for (loop_cnt = 8; loop_cnt--;) {
1703  LD_SB2(src_tmp, src_stride, src7, src8);
1704  XORI_B2_128_SB(src7, src8);
1705  src_tmp += 2 * src_stride;
1706 
1707  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1708  dst32_r, dst54_r, dst21_r);
1709  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1710  dst32_l, dst54_l, dst21_l);
1711  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1712  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1713 
1714  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1715  vec3);
1716  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1717  filt3);
1718 
1719  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1720  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1721  filt_h0, filt_h1, filt_h2, filt_h3);
1722  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1723  filt_h0, filt_h1, filt_h2, filt_h3);
1724  dst0_r >>= 6;
1725  dst0_l >>= 6;
1726 
1727  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1728  vec3);
1729  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1730  filt3);
1731 
1732  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1733  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1734  filt_h0, filt_h1, filt_h2, filt_h3);
1735  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1736  filt_h0, filt_h1, filt_h2, filt_h3);
1737  dst1_r >>= 6;
1738  dst1_l >>= 6;
1739  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1740  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1741 
1742  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1743  out0 = PCKEV_XORI128_UB(dst0, dst1);
1744  ST8x2_UB(out0, dst_tmp, dst_stride);
1745  dst_tmp += (2 * dst_stride);
1746 
1747  dst0 = dst2;
1748  dst1 = dst3;
1749  dst2 = dst4;
1750  dst3 = dst5;
1751  dst4 = dst6;
1752  dst5 = dst7;
1753  dst6 = dst8;
1754  }
1755 
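    /* left 8 columns done; move to the remaining 4 columns of the 12-wide block */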
1756  src += 8;
1757  dst += 8;
1758 
1759  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1760  mask5 = mask4 + 2;
1761  mask6 = mask4 + 4;
1762  mask7 = mask4 + 6;
1763 
1764  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1765  src += (7 * src_stride);
1766  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1767 
1768  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1769  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1770  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1771  vec11);
1772  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1773  vec15);
1774 
1775  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1776  filt3);
1777  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1778  filt3);
1779  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1780  filt3);
1781  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1782  filt3);
1783 
1784  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1785  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1786  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1787 
1788  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1789 
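    /* 4-wide path packs two rows per vector; two 8-row passes cover the
       16-row block (the only height used for 12-wide uni prediction) */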
1790  for (loop_cnt = 2; loop_cnt--;) {
1791  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1792  src14);
1793  src += (8 * src_stride);
1794  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1795 
1796  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1797  vec3);
1798  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1799  vec7);
1800  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1801  vec11);
1802  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1803  vec14, vec15);
1804 
1805  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1806  filt3);
1807  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1808  filt3);
1809  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1810  filt2, filt3);
1811  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1812  filt2, filt3);
1813 
1814  dst76_r = __msa_ilvr_h(dst117, dst66);
1815  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1816  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1817  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1818  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1819  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1820 
1821  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1822  filt_h1, filt_h2, filt_h3);
1823  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1824  filt_h1, filt_h2, filt_h3);
1825  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1826  filt_h1, filt_h2, filt_h3);
1827  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1828  filt_h1, filt_h2, filt_h3);
1829  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1830  filt_h1, filt_h2, filt_h3);
1831  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1832  filt_h1, filt_h2, filt_h3);
1833  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1834  filt_h1, filt_h2, filt_h3);
1835  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1836  filt_h0, filt_h1, filt_h2, filt_h3);
1837 
1838  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1839  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1840  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1841  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1842  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1843  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1844  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1845  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1846  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1847  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1848  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
1849  dst += (4 * dst_stride);
1850  ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
1851  dst += (4 * dst_stride);
1852 
1853  dst10_r = dst98_r;
1854  dst32_r = dst1110_r;
1855  dst54_r = dst1312_r;
1856  dst21_r = dst109_r;
1857  dst43_r = dst1211_r;
1858  dst65_r = dst1413_r;
1859  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1860  }
1861 }
1862 
1863 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1864  int32_t src_stride,
1865  uint8_t *dst,
1866  int32_t dst_stride,
1867  const int8_t *filter_x,
1868  const int8_t *filter_y,
1869  int32_t height)
1870 {
1871  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1872  filter_x, filter_y, height, 16);
1873 }
1874 
1875 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1876  int32_t src_stride,
1877  uint8_t *dst,
1878  int32_t dst_stride,
1879  const int8_t *filter_x,
1880  const int8_t *filter_y,
1881  int32_t height)
1882 {
1883  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1884  filter_x, filter_y, height, 24);
1885 }
1886 
1887 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1888  int32_t src_stride,
1889  uint8_t *dst,
1890  int32_t dst_stride,
1891  const int8_t *filter_x,
1892  const int8_t *filter_y,
1893  int32_t height)
1894 {
1895  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1896  filter_x, filter_y, height, 32);
1897 }
1898 
1899 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1900  int32_t src_stride,
1901  uint8_t *dst,
1902  int32_t dst_stride,
1903  const int8_t *filter_x,
1904  const int8_t *filter_y,
1905  int32_t height)
1906 {
1907  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1908  filter_x, filter_y, height, 48);
1909 }
1910 
1911 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1912  int32_t src_stride,
1913  uint8_t *dst,
1914  int32_t dst_stride,
1915  const int8_t *filter_x,
1916  const int8_t *filter_y,
1917  int32_t height)
1918 {
1919  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1920  filter_x, filter_y, height, 64);
1921 }
1922 
1923 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1924  uint8_t *dst, int32_t dst_stride,
1925  const int8_t *filter)
1926 {
1927  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1928  v16u8 out;
1929  v8i16 filt, res0;
1930 
1931  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1932  src -= 1;
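    /* 4-tap horizontal filter taps columns -1..2, hence the one-column back-step */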
1933 
1934  /* rearranging filter */
1935  filt = LD_SH(filter);
1936  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1937 
1938  mask1 = mask0 + 2;
1939 
1940  LD_SB2(src, src_stride, src0, src1);
1941  XORI_B2_128_SB(src0, src1);
1942  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1943  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1944  res0 = __msa_srari_h(res0, 6);
1945  res0 = __msa_sat_s_h(res0, 7);
1946  out = PCKEV_XORI128_UB(res0, res0);
1947  ST4x2_UB(out, dst, dst_stride);
1948 }
1949 
1950 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1951  uint8_t *dst, int32_t dst_stride,
1952  const int8_t *filter)
1953 {
1954  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1955  v8i16 filt, out0, out1;
1956  v16u8 out;
1957 
1958  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1959  src -= 1;
1960 
1961  /* rearranging filter */
1962  filt = LD_SH(filter);
1963  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1964 
1965  mask1 = mask0 + 2;
1966 
1967  LD_SB4(src, src_stride, src0, src1, src2, src3);
1968  XORI_B4_128_SB(src0, src1, src2, src3);
1969  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1970  filt0, filt1, out0, out1);
1971  SRARI_H2_SH(out0, out1, 6);
1972  SAT_SH2_SH(out0, out1, 7);
1973  out = PCKEV_XORI128_UB(out0, out1);
1974  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1975 }
1976 
1977 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1978  uint8_t *dst, int32_t dst_stride,
1979  const int8_t *filter)
1980 {
1981  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1982  v16u8 out;
1983  v8i16 filt, out0, out1, out2, out3;
1984 
1985  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1986  src -= 1;
1987 
1988  /* rearranging filter */
1989  filt = LD_SH(filter);
1990  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1991 
1992  mask1 = mask0 + 2;
1993 
1994  LD_SB4(src, src_stride, src0, src1, src2, src3);
1995  src += (4 * src_stride);
1996 
1997  XORI_B4_128_SB(src0, src1, src2, src3);
1998  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1999  filt0, filt1, out0, out1);
2000  LD_SB4(src, src_stride, src0, src1, src2, src3);
2001  XORI_B4_128_SB(src0, src1, src2, src3);
2002  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2003  filt0, filt1, out2, out3);
2004  SRARI_H4_SH(out0, out1, out2, out3, 6);
2005  SAT_SH4_SH(out0, out1, out2, out3, 7);
2006  out = PCKEV_XORI128_UB(out0, out1);
2007  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2008  dst += (4 * dst_stride);
2009  out = PCKEV_XORI128_UB(out2, out3);
2010  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2011 }
2012 
2013 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2014  uint8_t *dst, int32_t dst_stride,
2015  const int8_t *filter)
2016 {
2017  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2018  v16i8 filt0, filt1, mask0, mask1;
2019  v16u8 out;
2020  v8i16 filt, out0, out1, out2, out3;
2021 
2022  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2023  src -= 1;
2024 
2025  /* rearranging filter */
2026  filt = LD_SH(filter);
2027  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2028 
2029  mask1 = mask0 + 2;
2030 
2031  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2032  src += (8 * src_stride);
2033  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2034  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2035  filt0, filt1, out0, out1);
2036  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2037  filt0, filt1, out2, out3);
2038  SRARI_H4_SH(out0, out1, out2, out3, 6);
2039  SAT_SH4_SH(out0, out1, out2, out3, 7);
2040  out = PCKEV_XORI128_UB(out0, out1);
2041  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2042  dst += (4 * dst_stride);
2043  out = PCKEV_XORI128_UB(out2, out3);
2044  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2045  dst += (4 * dst_stride);
2046 
2047  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2048  src += (8 * src_stride);
2049  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2050  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2051  filt0, filt1, out0, out1);
2052  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2053  filt0, filt1, out2, out3);
2054  SRARI_H4_SH(out0, out1, out2, out3, 6);
2055  SAT_SH4_SH(out0, out1, out2, out3, 7);
2056  out = PCKEV_XORI128_UB(out0, out1);
2057  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2058  dst += (4 * dst_stride);
2059  out = PCKEV_XORI128_UB(out2, out3);
2060  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2061 }
2062 
2063 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2064  uint8_t *dst, int32_t dst_stride,
2065  const int8_t *filter, int32_t height)
2066 {
2067  if (2 == height) {
2068  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2069  } else if (4 == height) {
2070  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2071  } else if (8 == height) {
2072  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2073  } else if (16 == height) {
2074  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2075  }
2076 }
2077 
2078 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2079  uint8_t *dst, int32_t dst_stride,
2080  const int8_t *filter, int32_t height)
2081 {
2082  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2083  v16u8 out4, out5;
2084  v8i16 filt, out0, out1, out2, out3;
2085 
2086  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2087  src -= 1;
2088 
2089  /* rearranging filter */
2090  filt = LD_SH(filter);
2091  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2092 
2093  mask1 = mask0 + 2;
2094 
2095  LD_SB4(src, src_stride, src0, src1, src2, src3);
2096  src += (4 * src_stride);
2097 
2098  XORI_B4_128_SB(src0, src1, src2, src3);
2099  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2100  filt1, out0, out1, out2, out3);
2101  SRARI_H4_SH(out0, out1, out2, out3, 6);
2102  SAT_SH4_SH(out0, out1, out2, out3, 7);
2103  out4 = PCKEV_XORI128_UB(out0, out1);
2104  out5 = PCKEV_XORI128_UB(out2, out3);
2105  ST6x4_UB(out4, out5, dst, dst_stride);
2106  dst += (4 * dst_stride);
2107 
2108  LD_SB4(src, src_stride, src0, src1, src2, src3);
2109  src += (4 * src_stride);
2110 
2111  XORI_B4_128_SB(src0, src1, src2, src3);
2112  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2113  filt1, out0, out1, out2, out3);
2114  SRARI_H4_SH(out0, out1, out2, out3, 6);
2115  SAT_SH4_SH(out0, out1, out2, out3, 7);
2116  out4 = PCKEV_XORI128_UB(out0, out1);
2117  out5 = PCKEV_XORI128_UB(out2, out3);
2118  ST6x4_UB(out4, out5, dst, dst_stride);
2119  dst += (4 * dst_stride);
2120 }
2121 
2122 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2123  uint8_t *dst, int32_t dst_stride,
2124  const int8_t *filter, int32_t height)
2125 {
2126  uint32_t loop_cnt;
2127  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2128  v16u8 out;
2129  v8i16 filt, vec0, vec1, vec2, vec3;
2130 
2131  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2132  src -= 1;
2133 
2134  filt = LD_SH(filter);
2135  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2136 
2137  mask1 = mask0 + 2;
2138 
2139  for (loop_cnt = (height >> 1); loop_cnt--;) {
2140  LD_SB2(src, src_stride, src0, src1);
2141  src += (2 * src_stride);
2142 
2143  XORI_B2_128_SB(src0, src1);
2144  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2145  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2146  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2147  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2148  SRARI_H2_SH(vec0, vec1, 6);
2149  SAT_SH2_SH(vec0, vec1, 7);
2150  out = PCKEV_XORI128_UB(vec0, vec1);
2151  ST8x2_UB(out, dst, dst_stride);
2152  dst += (2 * dst_stride);
2153  }
2154 }
2155 
2156 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2157  uint8_t *dst, int32_t dst_stride,
2158  const int8_t *filter, int32_t height)
2159 {
2160  uint32_t loop_cnt;
2161  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2162  v16u8 tmp0, tmp1;
2163  v8i16 filt, out0, out1, out2, out3;
2164 
2165  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2166  src -= 1;
2167 
2168  /* rearranging filter */
2169  filt = LD_SH(filter);
2170  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2171 
2172  mask1 = mask0 + 2;
2173 
2174  for (loop_cnt = (height >> 2); loop_cnt--;) {
2175  LD_SB4(src, src_stride, src0, src1, src2, src3);
2176  src += (4 * src_stride);
2177 
2178  XORI_B4_128_SB(src0, src1, src2, src3);
2179  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2180  filt1, out0, out1, out2, out3);
2181  SRARI_H4_SH(out0, out1, out2, out3, 6);
2182  SAT_SH4_SH(out0, out1, out2, out3, 7);
2183  tmp0 = PCKEV_XORI128_UB(out0, out1);
2184  tmp1 = PCKEV_XORI128_UB(out2, out3);
2185  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2186  dst += (4 * dst_stride);
2187  }
2188 }
2189 
2190 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2191  uint8_t *dst, int32_t dst_stride,
2192  const int8_t *filter, int32_t height)
2193 {
2194  if ((2 == height) || (6 == height)) {
2195  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2196  height);
2197  } else {
2198  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2199  height);
2200  }
2201 }
2202 
2203 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2204  uint8_t *dst, int32_t dst_stride,
2205  const int8_t *filter, int32_t height)
2206 {
2207  uint32_t loop_cnt;
2208  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2209  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2210  v16i8 vec10, vec11;
2211  v16u8 tmp0, tmp1;
2212  v8i16 filt, out0, out1, out2, out3, out4, out5;
2213 
2214  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2215  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2216 
2217  src -= 1;
2218 
2219  /* rearranging filter */
2220  filt = LD_SH(filter);
2221  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2222 
2223  mask1 = mask0 + 2;
2224  mask3 = mask2 + 2;
2225 
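    /* columns 8..11 of two rows are packed per vector via mask2/mask3;
       columns 0..7 are handled with mask0/mask1 below */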
2226  for (loop_cnt = 4; loop_cnt--;) {
2227  LD_SB4(src, src_stride, src0, src1, src2, src3);
2228  src += (4 * src_stride);
2229 
2230  XORI_B4_128_SB(src0, src1, src2, src3);
2231  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2232  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2233  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2234  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2235  SRARI_H2_SH(out0, out1, 6);
2236  SAT_SH2_SH(out0, out1, 7);
2237  tmp0 = PCKEV_XORI128_UB(out0, out1);
2238  ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2239 
2240  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2241  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2242  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2243  out2, out3, out4, out5);
2244  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2245  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2246  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2247  out2, out3, out4, out5);
2248  SRARI_H4_SH(out2, out3, out4, out5, 6);
2249  SAT_SH4_SH(out2, out3, out4, out5, 7);
2250  tmp0 = PCKEV_XORI128_UB(out2, out3);
2251  tmp1 = PCKEV_XORI128_UB(out4, out5);
2252  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2253  dst += (4 * dst_stride);
2254  }
2255 }
2256 
2257 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2258  uint8_t *dst, int32_t dst_stride,
2259  const int8_t *filter, int32_t height)
2260 {
2261  uint32_t loop_cnt;
2262  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2263  v16i8 filt0, filt1, mask0, mask1;
2264  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2265  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2266  v16u8 out;
2267 
2268  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2269  src -= 1;
2270 
2271  /* rearranging filter */
2272  filt = LD_SH(filter);
2273  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2274 
2275  mask1 = mask0 + 2;
2276 
2277  for (loop_cnt = (height >> 2); loop_cnt--;) {
2278  LD_SB4(src, src_stride, src0, src2, src4, src6);
2279  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2280  src += (4 * src_stride);
2281 
2282  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2283 
2284  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2285  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2286  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2287  out0, out1, out2, out3);
2288  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2289  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2290  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2291  out0, out1, out2, out3);
2292  SRARI_H4_SH(out0, out1, out2, out3, 6);
2293  SAT_SH4_SH(out0, out1, out2, out3, 7);
2294  out = PCKEV_XORI128_UB(out0, out1);
2295  ST_UB(out, dst);
2296  dst += dst_stride;
2297  out = PCKEV_XORI128_UB(out2, out3);
2298  ST_UB(out, dst);
2299  dst += dst_stride;
2300 
2301  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2302  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2303  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2304  out4, out5, out6, out7);
2305  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2306  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2307  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2308  out4, out5, out6, out7);
2309  SRARI_H4_SH(out4, out5, out6, out7, 6);
2310  SAT_SH4_SH(out4, out5, out6, out7, 7);
2311  out = PCKEV_XORI128_UB(out4, out5);
2312  ST_UB(out, dst);
2313  dst += dst_stride;
2314  out = PCKEV_XORI128_UB(out6, out7);
2315  ST_UB(out, dst);
2316  dst += dst_stride;
2317  }
2318 }
2319 
2320 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2321  uint8_t *dst, int32_t dst_stride,
2322  const int8_t *filter, int32_t height)
2323 {
2324  uint8_t *dst1 = dst + 16;
2325  uint32_t loop_cnt;
2326  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2327  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2328  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2329  v8i16 filt, out0, out1, out2, out3;
2330  v16u8 tmp0, tmp1;
2331 
2332  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2333  src -= 1;
2334 
2335  /* rearranging filter */
2336  filt = LD_SH(filter);
2337  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2338 
2339  mask1 = mask0 + 2;
2340  mask00 = mask0 + 8;
2341  mask11 = mask0 + 10;
2342 
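    /* 24 width: mask00/mask11 pick bytes straddling the two 16-byte loads
       for columns 8..15; the last 8 columns are written to dst1 */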
2343  for (loop_cnt = 8; loop_cnt--;) {
2344  LD_SB4(src, src_stride, src0, src2, src4, src6);
2345  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2346  src += (4 * src_stride);
2347 
2348  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2349  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2350  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2351  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2352  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2353  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2354  out0, out1, out2, out3);
2355  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2356  out0, out1, out2, out3);
2357  SRARI_H4_SH(out0, out1, out2, out3, 6);
2358  SAT_SH4_SH(out0, out1, out2, out3, 7);
2359  tmp0 = PCKEV_XORI128_UB(out0, out1);
2360  ST_UB(tmp0, dst);
2361  dst += dst_stride;
2362  tmp0 = PCKEV_XORI128_UB(out2, out3);
2363  ST_UB(tmp0, dst);
2364  dst += dst_stride;
2365 
2366  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2367  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2368  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2369  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2370  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2371  out0, out1, out2, out3);
2372  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2373  out0, out1, out2, out3);
2374  SRARI_H4_SH(out0, out1, out2, out3, 6);
2375  SAT_SH4_SH(out0, out1, out2, out3, 7);
2376  tmp0 = PCKEV_XORI128_UB(out0, out1);
2377  ST_UB(tmp0, dst);
2378  dst += dst_stride;
2379  tmp0 = PCKEV_XORI128_UB(out2, out3);
2380  ST_UB(tmp0, dst);
2381  dst += dst_stride;
2382 
2383  /* 8 width */
2384  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2385  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2386  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2387  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2388 
2389  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2390  out0, out1, out2, out3);
2391  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2392  out0, out1, out2, out3);
2393 
2394  SRARI_H4_SH(out0, out1, out2, out3, 6);
2395  SAT_SH4_SH(out0, out1, out2, out3, 7);
2396  tmp0 = PCKEV_XORI128_UB(out0, out1);
2397  tmp1 = PCKEV_XORI128_UB(out2, out3);
2398  ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2399  dst1 += (4 * dst_stride);
2400  }
2401 }
2402 
2403 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2404  uint8_t *dst, int32_t dst_stride,
2405  const int8_t *filter, int32_t height)
2406 {
2407  uint32_t loop_cnt;
2408  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2409  v16i8 filt0, filt1, mask0, mask1;
2410  v16u8 out;
2411  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2412  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2413 
2414  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2415  src -= 1;
2416 
2417  /* rearranging filter */
2418  filt = LD_SH(filter);
2419  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2420 
2421  mask1 = mask0 + 2;
2422 
2423  for (loop_cnt = (height >> 1); loop_cnt--;) {
2424  src0 = LD_SB(src);
2425  src1 = LD_SB(src + 8);
2426  src2 = LD_SB(src + 16);
2427  src3 = LD_SB(src + 24);
2428  src += src_stride;
2429  src4 = LD_SB(src);
2430  src5 = LD_SB(src + 8);
2431  src6 = LD_SB(src + 16);
2432  src7 = LD_SB(src + 24);
2433  src += src_stride;
2434 
2435  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2436 
2437  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2438  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2439  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2440  out0, out1, out2, out3);
2441  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2442  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2443  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2444  out0, out1, out2, out3);
2445 
2446  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2447  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2448  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2449  out4, out5, out6, out7);
2450  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2451  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2452  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2453  out4, out5, out6, out7);
2454  SRARI_H4_SH(out0, out1, out2, out3, 6);
2455  SRARI_H4_SH(out4, out5, out6, out7, 6);
2456  SAT_SH4_SH(out0, out1, out2, out3, 7);
2457  SAT_SH4_SH(out4, out5, out6, out7, 7);
2458  out = PCKEV_XORI128_UB(out0, out1);
2459  ST_UB(out, dst);
2460  out = PCKEV_XORI128_UB(out2, out3);
2461  ST_UB(out, dst + 16);
2462  dst += dst_stride;
2463  out = PCKEV_XORI128_UB(out4, out5);
2464  ST_UB(out, dst);
2465  out = PCKEV_XORI128_UB(out6, out7);
2466  ST_UB(out, dst + 16);
2467  dst += dst_stride;
2468  }
2469 }
2470 
2471 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2472  uint8_t *dst, int32_t dst_stride,
2473  const int8_t *filter)
2474 {
2475  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2476  v16i8 src2110, src4332, filt0, filt1;
2477  v16u8 out;
2478  v8i16 filt, out10;
2479 
2480  src -= src_stride;
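    /* 4-tap vertical filter taps rows -1..2, hence the one-row back-step */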
2481 
2482  filt = LD_SH(filter);
2483  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2484 
2485  LD_SB3(src, src_stride, src0, src1, src2);
2486  src += (3 * src_stride);
2487 
2488  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2489  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2490  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2491  LD_SB2(src, src_stride, src3, src4);
2492  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2493  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2494  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2495  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2496  out10 = __msa_srari_h(out10, 6);
2497  out10 = __msa_sat_s_h(out10, 7);
2498  out = PCKEV_XORI128_UB(out10, out10);
2499  ST4x2_UB(out, dst, dst_stride);
2500 }
2501 
2502 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2503  uint8_t *dst, int32_t dst_stride,
2504  const int8_t *filter, int32_t height)
2505 {
2506  uint32_t loop_cnt;
2507  v16i8 src0, src1, src2, src3, src4, src5;
2508  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2509  v16i8 src2110, src4332, filt0, filt1;
2510  v8i16 filt, out10, out32;
2511  v16u8 out;
2512 
2513  src -= src_stride;
2514 
2515  filt = LD_SH(filter);
2516  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2517 
2518  LD_SB3(src, src_stride, src0, src1, src2);
2519  src += (3 * src_stride);
2520 
2521  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2522 
2523  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2524  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2525 
2526  for (loop_cnt = (height >> 2); loop_cnt--;) {
2527  LD_SB3(src, src_stride, src3, src4, src5);
2528  src += (3 * src_stride);
2529  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2530  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2531  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2532  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2533 
2534  src2 = LD_SB(src);
2535  src += (src_stride);
2536  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2537  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2538  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2539  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2540  SRARI_H2_SH(out10, out32, 6);
2541  SAT_SH2_SH(out10, out32, 7);
2542  out = PCKEV_XORI128_UB(out10, out32);
2543  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2544  dst += (4 * dst_stride);
2545  }
2546 }
2547 
2548 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2549  uint8_t *dst, int32_t dst_stride,
2550  const int8_t *filter, int32_t height)
2551 {
2552  if (2 == height) {
2553  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2554  } else {
2555  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2556  height);
2557  }
2558 }
2559 
2560 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2561  uint8_t *dst, int32_t dst_stride,
2562  const int8_t *filter, int32_t height)
2563 {
2564  v16u8 out0, out1;
2565  v16i8 src0, src1, src2, src3, src4, src5, src6;
2566  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2567  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2568 
2569  src -= src_stride;
2570 
2571  filter_vec = LD_SH(filter);
2572  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2573 
2574  LD_SB3(src, src_stride, src0, src1, src2);
2575  src += (3 * src_stride);
2576  XORI_B3_128_SB(src0, src1, src2);
2577  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2578 
2579  LD_SB2(src, src_stride, src3, src4);
2580  src += (2 * src_stride);
2581  XORI_B2_128_SB(src3, src4);
2582  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2583 
2584  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2585  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2586 
2587  LD_SB2(src, src_stride, src5, src6);
2588  src += (2 * src_stride);
2589  XORI_B2_128_SB(src5, src6);
2590  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2591 
2592  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2593  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2594 
2595  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2596  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2597  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2598  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2599  ST6x4_UB(out0, out1, dst, dst_stride);
2600  dst += (4 * dst_stride);
2601 
2602  LD_SB2(src, src_stride, src3, src4);
2603  src += (2 * src_stride);
2604  XORI_B2_128_SB(src3, src4);
2605  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2606 
2607  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2608  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2609 
2610  LD_SB2(src, src_stride, src5, src6);
2611  src += (2 * src_stride);
2612  XORI_B2_128_SB(src5, src6);
2613  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2614 
2615  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2616  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2617 
2618  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2619  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2620  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2621  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2622  ST6x4_UB(out0, out1, dst, dst_stride);
2623 }
2624 
2625 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2626  uint8_t *dst, int32_t dst_stride,
2627  const int8_t *filter)
2628 {
2629  v16i8 src0, src1, src2, src3, src4;
2630  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2631  v16u8 out;
2632 
2633  src -= src_stride;
2634 
2635  /* rearranging filter_y */
2636  filt = LD_SH(filter);
2637  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2638 
2639  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2640  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2641  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2642  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2643  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2644  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2645  SRARI_H2_SH(tmp0, tmp1, 6);
2646  SAT_SH2_SH(tmp0, tmp1, 7);
2647  out = PCKEV_XORI128_UB(tmp0, tmp1);
2648  ST8x2_UB(out, dst, dst_stride);
2649 }
2650 
2651 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2652  uint8_t *dst, int32_t dst_stride,
2653  const int8_t *filter)
2654 {
2655  uint32_t loop_cnt;
2656  uint64_t out0, out1, out2;
2657  v16i8 src0, src1, src2, src3, src4, src5;
2658  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2659  v8i16 filt, filt0, filt1;
2660 
2661  src -= src_stride;
2662 
2663  /* rearranging filter_y */
2664  filt = LD_SH(filter);
2665  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2666 
2667  LD_SB3(src, src_stride, src0, src1, src2);
2668  src += (3 * src_stride);
2669 
2670  XORI_B3_128_SB(src0, src1, src2);
2671  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2672 
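    /* each pass consumes 3 new rows and emits 3; two passes give the 6-row height */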
2673  for (loop_cnt = 2; loop_cnt--;) {
2674  LD_SB3(src, src_stride, src3, src4, src5);
2675  src += (3 * src_stride);
2676 
2677  XORI_B3_128_SB(src3, src4, src5);
2678  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2679  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2680  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2681  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2682  SRARI_H2_SH(tmp0, tmp1, 6);
2683  tmp2 = __msa_srari_h(tmp2, 6);
2684  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2685  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2686  XORI_B2_128_SH(tmp0, tmp2);
2687 
2688  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2689  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2690  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2691  SD(out0, dst);
2692  dst += dst_stride;
2693  SD(out1, dst);
2694  dst += dst_stride;
2695  SD(out2, dst);
2696  dst += dst_stride;
2697 
2698  src2 = src5;
2699  vec0 = vec3;
2700  vec2 = vec4;
2701  }
2702 }
2703 
2704 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2705  uint8_t *dst, int32_t dst_stride,
2706  const int8_t *filter, int32_t height)
2707 {
2708  uint32_t loop_cnt;
2709  v16i8 src0, src1, src2, src7, src8, src9, src10;
2710  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2711  v16u8 tmp0, tmp1;
2712  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2713 
2714  src -= src_stride;
2715 
2716  filt = LD_SH(filter);
2717  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2718 
2719  LD_SB3(src, src_stride, src0, src1, src2);
2720  src += (3 * src_stride);
2721 
2722  XORI_B3_128_SB(src0, src1, src2);
2723  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2724 
2725  for (loop_cnt = (height >> 2); loop_cnt--;) {
2726  LD_SB4(src, src_stride, src7, src8, src9, src10);
2727  src += (4 * src_stride);
2728 
2729  XORI_B4_128_SB(src7, src8, src9, src10);
2730  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2731  src72_r, src87_r, src98_r, src109_r);
2732  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2733  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2734  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2735  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2736  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2737  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2738  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2739  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2740  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2741  dst += (4 * dst_stride);
2742 
2743  src10_r = src98_r;
2744  src21_r = src109_r;
2745  src2 = src10;
2746  }
2747 }
2748 
2749 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2750  uint8_t *dst, int32_t dst_stride,
2751  const int8_t *filter, int32_t height)
2752 {
2753  if (2 == height) {
2754  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2755  } else if (6 == height) {
2756  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2757  } else {
2758  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2759  filter, height);
2760  }
2761 }
2762 
2763 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2764  uint8_t *dst, int32_t dst_stride,
2765  const int8_t *filter, int32_t height)
2766 {
2767  uint32_t loop_cnt;
2768  v16i8 src0, src1, src2, src3, src4, src5, src6;
2769  v16u8 out0, out1;
2770  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2771  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2772  v16i8 src2110, src4332, src6554;
2773  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2774  v8i16 filter_vec;
2775 
2776  src -= (1 * src_stride);
2777 
2778  filter_vec = LD_SH(filter);
2779  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2780 
2781  LD_SB3(src, src_stride, src0, src1, src2);
2782  src += (3 * src_stride);
2783 
2784  XORI_B3_128_SB(src0, src1, src2);
2785  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2786  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2787  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2788 
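    /* columns 0..7 use the right-interleaved rows; columns 8..11 pack two
       row-pairs of the left halves into one vector (src2110, src4332, src6554) */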
2789  for (loop_cnt = 4; loop_cnt--;) {
2790  LD_SB4(src, src_stride, src3, src4, src5, src6);
2791  src += (4 * src_stride);
2792 
2793  XORI_B4_128_SB(src3, src4, src5, src6);
2794  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2795  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2796  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2797  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2798  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2799  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2800 
2801  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2802  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2803  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2804  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2805  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2806  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2807 
2808  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2809  SRARI_H2_SH(dst0_l, dst1_l, 6);
2810  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2811  SAT_SH2_SH(dst0_l, dst1_l, 7);
2812  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2813  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2814  ST8x4_UB(out0, out1, dst, dst_stride);
2815  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2816  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2817  dst += (4 * dst_stride);
2818 
2819  src2 = src6;
2820  src10_r = src54_r;
2821  src21_r = src65_r;
2822  src2110 = src6554;
2823  }
2824 }
2825 
2826 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2827  uint8_t *dst, int32_t dst_stride,
2828  const int8_t *filter, int32_t height)
2829 {
2830  uint32_t loop_cnt;
2831  v16i8 src0, src1, src2, src3, src4, src5, src6;
2832  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2833  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2834  v16u8 tmp0, tmp1, tmp2, tmp3;
2835  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2836 
2837  src -= src_stride;
2838 
2839  filt = LD_SH(filter);
2840  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2841 
2842  LD_SB3(src, src_stride, src0, src1, src2);
2843  src += (3 * src_stride);
2844 
2845  XORI_B3_128_SB(src0, src1, src2);
2846  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2847  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2848 
2849  for (loop_cnt = (height >> 2); loop_cnt--;) {
2850  LD_SB4(src, src_stride, src3, src4, src5, src6);
2851  src += (4 * src_stride);
2852 
2853  XORI_B4_128_SB(src3, src4, src5, src6);
2854  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2855  src32_r, src43_r, src54_r, src65_r);
2856  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2857  src32_l, src43_l, src54_l, src65_l);
2858  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2859  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2860  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2861  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2862  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2863  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2864  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2865  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2866  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2867  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2868  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2869  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2870  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2871  out3_r, tmp0, tmp1, tmp2, tmp3);
2872  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2873  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2874  dst += (4 * dst_stride);
2875 
2876  src10_r = src54_r;
2877  src21_r = src65_r;
2878  src10_l = src54_l;
2879  src21_l = src65_l;
2880  src2 = src6;
2881  }
2882 }
2883 
2884 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2885  uint8_t *dst, int32_t dst_stride,
2886  const int8_t *filter, int32_t height)
2887 {
2888  uint32_t loop_cnt;
2889  uint64_t out0, out1;
2890  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2891  v16i8 src11, filt0, filt1;
2892  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2893  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2894  v16u8 out;
2895  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2896 
2897  src -= src_stride;
2898 
2899  filt = LD_SH(filter);
2900  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2901 
2902  /* 16 width */
2903  LD_SB3(src, src_stride, src0, src1, src2);
2904  XORI_B3_128_SB(src0, src1, src2);
2905  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2906  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2907 
2908  /* 8 width */
2909  LD_SB3(src + 16, src_stride, src6, src7, src8);
2910  src += (3 * src_stride);
2911  XORI_B3_128_SB(src6, src7, src8);
2912  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2913 
2914  for (loop_cnt = 8; loop_cnt--;) {
2915  /* 16 width */
2916  LD_SB2(src, src_stride, src3, src4);
2917  XORI_B2_128_SB(src3, src4);
2918  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2919  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2920 
2921  /* 8 width */
2922  LD_SB2(src + 16, src_stride, src9, src10);
2923  src += (2 * src_stride);
2924  XORI_B2_128_SB(src9, src10);
2925  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2926 
2927  /* 16 width */
2928  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2929  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2930  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2931  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2932 
2933  /* 8 width */
2934  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2935  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2936 
2937  /* 16 + 8 width */
2938  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2939  SRARI_H2_SH(out0_l, out1_l, 6);
2940  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2941  SAT_SH2_SH(out0_l, out1_l, 7);
2942  out = PCKEV_XORI128_UB(out0_r, out0_l);
2943  ST_UB(out, dst);
2944  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2945  XORI_B2_128_SH(out2_r, out3_r);
2946  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2947  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2948  SD(out0, dst + 16);
2949  dst += dst_stride;
2950  out = PCKEV_XORI128_UB(out1_r, out1_l);
2951  ST_UB(out, dst);
2952  SD(out1, dst + 16);
2953  dst += dst_stride;
2954 
2955  /* 16 width */
2956  LD_SB2(src, src_stride, src5, src2);
2957  XORI_B2_128_SB(src5, src2);
2958  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2959  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2960 
2961  /* 8 width */
2962  LD_SB2(src + 16, src_stride, src11, src8);
2963  src += (2 * src_stride);
2964  XORI_B2_128_SB(src11, src8);
2965  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2966 
2967  /* 16 width */
2968  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2969  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2970  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2971  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2972 
2973  /* 8 width */
2974  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2975  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2976 
2977  /* 16 + 8 width */
2978  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2979  SRARI_H2_SH(out0_l, out1_l, 6);
2980  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2981  SAT_SH2_SH(out0_l, out1_l, 7);
2982  out = PCKEV_XORI128_UB(out0_r, out0_l);
2983  ST_UB(out, dst);
2984  out = PCKEV_XORI128_UB(out2_r, out2_r);
2985  ST8x1_UB(out, dst + 16);
2986  dst += dst_stride;
2987  out = PCKEV_XORI128_UB(out1_r, out1_l);
2988  ST_UB(out, dst);
2989  out = PCKEV_XORI128_UB(out3_r, out3_r);
2990  ST8x1_UB(out, dst + 16);
2991  dst += dst_stride;
2992  }
2993 }
2994 
2995 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2996  uint8_t *dst, int32_t dst_stride,
2997  const int8_t *filter, int32_t height)
2998 {
2999  uint32_t loop_cnt;
3000  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3001  v16i8 src10_r, src32_r, src76_r, src98_r;
3002  v16i8 src21_r, src43_r, src87_r, src109_r;
3003  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3004  v16i8 src10_l, src32_l, src76_l, src98_l;
3005  v16i8 src21_l, src43_l, src87_l, src109_l;
3006  v8i16 filt;
3007  v16i8 filt0, filt1;
3008  v16u8 out;
3009 
3010  src -= src_stride;
3011 
3012  filt = LD_SH(filter);
3013  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3014 
3015  /* 16 width */
3016  LD_SB3(src, src_stride, src0, src1, src2);
3017  XORI_B3_128_SB(src0, src1, src2);
3018 
3019  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3020  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3021 
3022  /* next 16 width */
3023  LD_SB3(src + 16, src_stride, src6, src7, src8);
3024  src += (3 * src_stride);
3025 
3026  XORI_B3_128_SB(src6, src7, src8);
3027  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3028  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3029 
3030  for (loop_cnt = (height >> 1); loop_cnt--;) {
3031  /* 16 width */
3032  LD_SB2(src, src_stride, src3, src4);
3033  XORI_B2_128_SB(src3, src4);
3034  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3035  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3036 
3037  /* 16 width */
3038  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3039  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3040  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3041  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3042 
3043  /* 16 width */
3044  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3045  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3046  out = PCKEV_XORI128_UB(out0_r, out0_l);
3047  ST_UB(out, dst);
3048  out = PCKEV_XORI128_UB(out1_r, out1_l);
3049  ST_UB(out, dst + dst_stride);
3050 
3051  src10_r = src32_r;
3052  src21_r = src43_r;
3053  src10_l = src32_l;
3054  src21_l = src43_l;
3055  src2 = src4;
3056 
3057  /* next 16 width */
3058  LD_SB2(src + 16, src_stride, src9, src10);
3059  src += (2 * src_stride);
3060  XORI_B2_128_SB(src9, src10);
3061  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3062  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3063 
3064  /* next 16 width */
3065  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3066  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3067  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3068  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3069 
3070  /* next 16 width */
3071  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3072  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3073  out = PCKEV_XORI128_UB(out2_r, out2_l);
3074  ST_UB(out, dst + 16);
3075  out = PCKEV_XORI128_UB(out3_r, out3_l);
3076  ST_UB(out, dst + 16 + dst_stride);
3077 
3078  dst += 2 * dst_stride;
3079 
3080  src76_r = src98_r;
3081  src87_r = src109_r;
3082  src76_l = src98_l;
3083  src87_l = src109_l;
3084  src8 = src10;
3085  }
3086 }
3087 
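/* Horizontal + vertical (hv) 4-tap filter, 4x2 output block. */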
3088 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3089  int32_t src_stride,
3090  uint8_t *dst,
3091  int32_t dst_stride,
3092  const int8_t *filter_x,
3093  const int8_t *filter_y)
3094 {
3095  v16u8 out;
3096  v16i8 src0, src1, src2, src3, src4;
3097  v8i16 filt0, filt1;
3098  v8i16 filt_h0, filt_h1;
3099  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3100  v16i8 mask1;
3101  v8i16 filter_vec, tmp;
3102  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3103  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3104  v4i32 dst0, dst1;
3105 
3106  src -= (src_stride + 1);
3107 
3108  filter_vec = LD_SH(filter_x);
3109  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3110 
3111  filter_vec = LD_SH(filter_y);
3112  UNPCK_R_SB_SH(filter_vec, filter_vec);
3113 
3114  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3115 
3116  mask1 = mask0 + 2;
3117 
3118  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3119  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120 
3121  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3122  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3123  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3124 
3125  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3126  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3127  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3128 
3129  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3130  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3131 
3132  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3133  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3134  dst0 >>= 6;
3135  dst1 >>= 6;
3136  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3137  tmp = __msa_srari_h(tmp, 6);
3138  tmp = __msa_sat_s_h(tmp, 7);
3139  out = PCKEV_XORI128_UB(tmp, tmp);
3140  ST4x2_UB(out, dst, dst_stride);
3141 }
3142 
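/* hv 4-tap filter, 4x4 output block. */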
3143 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3144  int32_t src_stride,
3145  uint8_t *dst,
3146  int32_t dst_stride,
3147  const int8_t *filter_x,
3148  const int8_t *filter_y)
3149 {
3150  v16u8 out;
3151  v16i8 src0, src1, src2, src3, src4, src5, src6;
3152  v8i16 filt0, filt1;
3153  v8i16 filt_h0, filt_h1;
3154  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3155  v16i8 mask1;
3156  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3157  v8i16 filter_vec, tmp0, tmp1;
3158  v8i16 dst30, dst41, dst52, dst63;
3159  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3160  v4i32 dst0, dst1, dst2, dst3;
3161 
3162  src -= (src_stride + 1);
3163 
3164  filter_vec = LD_SH(filter_x);
3165  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3166 
3167  filter_vec = LD_SH(filter_y);
3168  UNPCK_R_SB_SH(filter_vec, filter_vec);
3169 
3170  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3171 
3172  mask1 = mask0 + 2;
3173 
3174  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3175  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3176 
3177  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3178  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3179  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3180  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3181 
3182  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3184  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3185  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3186 
3187  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3188  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3189  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3190  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3191  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3192  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3193  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3194  SRA_4V(dst0, dst1, dst2, dst3, 6);
3195  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3196  SRARI_H2_SH(tmp0, tmp1, 6);
3197  SAT_SH2_SH(tmp0, tmp1, 7);
3198  out = PCKEV_XORI128_UB(tmp0, tmp1);
3199  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3200 }
3201 
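/* hv 4-tap filter, 4 columns, height a multiple of 8. */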
3202 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3203  int32_t src_stride,
3204  uint8_t *dst,
3205  int32_t dst_stride,
3206  const int8_t *filter_x,
3207  const int8_t *filter_y,
3208  int32_t height)
3209 {
3210  uint32_t loop_cnt;
3211  v16u8 out0, out1;
3212  v16i8 src0, src1, src2, src3, src4, src5;
3213  v16i8 src6, src7, src8, src9, src10;
3214  v8i16 filt0, filt1;
3215  v8i16 filt_h0, filt_h1;
3216  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3217  v16i8 mask1;
3218  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3219  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3220  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3221  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3222  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3223  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3224  v8i16 dst98_r, dst109_r;
3225 
3226  src -= (src_stride + 1);
3227 
3228  filter_vec = LD_SH(filter_x);
3229  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3230 
3231  filter_vec = LD_SH(filter_y);
3232  UNPCK_R_SB_SH(filter_vec, filter_vec);
3233 
3234  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3235 
3236  mask1 = mask0 + 2;
3237 
3238  LD_SB3(src, src_stride, src0, src1, src2);
3239  src += (3 * src_stride);
3240 
3241  XORI_B3_128_SB(src0, src1, src2);
3242 
3243  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3244  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3245  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3246  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3247  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3248  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3249 
3250  for (loop_cnt = height >> 3; loop_cnt--;) {
3251  LD_SB8(src, src_stride,
3252  src3, src4, src5, src6, src7, src8, src9, src10);
3253  src += (8 * src_stride);
3254 
3255  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3256 
3257  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3258  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3259  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3260  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3261 
3262  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3263  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3264  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3265  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3266 
3267  dst32_r = __msa_ilvr_h(dst73, dst22);
3268  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3269  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3270  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3271  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3272  dst76_r = __msa_ilvr_h(dst22, dst106);
3273 
3274  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3275  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3276  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3277  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3278  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3279  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3280  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3281  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3282  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3283  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3284  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3285  dst5_r, dst4_r, dst7_r, dst6_r,
3286  tmp0, tmp1, tmp2, tmp3);
3287  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3288  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3289  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3290  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3291  ST4x8_UB(out0, out1, dst, dst_stride);
3292  dst += (8 * dst_stride);
3293 
3294  dst10_r = dst98_r;
3295  dst21_r = dst109_r;
3296  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3297  }
3298 }
3299 
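/* hv 4-tap filter, width 4: dispatch on height to the kernels above. */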
3300 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3301  int32_t src_stride,
3302  uint8_t *dst,
3303  int32_t dst_stride,
3304  const int8_t *filter_x,
3305  const int8_t *filter_y,
3306  int32_t height)
3307 {
3308  if (2 == height) {
3309  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3310  filter_x, filter_y);
3311  } else if (4 == height) {
3312  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3313  filter_x, filter_y);
3314  } else if (0 == (height % 8)) {
3315  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3316  filter_x, filter_y, height);
3317  }
3318 }
3319 
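/* hv 4-tap filter, 6 columns x 8 rows, stored as a 4-wide and a 2-wide part. */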
3320 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3321  int32_t src_stride,
3322  uint8_t *dst,
3323  int32_t dst_stride,
3324  const int8_t *filter_x,
3325  const int8_t *filter_y,
3326  int32_t height)
3327 {
3328  v16u8 out0, out1, out2;
3329  v16i8 src0, src1, src2, src3, src4, src5, src6;
3330  v16i8 src7, src8, src9, src10;
3331  v8i16 filt0, filt1;
3332  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3333  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3334  v16i8 mask1;
3335  v8i16 filt_h0, filt_h1, filter_vec;
3336  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3337  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3338  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3339  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3340  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3341  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3342  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3343  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3344  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3345 
3346  src -= (src_stride + 1);
3347 
3348  filter_vec = LD_SH(filter_x);
3349  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3350 
3351  filter_vec = LD_SH(filter_y);
3352  UNPCK_R_SB_SH(filter_vec, filter_vec);
3353 
3354  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3355 
3356  mask1 = mask0 + 2;
3357 
3358  LD_SB3(src, src_stride, src0, src1, src2);
3359  src += (3 * src_stride);
3360 
3361  XORI_B3_128_SB(src0, src1, src2);
3362 
3363  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3364  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3365  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3366 
3367  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3369  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3370 
3371  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3372  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3373 
3374  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3375  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3376 
3377  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3378  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3379  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3380  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3381 
3382  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3383  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3384  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3385  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3386 
3387  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3388  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3389  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3390  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3391 
3392  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3393  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3394  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3395  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3396 
3397  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3398  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3399  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3400  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3401  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3402  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3403  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3404  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3405 
3406  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3407  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3408  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3409 
3410  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3411  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3412  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3413  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3414  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3415  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3416  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3417  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3418  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3419  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3420  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3421  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3422  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3423  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3424  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3425  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3426  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3427  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3428  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3429  SRARI_H2_SH(tmp4, tmp5, 6);
3430  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3431  SAT_SH2_SH(tmp4, tmp5, 7);
3432  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3433  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3434  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3435  ST4x8_UB(out0, out1, dst, dst_stride);
3436  ST2x4_UB(out2, 0, dst + 4, dst_stride);
3437  dst += 4 * dst_stride;
3438  ST2x4_UB(out2, 4, dst + 4, dst_stride);
3439 }
3440 
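/* hv 4-tap filter, 8x2 output block. */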
3441 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3442  int32_t src_stride,
3443  uint8_t *dst,
3444  int32_t dst_stride,
3445  const int8_t *filter_x,
3446  const int8_t *filter_y)
3447 {
3448  v16u8 out;
3449  v16i8 src0, src1, src2, src3, src4;
3450  v8i16 filt0, filt1;
3451  v8i16 filt_h0, filt_h1, filter_vec;
3452  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3453  v16i8 mask1;
3454  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3455  v8i16 dst0, dst1, dst2, dst3, dst4;
3456  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3457  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3458  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3459  v8i16 out0_r, out1_r;
3460 
3461  src -= (src_stride + 1);
3462 
3463  filter_vec = LD_SH(filter_x);
3464  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3465 
3466  filter_vec = LD_SH(filter_y);
3467  UNPCK_R_SB_SH(filter_vec, filter_vec);
3468 
3469  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3470 
3471  mask1 = mask0 + 2;
3472 
3473  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3474  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3475 
3476  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3477  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3478  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3479  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3480  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3481 
3482  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3484  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3485  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3486  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3487  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3488  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3489  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3490  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3491  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3492  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3493  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3494  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3495  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3496  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3497  SRARI_H2_SH(out0_r, out1_r, 6);
3498  SAT_SH2_SH(out0_r, out1_r, 7);
3499  out = PCKEV_XORI128_UB(out0_r, out1_r);
3500  ST8x2_UB(out, dst, dst_stride);
3501 }
3502 
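/* hv 4-tap filter, (8 * width8mult) columns x 4 rows. */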
3503 static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3504  int32_t src_stride,
3505  uint8_t *dst,
3506  int32_t dst_stride,
3507  const int8_t *filter_x,
3508  const int8_t *filter_y,
3509  int32_t width8mult)
3510 {
3511  uint32_t cnt;
3512  v16u8 out0, out1;
3513  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3514  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3515  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3516  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3517  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3518  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3519  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3520 
3521  src -= (src_stride + 1);
3522 
3523  filter_vec = LD_SH(filter_x);
3524  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3525 
3526  filter_vec = LD_SH(filter_y);
3527  UNPCK_R_SB_SH(filter_vec, filter_vec);
3528 
3529  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3530 
3531  mask0 = LD_SB(ff_hevc_mask_arr);
3532  mask1 = mask0 + 2;
3533 
3534  for (cnt = width8mult; cnt--;) {
3535  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3536  src += 8;
3537  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3538 
3539  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3540  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3541  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3542 
3543  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3544  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3545  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3546 
3547  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3548  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3549 
3550  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3551  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3552  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3553  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3554 
3555  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3556  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3557  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3558  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3559 
3560  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3561  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3562  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3563  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3564 
3565  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3566  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3567  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3568  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3569  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3570  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3571  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3572  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3573 
3574  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3575  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3576 
3577  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3578  dst3_r, tmp0, tmp1, tmp2, tmp3);
3579  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3580  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3581  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3582  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3583  ST8x4_UB(out0, out1, dst, dst_stride);
3584  dst += 8;
3585  }
3586 }
3587 
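/* hv 4-tap filter, 8x6 output block. */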
3588 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3589  int32_t src_stride,
3590  uint8_t *dst,
3591  int32_t dst_stride,
3592  const int8_t *filter_x,
3593  const int8_t *filter_y)
3594 {
3595  v16u8 out0, out1, out2;
3596  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3597  v8i16 filt0, filt1;
3598  v8i16 filt_h0, filt_h1, filter_vec;
3599  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3600  v16i8 mask1;
3601  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3602  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3603  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3604  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3606  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3607  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3608  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3609  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3610  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3611 
3612  src -= (src_stride + 1);
3613 
3614  filter_vec = LD_SH(filter_x);
3615  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3616 
3617  filter_vec = LD_SH(filter_y);
3618  UNPCK_R_SB_SH(filter_vec, filter_vec);
3619 
3620  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3621 
3622  mask1 = mask0 + 2;
3623 
3624  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3625  src += (5 * src_stride);
3626  LD_SB4(src, src_stride, src5, src6, src7, src8);
3627 
3628  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3629  XORI_B4_128_SB(src5, src6, src7, src8);
3630 
3631  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3632  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3633  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3634  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3635  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3636  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3637  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3638  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3639  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3640 
3641  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3642  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3643  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3644  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3645  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3646  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3647  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3648  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3649  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3650 
3651  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3652  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3653  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3654  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3655  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3656  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3657  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3658  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3659 
3660  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3661  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3662  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3663  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3664  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3665  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3666  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3667  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3668  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3669  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3670  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3671  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3672 
3673  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3674  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3675  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3676  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3677  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3678  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3679  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3680  SRARI_H2_SH(out4_r, out5_r, 6);
3681  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3682  SAT_SH2_SH(out4_r, out5_r, 7);
3683  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3684  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3685  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3686 
3687  ST8x4_UB(out0, out1, dst, dst_stride);
3688  dst += (4 * dst_stride);
3689  ST8x2_UB(out2, dst, dst_stride);
3690 }
3691 
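/* hv 4-tap filter, (8 * width8mult) columns, height a multiple of 4. */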
3692 static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3693  int32_t src_stride,
3694  uint8_t *dst,
3695  int32_t dst_stride,
3696  const int8_t *filter_x,
3697  const int8_t *filter_y,
3698  int32_t height,
3699  int32_t width8mult)
3700 {
3701  uint32_t loop_cnt, cnt;
3702  uint8_t *src_tmp;
3703  uint8_t *dst_tmp;
3704  v16u8 out0, out1;
3705  v16i8 src0, src1, src2, src3, src4, src5, src6;
3706  v8i16 filt0, filt1;
3707  v8i16 filt_h0, filt_h1, filter_vec;
3708  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3709  v16i8 mask1;
3710  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3711  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3712  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3713  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3714  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3715  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3716  v8i16 out0_r, out1_r, out2_r, out3_r;
3717 
3718  src -= (src_stride + 1);
3719 
3720  filter_vec = LD_SH(filter_x);
3721  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3722 
3723  filter_vec = LD_SH(filter_y);
3724  UNPCK_R_SB_SH(filter_vec, filter_vec);
3725 
3726  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3727 
3728  mask1 = mask0 + 2;
3729 
3730  for (cnt = width8mult; cnt--;) {
3731  src_tmp = src;
3732  dst_tmp = dst;
3733 
3734  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3735  src_tmp += (3 * src_stride);
3736 
3737  XORI_B3_128_SB(src0, src1, src2);
3738 
3739  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3740  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3741  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3742 
3743  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3744  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3745  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3746 
3747  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3748  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3749 
3750  for (loop_cnt = (height >> 2); loop_cnt--;) {
3751  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3752  src_tmp += (4 * src_stride);
3753 
3754  XORI_B4_128_SB(src3, src4, src5, src6);
3755 
3756  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3757  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3758  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3759  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3760 
3761  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3762  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3763  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3764  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3765 
3766  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3767  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3768  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3769  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3770 
3771  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3772  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3773  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3774  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3775  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3776  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3777  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3778  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3779 
3780  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3781  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3782 
3783  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3784  dst2_l, dst2_r, dst3_l, dst3_r,
3785  out0_r, out1_r, out2_r, out3_r);
3786 
3787  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3788  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3789  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3790  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3791  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3792  dst_tmp += (4 * dst_stride);
3793 
3794  dst10_r = dst54_r;
3795  dst10_l = dst54_l;
3796  dst21_r = dst65_r;
3797  dst21_l = dst65_l;
3798  dst2 = dst6;
3799  }
3800 
3801  src += 8;
3802  dst += 8;
3803  }
3804 }
3805 
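/* hv 4-tap filter, width 8: dispatch on height to the specialized kernels. */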
3806 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3807  int32_t src_stride,
3808  uint8_t *dst,
3809  int32_t dst_stride,
3810  const int8_t *filter_x,
3811  const int8_t *filter_y,
3812  int32_t height)
3813 {
3814  if (2 == height) {
3815  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3816  filter_x, filter_y);
3817  } else if (4 == height) {
3818  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3819  filter_x, filter_y, 1);
3820  } else if (6 == height) {
3821  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3822  filter_x, filter_y);
3823  } else if (0 == (height % 4)) {
3824  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3825  filter_x, filter_y, height, 1);
3826  }
3827 }
3828 
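/* hv 4-tap filter, 12 columns: an 8-wide column pass followed by a 4-wide pass. */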
3829 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3830  int32_t src_stride,
3831  uint8_t *dst,
3832  int32_t dst_stride,
3833  const int8_t *filter_x,
3834  const int8_t *filter_y,
3835  int32_t height)
3836 {
3837  uint32_t loop_cnt;
3838  uint8_t *src_tmp, *dst_tmp;
3839  v16u8 out0, out1;
3840  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3841  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3842  v16i8 mask0, mask1, mask2, mask3;
3843  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3844  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3845  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3846  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3847  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3848  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3849  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3850  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3851 
3852  src -= (src_stride + 1);
3853 
3854  filter_vec = LD_SH(filter_x);
3855  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3856 
3857  filter_vec = LD_SH(filter_y);
3858  UNPCK_R_SB_SH(filter_vec, filter_vec);
3859 
3860  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3861 
3862  mask0 = LD_SB(ff_hevc_mask_arr);
3863  mask1 = mask0 + 2;
3864 
3865  src_tmp = src;
3866  dst_tmp = dst;
3867 
3868  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3869  src_tmp += (3 * src_stride);
3870 
3871  XORI_B3_128_SB(src0, src1, src2);
3872 
3873  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3874  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3875  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3876 
3877  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3878  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3879  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3880 
3881  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3882  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3883 
3884  for (loop_cnt = 4; loop_cnt--;) {
3885  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3886  src_tmp += (4 * src_stride);
3887  XORI_B4_128_SB(src3, src4, src5, src6);
3888 
3889  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3890  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3891  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3892  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3893 
3894  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3895  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3896  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3897  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3898 
3899  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3900  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3901  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3902  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3903 
3904  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3905  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3906  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3907  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3908  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3909  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3910  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3911  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3912 
3913  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3914  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3915 
3916  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3917  dst3_r, tmp0, tmp1, tmp2, tmp3);
3918  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3919  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3920  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3921  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3922  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3923  dst_tmp += (4 * dst_stride);
3924 
3925  dst10_r = dst54_r;
3926  dst10_l = dst54_l;
3927  dst21_r = dst65_r;
3928  dst21_l = dst65_l;
3929  dsth2 = dsth6;
3930  }
3931 
3932  src += 8;
3933  dst += 8;
3934 
3935  mask2 = LD_SB(ff_hevc_mask_arr + 16);
3936  mask3 = mask2 + 2;
3937 
3938  LD_SB3(src, src_stride, src0, src1, src2);
3939  src += (3 * src_stride);
3940  XORI_B3_128_SB(src0, src1, src2);
3941  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3942  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3943 
3944  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3945  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3946 
3947  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3948  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3949 
3950  for (loop_cnt = 2; loop_cnt--;) {
3951  LD_SB8(src, src_stride,
3952  src3, src4, src5, src6, src7, src8, src9, src10);
3953  src += (8 * src_stride);
3954  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3955  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3956  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3957  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3958  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3959 
3960  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3961  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3962  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3963  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3964 
3965  dst32_r = __msa_ilvr_h(dst73, dst22);
3966  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3967  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3968  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3969  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3970  dst76_r = __msa_ilvr_h(dst22, dst106);
3971 
3972  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3973  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3974  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3975  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3976  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3977  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3978  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3979  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3980  SRA_4V(dst0, dst1, dst2, dst3, 6);
3981  SRA_4V(dst4, dst5, dst6, dst7, 6);
3982  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3983  tmp0, tmp1, tmp2, tmp3);
3984  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3985  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3986  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3987  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3988  ST4x8_UB(out0, out1, dst, dst_stride);
3989  dst += (8 * dst_stride);
3990 
3991  dst10_r = dst98_r;
3992  dst21_r = dst109_r;
3993  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3994  }
3995 }
3996 
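/* hv 4-tap filter, width 16: two 8-wide column passes. */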
3997 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3998  int32_t src_stride,
3999  uint8_t *dst,
4000  int32_t dst_stride,
4001  const int8_t *filter_x,
4002  const int8_t *filter_y,
4003  int32_t height)
4004 {
4005  if (4 == height) {
4006  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4007  filter_y, 2);
4008  } else {
4009  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4010  filter_x, filter_y, height, 2);
4011  }
4012 }
4013 
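/* hv 4-tap filter, width 24: three 8-wide column passes. */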
4014 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
4015  int32_t src_stride,
4016  uint8_t *dst,
4017  int32_t dst_stride,
4018  const int8_t *filter_x,
4019  const int8_t *filter_y,
4020  int32_t height)
4021 {
4022  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4023  filter_x, filter_y, height, 3);
4024 }
4025 
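/* hv 4-tap filter, width 32: four 8-wide column passes. */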
4026 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
4027  int32_t src_stride,
4028  uint8_t *dst,
4029  int32_t dst_stride,
4030  const int8_t *filter_x,
4031  const int8_t *filter_y,
4032  int32_t height)
4033 {
4034  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4035  filter_x, filter_y, height, 4);
4036 }
4037 
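/* Generate ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa() wrappers around
   the copy_width<WIDTH>_msa() kernels. */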
4038 #define UNI_MC_COPY(WIDTH) \
4039 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4040  ptrdiff_t dst_stride, \
4041  uint8_t *src, \
4042  ptrdiff_t src_stride, \
4043  int height, \
4044  intptr_t mx, \
4045  intptr_t my, \
4046  int width) \
4047 { \
4048  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4049 }
4050 
4051 UNI_MC_COPY(8);
4052 UNI_MC_COPY(12);
4053 UNI_MC_COPY(16);
4054 UNI_MC_COPY(24);
4055 UNI_MC_COPY(32);
4056 UNI_MC_COPY(48);
4057 UNI_MC_COPY(64);
4058 
4059 #undef UNI_MC_COPY
4060 
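/* Generate uni-prediction wrappers for one filter direction; FILT_DIR
   (mx or my) selects the entry in the qpel/epel filter table. */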
4061 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4062 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4063  ptrdiff_t dst_stride, \
4064  uint8_t *src, \
4065  ptrdiff_t src_stride, \
4066  int height, \
4067  intptr_t mx, \
4068  intptr_t my, \
4069  int width) \
4070 { \
4071  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4072  \
4073  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4074  filter, height); \
4075 }
4076 
4077 UNI_MC(qpel, h, 4, 8, hz, mx);
4078 UNI_MC(qpel, h, 8, 8, hz, mx);
4079 UNI_MC(qpel, h, 12, 8, hz, mx);
4080 UNI_MC(qpel, h, 16, 8, hz, mx);
4081 UNI_MC(qpel, h, 24, 8, hz, mx);
4082 UNI_MC(qpel, h, 32, 8, hz, mx);
4083 UNI_MC(qpel, h, 48, 8, hz, mx);
4084 UNI_MC(qpel, h, 64, 8, hz, mx);
4085 
4086 UNI_MC(qpel, v, 4, 8, vt, my);
4087 UNI_MC(qpel, v, 8, 8, vt, my);
4088 UNI_MC(qpel, v, 12, 8, vt, my);
4089 UNI_MC(qpel, v, 16, 8, vt, my);
4090 UNI_MC(qpel, v, 24, 8, vt, my);
4091 UNI_MC(qpel, v, 32, 8, vt, my);
4092 UNI_MC(qpel, v, 48, 8, vt, my);
4093 UNI_MC(qpel, v, 64, 8, vt, my);
4094 
4095 UNI_MC(epel, h, 4, 4, hz, mx);
4096 UNI_MC(epel, h, 6, 4, hz, mx);
4097 UNI_MC(epel, h, 8, 4, hz, mx);
4098 UNI_MC(epel, h, 12, 4, hz, mx);
4099 UNI_MC(epel, h, 16, 4, hz, mx);
4100 UNI_MC(epel, h, 24, 4, hz, mx);
4101 UNI_MC(epel, h, 32, 4, hz, mx);
4102 
4103 UNI_MC(epel, v, 4, 4, vt, my);
4104 UNI_MC(epel, v, 6, 4, vt, my);
4105 UNI_MC(epel, v, 8, 4, vt, my);
4106 UNI_MC(epel, v, 12, 4, vt, my);
4107 UNI_MC(epel, v, 16, 4, vt, my);
4108 UNI_MC(epel, v, 24, 4, vt, my);
4109 UNI_MC(epel, v, 32, 4, vt, my);
4110 
4111 #undef UNI_MC
4112 
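/* Generate uni-prediction hv wrappers; mx and my select the horizontal
   and vertical filter coefficients. */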
4113 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4114 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4115  ptrdiff_t dst_stride, \
4116  uint8_t *src, \
4117  ptrdiff_t src_stride, \
4118  int height, \
4119  intptr_t mx, \
4120  intptr_t my, \
4121  int width) \
4122 { \
4123  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4124  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4125  \
4126  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4127  filter_x, filter_y, height); \
4128 }
4129 
4130 UNI_MC_HV(qpel, 4, 8);
4131 UNI_MC_HV(qpel, 8, 8);
4132 UNI_MC_HV(qpel, 12, 8);
4133 UNI_MC_HV(qpel, 16, 8);
4134 UNI_MC_HV(qpel, 24, 8);
4135 UNI_MC_HV(qpel, 32, 8);
4136 UNI_MC_HV(qpel, 48, 8);
4137 UNI_MC_HV(qpel, 64, 8);
4138 
4139 UNI_MC_HV(epel, 4, 4);
4140 UNI_MC_HV(epel, 6, 4);
4141 UNI_MC_HV(epel, 8, 4);
4142 UNI_MC_HV(epel, 12, 4);
4143 UNI_MC_HV(epel, 16, 4);
4144 UNI_MC_HV(epel, 24, 4);
4145 UNI_MC_HV(epel, 32, 4);
4146 
4147 #undef UNI_MC_HV