FFmpeg
hevc_mc_uni_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
24 
/*
 * Byte-index table holding three rows of 16 entries each. Every row lists
 * overlapping pairs of adjacent indices (n, n + 1).
 *
 * In the part of the file visible here, row 0 is loaded through
 * &ff_hevc_mask_arr[0] and row 1 through &ff_hevc_mask_arr[16]; the loaded
 * vector is then handed to the VSHF_B* macros as their mask operand.
 * Row 2 has no user in this part of the file.
 *
 * NOTE(review): indices >= 16 (rows 1 and 2) presumably address the second
 * source vector of the shuffle -- confirm against the VSHF_B2_SB definition.
 */
25 static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
26  /* row 0: 8 width cases, indices 0..8 only */
27  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28  /* row 1: 4 width cases, indices 0..4 and 16..20 */
29  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
30  /* row 2: 4 width cases, indices 8..12 and 24..28 */
31  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
32 };
33 
/*
 * HORIZ_8TAP_4WID_4VECS_FILT: horizontal 8-tap filter step used by the
 * 4-wide kernels in this file.
 *
 * For each of the four masks, the vector pairs (src0, src1) and (src2, src3)
 * are shuffled into two temporaries. The mask0 result is combined with filt0
 * through DOTP_SB2_SH, which initialises out0/out1; the mask1..mask3 results
 * are then accumulated with filt1..filt3 through DPADD_SB2_SH.
 *
 * out0 and out1 are overwritten. The temporaries vec0_m..vec7_m are declared
 * inside the macro's own brace block.
 *
 * NOTE(review): the shuffle and dot-product macros are defined outside this
 * part of the file; confirm their lane semantics there. The expansion is a
 * bare brace block, not a do { } while (0) statement.
 */
34 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
35  mask0, mask1, mask2, mask3, \
36  filt0, filt1, filt2, filt3, \
37  out0, out1) \
38 { \
39  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
40  \
41  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
42  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
43  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
44  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
45  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
46  DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
47  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
48  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \
49 }
50 
/*
 * HORIZ_8TAP_8WID_4VECS_FILT: horizontal 8-tap filter step producing four
 * result vectors, one per source vector.
 *
 * Each source vector is shuffled with itself (srcN, srcN) using one mask at
 * a time. The mask0 results are combined with filt0 through DOTP_SB4_SH,
 * which initialises out0..out3. The remaining taps are accumulated through
 * DPADD_SB4_SH in the order filt2, filt1, filt3, alternating between the
 * temporaries vec0_m..vec3_m and vec4_m..vec7_m.
 *
 * out0..out3 are overwritten. The temporaries are declared inside the
 * macro's own brace block.
 *
 * NOTE(review): the shuffle and dot-product macros are defined outside this
 * part of the file; confirm their lane semantics there.
 */
51 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
52  mask0, mask1, mask2, mask3, \
53  filt0, filt1, filt2, filt3, \
54  out0, out1, out2, out3) \
55 { \
56  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
57  \
58  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
59  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
60  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
61  out0, out1, out2, out3); \
62  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
63  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
64  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
65  out0, out1, out2, out3); \
66  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
67  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
68  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
69  out0, out1, out2, out3); \
70  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
71  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
72  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
73  out0, out1, out2, out3); \
74 }
75 
/*
 * HORIZ_4TAP_4WID_4VECS_FILT: horizontal 4-tap counterpart of the 8-tap
 * 4-wide macro, using two masks and two filter vectors.
 *
 * The vector pairs (src0, src1) and (src2, src3) are shuffled with mask0 and
 * combined with filt0 through DOTP_SB2_SH, which initialises out0/out1. They
 * are then shuffled with mask1 and accumulated with filt1 through
 * DPADD_SB2_SH.
 *
 * out0 and out1 are overwritten. The temporaries vec0_m..vec3_m are declared
 * inside the macro's own brace block.
 *
 * NOTE(review): no user of this macro is visible in this part of the file.
 */
76 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
77  mask0, mask1, filt0, filt1, \
78  out0, out1) \
79 { \
80  v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
81  \
82  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
83  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
84  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
85  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
86 }
87 
/*
 * HORIZ_4TAP_8WID_4VECS_FILT: horizontal 4-tap filter step producing four
 * result vectors, one per source vector.
 *
 * Each source vector is shuffled with itself using mask0 and combined with
 * filt0 through DOTP_SB4_SH, which initialises out0..out3. The same sources
 * are then shuffled with mask1 and accumulated with filt1 through
 * DPADD_SB4_SH. The temporaries vec0_m..vec3_m are reused for both passes.
 *
 * out0..out3 are overwritten.
 *
 * NOTE(review): no user of this macro is visible in this part of the file.
 */
88 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
89  mask0, mask1, filt0, filt1, \
90  out0, out1, out2, out3) \
91 { \
92  v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
93  \
94  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
95  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
96  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
97  out0, out1, out2, out3); \
98  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
99  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
100  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
101  out0, out1, out2, out3); \
102 }
103 
/*
 * Copy a block of rows from src to dst, moving one uint64_t per row.
 *
 * Handled heights: exactly 2, exactly 6, any multiple of 8, and any other
 * multiple of 4. For every other height nothing is written.
 *
 * src / src_stride: first source row and the distance between source rows.
 * dst / dst_stride: first destination row and the distance between rows.
 * height:           number of rows to copy.
 */
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t groups;
    uint64_t row0, row1, row2, row3, row4, row5, row6, row7;

    if (2 == height) {
        LD2(src, src_stride, row0, row1);
        SD(row0, dst);
        dst += dst_stride;
        SD(row1, dst);
        return;
    }

    if (6 == height) {
        /* four rows first, then the remaining two */
        LD4(src, src_stride, row0, row1, row2, row3);
        src += (4 * src_stride);
        SD4(row0, row1, row2, row3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, row0, row1);
        SD(row0, dst);
        dst += dst_stride;
        SD(row1, dst);
        return;
    }

    if (0 == (height % 8)) {
        /* eight rows per pass: load all eight, then store all eight */
        groups = height >> 3;
        while (groups--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);
            LD4(src, src_stride, row4, row5, row6, row7);
            src += (4 * src_stride);
            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(row4, row5, row6, row7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
        return;
    }

    if (0 == (height % 4)) {
        /* four rows per pass */
        groups = height >> 2;
        while (groups--) {
            LD4(src, src_stride, row0, row1, row2, row3);
            src += (4 * src_stride);
            SD4(row0, row1, row2, row3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
145 
146 static void copy_width12_msa(uint8_t *src, int32_t src_stride,
147  uint8_t *dst, int32_t dst_stride,
148  int32_t height)
149 {
150  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
151 
152  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153  src += (8 * src_stride);
154  ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155  dst += (8 * dst_stride);
156  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157  ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
158 }
159 
160 static void copy_width16_msa(uint8_t *src, int32_t src_stride,
161  uint8_t *dst, int32_t dst_stride,
162  int32_t height)
163 {
164  int32_t cnt;
165  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
166 
167  if (12 == height) {
168  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169  src += (8 * src_stride);
170  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171  dst += (8 * dst_stride);
172  LD_UB4(src, src_stride, src0, src1, src2, src3);
173  src += (4 * src_stride);
174  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175  dst += (4 * dst_stride);
176  } else if (0 == (height % 8)) {
177  for (cnt = (height >> 3); cnt--;) {
178  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
179  src7);
180  src += (8 * src_stride);
181  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
182  dst_stride);
183  dst += (8 * dst_stride);
184  }
185  } else if (0 == (height % 4)) {
186  for (cnt = (height >> 2); cnt--;) {
187  LD_UB4(src, src_stride, src0, src1, src2, src3);
188  src += (4 * src_stride);
189 
190  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191  dst += (4 * dst_stride);
192  }
193  }
194 }
195 
196 static void copy_width24_msa(uint8_t *src, int32_t src_stride,
197  uint8_t *dst, int32_t dst_stride,
198  int32_t height)
199 {
200  int32_t cnt;
201  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
202  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
203 
204  for (cnt = 4; cnt--;) {
205  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206  LD4(src + 16, src_stride, out0, out1, out2, out3);
207  src += (4 * src_stride);
208  LD4(src + 16, src_stride, out4, out5, out6, out7);
209  src += (4 * src_stride);
210 
211  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212  SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213  dst += (4 * dst_stride);
214  SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215  dst += (4 * dst_stride);
216  }
217 }
218 
219 static void copy_width32_msa(uint8_t *src, int32_t src_stride,
220  uint8_t *dst, int32_t dst_stride,
221  int32_t height)
222 {
223  int32_t cnt;
224  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
225 
226  for (cnt = (height >> 2); cnt--;) {
227  LD_UB4(src, src_stride, src0, src1, src2, src3);
228  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229  src += (4 * src_stride);
230  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232  dst += (4 * dst_stride);
233  }
234 }
235 
236 static void copy_width48_msa(uint8_t *src, int32_t src_stride,
237  uint8_t *dst, int32_t dst_stride,
238  int32_t height)
239 {
240  int32_t cnt;
241  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
242  v16u8 src11;
243 
244  for (cnt = (height >> 2); cnt--;) {
245  LD_UB4(src, src_stride, src0, src1, src2, src3);
246  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247  LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248  src += (4 * src_stride);
249 
250  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252  ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253  dst += (4 * dst_stride);
254  }
255 }
256 
257 static void copy_width64_msa(uint8_t *src, int32_t src_stride,
258  uint8_t *dst, int32_t dst_stride,
259  int32_t height)
260 {
261  int32_t cnt;
262  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
263  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
264 
265  for (cnt = (height >> 2); cnt--;) {
266  LD_UB4(src, 16, src0, src1, src2, src3);
267  src += src_stride;
268  LD_UB4(src, 16, src4, src5, src6, src7);
269  src += src_stride;
270  LD_UB4(src, 16, src8, src9, src10, src11);
271  src += src_stride;
272  LD_UB4(src, 16, src12, src13, src14, src15);
273  src += src_stride;
274 
275  ST_UB4(src0, src1, src2, src3, dst, 16);
276  dst += dst_stride;
277  ST_UB4(src4, src5, src6, src7, dst, 16);
278  dst += dst_stride;
279  ST_UB4(src8, src9, src10, src11, dst, 16);
280  dst += dst_stride;
281  ST_UB4(src12, src13, src14, src15, dst, 16);
282  dst += dst_stride;
283  }
284 }
285 
/*
 * Horizontal 8-tap filter for four rows, stored as four 32-bit words
 * (one per row) through ST_W4.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 *
 * NOTE(review): the helper macros (XORI_B4_128_SB, SRARI_H2_SH, SAT_SH2_SH,
 * PCKEV_XORI128_UB) are defined outside this part of the file; by name they
 * bias the input by 128, round-shift by 6, saturate and pack back to
 * unsigned bytes -- confirm against their definitions.
 */
286 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
287  uint8_t *dst, int32_t dst_stride,
288  const int8_t *filter)
289 {
290  v16u8 mask0, mask1, mask2, mask3, out;
291  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
292  v8i16 filt, out0, out1;
293 
 /* second row of the mask table (the "4 width" row) */
294  mask0 = LD_UB(&ff_hevc_mask_arr[16]);
 /* start reading 3 bytes to the left of the nominal position */
295  src -= 3;
296 
297  /* rearranging filter */
298  filt = LD_SH(filter);
299  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
300 
 /* masks for the following tap pairs: every index advanced by 2, 4, 6 */
301  mask1 = mask0 + 2;
302  mask2 = mask0 + 4;
303  mask3 = mask0 + 6;
304 
305  LD_SB4(src, src_stride, src0, src1, src2, src3);
306  XORI_B4_128_SB(src0, src1, src2, src3);
307  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
308  mask3, filt0, filt1, filt2, filt3, out0, out1);
309  SRARI_H2_SH(out0, out1, 6);
310  SAT_SH2_SH(out0, out1, 7);
311  out = PCKEV_XORI128_UB(out0, out1);
312  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
313 }
314 
/*
 * Horizontal 8-tap filter for eight rows, processed as two groups of four
 * rows and stored as 32-bit words through ST_W4.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 *
 * NOTE(review): rounding, saturation and packing are done by macros defined
 * outside this part of the file; confirm their semantics there.
 */
315 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
316  uint8_t *dst, int32_t dst_stride,
317  const int8_t *filter)
318 {
319  v16i8 filt0, filt1, filt2, filt3;
320  v16i8 src0, src1, src2, src3;
321  v16u8 mask0, mask1, mask2, mask3, out;
322  v8i16 filt, out0, out1, out2, out3;
323 
 /* second row of the mask table (the "4 width" row) */
324  mask0 = LD_UB(&ff_hevc_mask_arr[16]);
 /* start reading 3 bytes to the left of the nominal position */
325  src -= 3;
326 
327  /* rearranging filter */
328  filt = LD_SH(filter);
329  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
330 
331  mask1 = mask0 + 2;
332  mask2 = mask0 + 4;
333  mask3 = mask0 + 6;
334 
 /* rows 0..3 -> out0, out1 */
335  LD_SB4(src, src_stride, src0, src1, src2, src3);
336  XORI_B4_128_SB(src0, src1, src2, src3);
337  src += (4 * src_stride);
338  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
339  mask3, filt0, filt1, filt2, filt3, out0, out1);
 /* rows 4..7 -> out2, out3 */
340  LD_SB4(src, src_stride, src0, src1, src2, src3);
341  XORI_B4_128_SB(src0, src1, src2, src3);
342  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
343  mask3, filt0, filt1, filt2, filt3, out2, out3);
344  SRARI_H4_SH(out0, out1, out2, out3, 6);
345  SAT_SH4_SH(out0, out1, out2, out3, 7);
346  out = PCKEV_XORI128_UB(out0, out1);
347  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
348  out = PCKEV_XORI128_UB(out2, out3);
349  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
350 }
351 
/*
 * Horizontal 8-tap filter for sixteen rows, processed as two halves of
 * eight rows; each half is handled as two groups of four rows and stored as
 * 32-bit words through ST_W4.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 *
 * NOTE(review): rounding, saturation and packing are done by macros defined
 * outside this part of the file; confirm their semantics there.
 */
352 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
353  uint8_t *dst, int32_t dst_stride,
354  const int8_t *filter)
355 {
356  v16u8 mask0, mask1, mask2, mask3, out;
357  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
358  v8i16 filt, out0, out1, out2, out3;
359 
 /* second row of the mask table (the "4 width" row) */
360  mask0 = LD_UB(&ff_hevc_mask_arr[16]);
 /* start reading 3 bytes to the left of the nominal position */
361  src -= 3;
362 
363  /* rearranging filter */
364  filt = LD_SH(filter);
365  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
366 
367  mask1 = mask0 + 2;
368  mask2 = mask0 + 4;
369  mask3 = mask0 + 6;
370 
 /* first half: rows 0..3 -> out0, out1; rows 4..7 -> out2, out3 */
371  LD_SB4(src, src_stride, src0, src1, src2, src3);
372  XORI_B4_128_SB(src0, src1, src2, src3);
373  src += (4 * src_stride);
374  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
375  mask3, filt0, filt1, filt2, filt3, out0, out1);
376  LD_SB4(src, src_stride, src0, src1, src2, src3);
377  XORI_B4_128_SB(src0, src1, src2, src3);
378  src += (4 * src_stride);
379  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
380  mask3, filt0, filt1, filt2, filt3, out2, out3);
381  SRARI_H4_SH(out0, out1, out2, out3, 6);
382  SAT_SH4_SH(out0, out1, out2, out3, 7);
383  out = PCKEV_XORI128_UB(out0, out1);
384  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
385  out = PCKEV_XORI128_UB(out2, out3);
386  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
387  dst += (8 * dst_stride);
388 
 /* second half: rows 8..11 -> out0, out1; rows 12..15 -> out2, out3 */
389  LD_SB4(src, src_stride, src0, src1, src2, src3);
390  XORI_B4_128_SB(src0, src1, src2, src3);
391  src += (4 * src_stride);
392  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
393  mask3, filt0, filt1, filt2, filt3, out0, out1);
394  LD_SB4(src, src_stride, src0, src1, src2, src3);
395  XORI_B4_128_SB(src0, src1, src2, src3);
 /* src is not read again after this increment */
396  src += (4 * src_stride);
397  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
398  mask3, filt0, filt1, filt2, filt3, out2, out3);
399 
400  SRARI_H4_SH(out0, out1, out2, out3, 6);
401  SAT_SH4_SH(out0, out1, out2, out3, 7);
402  out = PCKEV_XORI128_UB(out0, out1);
403  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
404  out = PCKEV_XORI128_UB(out2, out3);
405  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
406 }
407 
/*
 * Dispatch the horizontal 8-tap 4-wide filter to the fixed-height kernel
 * matching `height`. Heights other than 4, 8 and 16 are ignored and leave
 * dst untouched.
 */
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 4:
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 16:
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* no kernel for this height */
        break;
    }
}
420 
/*
 * Horizontal 8-tap filter for an 8-wide block, four rows per iteration.
 * Results are stored as four 64-bit values (one per row) through ST_D4.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 * height:           number of rows; height >> 2 iterations are executed.
 *
 * NOTE(review): the shuffle / dot-product sequence inside the loop is the
 * same statement sequence as the body of HORIZ_8TAP_8WID_4VECS_FILT, with
 * the vec*_m temporaries declared locally instead.
 */
421 static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
422  uint8_t *dst, int32_t dst_stride,
423  const int8_t *filter, int32_t height)
424 {
425  uint32_t loop_cnt;
426  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
427  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
428  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
429  v8i16 filt, out0, out1, out2, out3;
430 
 /* first row of the mask table (the "8 width" row) */
431  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
 /* start reading 3 bytes to the left of the nominal position */
432  src -= 3;
433 
434  /* rearranging filter */
435  filt = LD_SH(filter);
436  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
437 
438  mask1 = mask0 + 2;
439  mask2 = mask0 + 4;
440  mask3 = mask0 + 6;
441 
442  for (loop_cnt = (height >> 2); loop_cnt--;) {
443  LD_SB4(src, src_stride, src0, src1, src2, src3);
444  XORI_B4_128_SB(src0, src1, src2, src3);
445  src += (4 * src_stride);
446 
 /* taps are applied in the order filt0, filt2, filt1, filt3 */
447  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
448  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
449  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
450  out0, out1, out2, out3);
451  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
452  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
453  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
454  out0, out1, out2, out3);
455  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
456  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
457  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
458  out0, out1, out2, out3);
459  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
460  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
461  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
462  out0, out1, out2, out3);
463 
464  SRARI_H4_SH(out0, out1, out2, out3, 6);
465  SAT_SH4_SH(out0, out1, out2, out3, 7);
466  tmp0 = PCKEV_XORI128_UB(out0, out1);
467  tmp1 = PCKEV_XORI128_UB(out2, out3);
468  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
469  dst += (4 * dst_stride);
470  }
471 }
472 
473 static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
474  uint8_t *dst, int32_t dst_stride,
475  const int8_t *filter, int32_t height)
476 {
477  uint32_t loop_cnt;
478  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
479  v16u8 tmp0, tmp1, tmp2;
480  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
481  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
482  v16i8 filt0, filt1, filt2, filt3;
483  v8i16 filt, out0, out1, out2, out3, out4, out5;
484 
485  mask00 = LD_UB(&ff_hevc_mask_arr[0]);
486  mask0 = LD_UB(&ff_hevc_mask_arr[16]);
487 
488  src = src - 3;
489 
490  /* rearranging filter */
491  filt = LD_SH(filter);
492  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
493 
494  mask1 = mask00 + 2;
495  mask2 = mask00 + 4;
496  mask3 = mask00 + 6;
497  mask4 = mask0 + 2;
498  mask5 = mask0 + 4;
499  mask6 = mask0 + 6;
500 
501  for (loop_cnt = 4; loop_cnt--;) {
502  /* 8 width */
503  LD_SB4(src, src_stride, src0, src1, src2, src3);
504  /* 4 width */
505  LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
506 
507  XORI_B4_128_SB(src0, src1, src2, src3);
508  XORI_B4_128_SB(src4, src5, src6, src7);
509  src += (4 * src_stride);
510 
511  VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
512  VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
513  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
514  out1, out2, out3);
515  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
516  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
517  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
518  out1, out2, out3);
519  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
520  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
521  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
522  out1, out2, out3);
523  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
524  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
525  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
526  out1, out2, out3);
527 
528  /* 4 width */
529  VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
530  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
531  VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
532  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
533  VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
534  DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
535  VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
536  DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
537 
538  SRARI_H4_SH(out0, out1, out2, out3, 6);
539  SRARI_H2_SH(out4, out5, 6);
540  SAT_SH4_SH(out0, out1, out2, out3, 7);
541  SAT_SH2_SH(out4, out5, 7);
542  tmp0 = PCKEV_XORI128_UB(out0, out1);
543  tmp1 = PCKEV_XORI128_UB(out2, out3);
544  tmp2 = PCKEV_XORI128_UB(out4, out5);
545 
546  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
547  ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
548  dst += (4 * dst_stride);
549  }
550 }
551 
/*
 * Horizontal 8-tap filter for a 16-wide block, four rows per iteration.
 * Every row is loaded as two vectors (at src and src + 8), filtered in pairs
 * of rows through HORIZ_8TAP_8WID_4VECS_FILT and stored as one full vector
 * per row through ST_UB.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 * height:           number of rows; height >> 2 iterations are executed.
 *
 * NOTE(review): rounding, saturation and packing are done by macros defined
 * outside this part of the file; confirm their semantics there.
 */
552 static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
553  uint8_t *dst, int32_t dst_stride,
554  const int8_t *filter, int32_t height)
555 {
556  uint32_t loop_cnt;
557  v16u8 mask0, mask1, mask2, mask3, out;
558  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
559  v16i8 filt0, filt1, filt2, filt3;
560  v8i16 filt, out0, out1, out2, out3;
561 
 /* first row of the mask table (the "8 width" row) */
562  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
 /* start reading 3 bytes to the left of the nominal position */
563  src -= 3;
564 
565  /* rearranging filter */
566  filt = LD_SH(filter);
567  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
568 
569  mask1 = mask0 + 2;
570  mask2 = mask0 + 4;
571  mask3 = mask0 + 6;
572 
573  for (loop_cnt = (height >> 2); loop_cnt--;) {
 /* rows 0 and 1: src0/src1 hold row 0, src2/src3 hold row 1 */
574  LD_SB2(src, src_stride, src0, src2);
575  LD_SB2(src + 8, src_stride, src1, src3);
576  src += (2 * src_stride);
577 
 /* rows 2 and 3: src4/src5 hold row 2, src6/src7 hold row 3 */
578  LD_SB2(src, src_stride, src4, src6);
579  LD_SB2(src + 8, src_stride, src5, src7);
580  src += (2 * src_stride);
581 
582  XORI_B4_128_SB(src0, src1, src2, src3);
583  XORI_B4_128_SB(src4, src5, src6, src7);
584  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
585  mask3, filt0, filt1, filt2, filt3, out0,
586  out1, out2, out3);
587  SRARI_H4_SH(out0, out1, out2, out3, 6);
588  SAT_SH4_SH(out0, out1, out2, out3, 7);
589  out = PCKEV_XORI128_UB(out0, out1);
590  ST_UB(out, dst);
591  dst += dst_stride;
592  out = PCKEV_XORI128_UB(out2, out3);
593  ST_UB(out, dst);
594  dst += dst_stride;
595 
596  HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
597  mask3, filt0, filt1, filt2, filt3, out0,
598  out1, out2, out3);
599  SRARI_H4_SH(out0, out1, out2, out3, 6);
600  SAT_SH4_SH(out0, out1, out2, out3, 7);
601  out = PCKEV_XORI128_UB(out0, out1);
602  ST_UB(out, dst);
603  dst += dst_stride;
604  out = PCKEV_XORI128_UB(out2, out3);
605  ST_UB(out, dst);
606  dst += dst_stride;
607  }
608 }
609 
610 static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
611  uint8_t *dst, int32_t dst_stride,
612  const int8_t *filter, int32_t height)
613 {
614  uint32_t loop_cnt;
615  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
616  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
617  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
618  v16i8 vec11;
619  v8i16 out0, out1, out2, out3, out8, out9, filt;
620 
621  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
622  src -= 3;
623 
624  /* rearranging filter */
625  filt = LD_SH(filter);
626  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
627 
628  mask1 = mask0 + 2;
629  mask2 = mask0 + 4;
630  mask3 = mask0 + 6;
631  mask4 = mask0 + 8;
632  mask5 = mask0 + 10;
633  mask6 = mask0 + 12;
634  mask7 = mask0 + 14;
635 
636  for (loop_cnt = 16; loop_cnt--;) {
637  LD_SB2(src, src_stride, src0, src2);
638  LD_SB2(src + 16, src_stride, src1, src3);
639  XORI_B4_128_SB(src0, src1, src2, src3);
640  src += (2 * src_stride);
641  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
642  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
643  VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
644  DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
645  out8, out2, out9);
646  DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
647  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
648  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
649  VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
650  DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
651  out0, out8, out2, out9);
652  DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
653  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
654  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
655  VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
656  DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
657  out0, out8, out2, out9);
658  DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
659  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
660  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
661  VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
662  DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
663  out0, out8, out2, out9);
664  DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
665  SRARI_H4_SH(out0, out8, out2, out9, 6);
666  SRARI_H2_SH(out1, out3, 6);
667  SAT_SH4_SH(out0, out8, out2, out9, 7);
668  SAT_SH2_SH(out1, out3, 7);
669  out = PCKEV_XORI128_UB(out8, out9);
670  ST_D2(out, 0, 1, dst + 16, dst_stride);
671  out = PCKEV_XORI128_UB(out0, out1);
672  ST_UB(out, dst);
673  dst += dst_stride;
674  out = PCKEV_XORI128_UB(out2, out3);
675  ST_UB(out, dst);
676  dst += dst_stride;
677  }
678 }
679 
/*
 * Horizontal 8-tap filter for a 32-wide block, two rows per iteration.
 * Every row is loaded as four vectors (at offsets 0, 8, 16 and 24), filtered
 * through HORIZ_8TAP_8WID_4VECS_FILT and stored as two full vectors at dst
 * and dst + 16.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 * height:           number of rows; height >> 1 iterations are executed.
 *
 * NOTE(review): rounding, saturation and packing are done by macros defined
 * outside this part of the file; confirm their semantics there.
 */
680 static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
681  uint8_t *dst, int32_t dst_stride,
682  const int8_t *filter, int32_t height)
683 {
684  uint32_t loop_cnt;
685  v16u8 mask0, mask1, mask2, mask3, out;
686  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
687  v16i8 filt0, filt1, filt2, filt3;
688  v8i16 filt, out0, out1, out2, out3;
689 
 /* first row of the mask table (the "8 width" row) */
690  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
 /* start reading 3 bytes to the left of the nominal position */
691  src -= 3;
692 
693  /* rearranging filter */
694  filt = LD_SH(filter);
695  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
696 
697  mask1 = mask0 + 2;
698  mask2 = mask0 + 4;
699  mask3 = mask0 + 6;
700 
701  for (loop_cnt = (height >> 1); loop_cnt--;) {
 /* first row of the pair */
702  src0 = LD_SB(src);
703  src1 = LD_SB(src + 8);
704  src2 = LD_SB(src + 16);
705  src3 = LD_SB(src + 24);
706  src += src_stride;
707  XORI_B4_128_SB(src0, src1, src2, src3);
708 
 /* second row of the pair */
709  src4 = LD_SB(src);
710  src5 = LD_SB(src + 8);
711  src6 = LD_SB(src + 16);
712  src7 = LD_SB(src + 24);
713  src += src_stride;
714  XORI_B4_128_SB(src4, src5, src6, src7);
715 
716  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
717  mask3, filt0, filt1, filt2, filt3, out0,
718  out1, out2, out3);
719  SRARI_H4_SH(out0, out1, out2, out3, 6);
720  SAT_SH4_SH(out0, out1, out2, out3, 7);
721 
722  out = PCKEV_XORI128_UB(out0, out1);
723  ST_UB(out, dst);
724  out = PCKEV_XORI128_UB(out2, out3);
725  ST_UB(out, dst + 16);
726  dst += dst_stride;
727 
728  HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
729  mask3, filt0, filt1, filt2, filt3, out0,
730  out1, out2, out3);
731  SRARI_H4_SH(out0, out1, out2, out3, 6);
732  SAT_SH4_SH(out0, out1, out2, out3, 7);
733  out = PCKEV_XORI128_UB(out0, out1);
734  ST_UB(out, dst);
735  out = PCKEV_XORI128_UB(out2, out3);
736  ST_UB(out, dst + 16);
737  dst += dst_stride;
738  }
739 }
740 
741 static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
742  uint8_t *dst, int32_t dst_stride,
743  const int8_t *filter, int32_t height)
744 {
745  uint32_t loop_cnt;
746  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
747  v16i8 src4;
748  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
749  v8i16 filt, out0, out1, out2, out3;
750 
751  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
752  src -= 3;
753 
754  /* rearranging filter */
755  filt = LD_SH(filter);
756  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
757 
758  mask1 = mask0 + 2;
759  mask2 = mask0 + 4;
760  mask3 = mask0 + 6;
761  mask4 = mask0 + 8;
762  mask5 = mask0 + 10;
763  mask6 = mask0 + 12;
764  mask7 = mask0 + 14;
765 
766  for (loop_cnt = 64; loop_cnt--;) {
767  src0 = LD_SB(src);
768  src1 = LD_SB(src + 8);
769  src2 = LD_SB(src + 16);
770  src3 = LD_SB(src + 32);
771  src4 = LD_SB(src + 40);
772  src += src_stride;
773 
774  XORI_B4_128_SB(src0, src1, src2, src3);
775  src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
776 
777  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
778  vec0, vec1, vec2);
779  DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
780  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
781  vec0, vec1, vec2);
782  DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
783  out2 = __msa_dpadd_s_h(out2, vec2, filt1);
784  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
785  vec0, vec1, vec2);
786  DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
787  out2 = __msa_dpadd_s_h(out2, vec2, filt2);
788 
789  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
790  vec0, vec1, vec2);
791  DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
792  out2 = __msa_dpadd_s_h(out2, vec2, filt3);
793 
794  SRARI_H2_SH(out0, out1, 6);
795  out3 = __msa_srari_h(out2, 6);
796  SAT_SH3_SH(out0, out1, out3, 7);
797  out = PCKEV_XORI128_UB(out0, out1);
798  ST_UB(out, dst);
799 
800  VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
801  vec0, vec1, vec2);
802  DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
803  VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
804  vec0, vec1, vec2);
805  DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
806  out2 = __msa_dpadd_s_h(out2, vec2, filt1);
807  VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
808  vec0, vec1, vec2);
809  DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
810  out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811  VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
812  vec0, vec1, vec2);
813  DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
814  out2 = __msa_dpadd_s_h(out2, vec2, filt3);
815 
816  SRARI_H2_SH(out0, out1, 6);
817  out2 = __msa_srari_h(out2, 6);
818  SAT_SH3_SH(out0, out1, out2, 7);
819  out = PCKEV_XORI128_UB(out3, out0);
820  ST_UB(out, dst + 16);
821  out = PCKEV_XORI128_UB(out1, out2);
822  ST_UB(out, dst + 32);
823  dst += dst_stride;
824  }
825 }
826 
/*
 * Horizontal 8-tap filter for a 64-wide block, one row per iteration.
 * The row is loaded as eight vectors placed 8 bytes apart (LD_SB8 with a
 * step of 8). src0..src3 produce the stores at dst and dst + 16;
 * src4..src7 produce the stores at dst + 32 and dst + 48.
 *
 * src / src_stride: source block; reading starts 3 bytes before src.
 * dst / dst_stride: destination block.
 * filter:           filter coefficients, loaded as one vector via LD_SH.
 * height:           number of rows; one iteration per row.
 *
 * NOTE(review): both halves repeat the statement sequence of
 * HORIZ_8TAP_8WID_4VECS_FILT with locally declared temporaries. Rounding,
 * saturation and packing are done by macros defined outside this part of
 * the file; confirm their semantics there.
 */
827 static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
828  uint8_t *dst, int32_t dst_stride,
829  const int8_t *filter, int32_t height)
830 {
831  int32_t loop_cnt;
832  v16u8 mask0, mask1, mask2, mask3, out;
833  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
834  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
835  v16i8 filt0, filt1, filt2, filt3;
836  v8i16 res0, res1, res2, res3, filt;
837 
 /* first row of the mask table (the "8 width" row) */
838  mask0 = LD_UB(&ff_hevc_mask_arr[0]);
 /* start reading 3 bytes to the left of the nominal position */
839  src -= 3;
840 
841  /* rearranging filter */
842  filt = LD_SH(filter);
843  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
844 
845  mask1 = mask0 + 2;
846  mask2 = mask0 + 4;
847  mask3 = mask0 + 6;
848 
849  for (loop_cnt = height; loop_cnt--;) {
850  LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
851  src += src_stride;
852 
853  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
854 
 /* first half of the row: src0..src3, taps in order filt0, filt2, filt1, filt3 */
855  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
856  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
857  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
858  res1, res2, res3);
859  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
860  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
861  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
862  res1, res2, res3);
863  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
864  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
865  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
866  res1, res2, res3);
867  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
868  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
869  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
870  res1, res2, res3);
871 
872  SRARI_H4_SH(res0, res1, res2, res3, 6);
873  SAT_SH4_SH(res0, res1, res2, res3, 7);
874  out = PCKEV_XORI128_UB(res0, res1);
875  ST_UB(out, dst);
876  out = PCKEV_XORI128_UB(res2, res3);
877  ST_UB(out, dst + 16);
878 
 /* second half of the row: src4..src7, same tap order */
879  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
880  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
881  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
882  res1, res2, res3);
883  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
884  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
885  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
886  res1, res2, res3);
887  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
888  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
889  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
890  res1, res2, res3);
891  VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
892  VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
893  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
894  res1, res2, res3);
895 
896  SRARI_H4_SH(res0, res1, res2, res3, 6);
897  SAT_SH4_SH(res0, res1, res2, res3, 7);
898  out = PCKEV_XORI128_UB(res0, res1);
899  ST_UB(out, dst + 32);
900  out = PCKEV_XORI128_UB(res2, res3);
901  ST_UB(out, dst + 48);
902  dst += dst_stride;
903  }
904 }
905 
/*
 * Vertical filter for a narrow block; output goes through ST_W8, so this is
 * presumably the 4-byte-wide case named in the function -- confirm against the
 * store macro.
 *
 * src, src_stride: source pointer and row pitch; reading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, splatted into filt0..filt3.
 * height:          row count. The loop runs height >> 3 times and emits eight
 *                  rows per pass, so rows beyond a multiple of 8 are not
 *                  written by this function.
 */
906 static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
907  uint8_t *dst, int32_t dst_stride,
908  const int8_t *filter, int32_t height)
909 {
910  uint32_t loop_cnt;
911  v16u8 out0, out1;
912  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
913  v16i8 src11, src12, src13, src14;
914  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
915  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
916  v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
917  v16i8 src10998, filt0, filt1, filt2, filt3;
918  v8i16 filt, out10, out32, out54, out76;
919 
     /* Move up three rows before loading. */
920  src -= (3 * src_stride);
921 
922  filt = LD_SH(filter);
923  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
924 
     /* Prime the window with seven rows (src0..src6) ahead of the loop. */
925  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
926  src += (7 * src_stride);
927 
     /* Interleave neighbouring rows, then combine pairs of interleaves into
      * one vector each (src2110, src4332, src6554). */
928  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
929  src54_r, src21_r);
930  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
931  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
932  src4332, src6554);
933  XORI_B3_128_SB(src2110, src4332, src6554);
934 
935  for (loop_cnt = (height >> 3); loop_cnt--;) {
     /* Eight new rows per pass: src7..src10, then src11..src14. */
936  LD_SB4(src, src_stride, src7, src8, src9, src10);
937  src += (4 * src_stride);
938  LD_SB4(src, src_stride, src11, src12, src13, src14);
939  src += (4 * src_stride);
940 
941  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
942  src87_r, src98_r, src109_r);
943  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
944  src1110_r, src1211_r, src1312_r, src1413_r);
945  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
946  ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
947  src12111110, src14131312);
948  XORI_B2_128_SB(src8776, src10998);
949  XORI_B2_128_SB(src12111110, src14131312);
950 
     /* Four accumulators (out10, out32, out54, out76), each built from
      * filt0..filt3 applied to successive row-pair vectors. */
951  DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
952  DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
953  DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
954  DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
955  DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
956  DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
957  DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
958  DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
959  SRARI_H2_SH(out10, out32, 6);
960  SRARI_H2_SH(out54, out76, 6);
961  SAT_SH2_SH(out10, out32, 7);
962  SAT_SH2_SH(out54, out76, 7);
963  out0 = PCKEV_XORI128_UB(out10, out32);
964  out1 = PCKEV_XORI128_UB(out54, out76);
965  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
966  dst += (8 * dst_stride);
967 
     /* Carry the last three pair-vectors and the last loaded row forward so
      * the next pass continues where this one stopped. */
968  src2110 = src10998;
969  src4332 = src12111110;
970  src6554 = src14131312;
971  src6 = src14;
972  }
973 }
974 
/*
 * Vertical filter for a block stored through ST_D4 (presumably 8 bytes per
 * row, matching the "8w" in the name -- confirm against the store macro).
 *
 * src, src_stride: source pointer and row pitch; reading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, splatted into filt0..filt3.
 * height:          row count. The loop runs height >> 2 times with four rows
 *                  per pass, so a remainder of 1..3 rows is not written.
 */
975 static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
976  uint8_t *dst, int32_t dst_stride,
977  const int8_t *filter, int32_t height)
978 {
979  uint32_t loop_cnt;
980  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
981  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
982  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
983  v16u8 tmp0, tmp1;
984  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
985 
986  src -= (3 * src_stride);
987 
988  filt = LD_SH(filter);
989  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
990 
     /* Seven rows are loaded up front; each loop pass adds four more. */
991  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
992  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
993  src += (7 * src_stride);
994  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
995  src54_r, src21_r);
996  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
997 
998  for (loop_cnt = (height >> 2); loop_cnt--;) {
999  LD_SB4(src, src_stride, src7, src8, src9, src10);
1000  XORI_B4_128_SB(src7, src8, src9, src10);
1001  src += (4 * src_stride);
1002 
1003  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1004  src87_r, src98_r, src109_r);
      /* out0_r..out3_r are the four output rows; each accumulates four
       * interleaved row pairs against filt0..filt3. */
1005  DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1006  filt0, out0_r, out1_r, out2_r, out3_r);
1007  DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1008  filt1, out0_r, out1_r, out2_r, out3_r);
1009  DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1010  filt2, out0_r, out1_r, out2_r, out3_r);
1011  DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1012  filt3, out0_r, out1_r, out2_r, out3_r);
1013  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1014  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1015  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1016  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1017  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
1018  dst += (4 * dst_stride);
1019 
      /* Slide the window down four rows: the newest interleaves become the
       * oldest ones for the next pass, and src6 tracks the last loaded row. */
1020  src10_r = src54_r;
1021  src32_r = src76_r;
1022  src54_r = src98_r;
1023  src21_r = src65_r;
1024  src43_r = src87_r;
1025  src65_r = src109_r;
1026  src6 = src10;
1027  }
1028 }
1029 
/*
 * Vertical filter whose stores write a doubleword at dst and a word at
 * dst + 8 for every row (SD + SW; presumably 8 + 4 = 12 bytes, matching the
 * "12w" in the name -- confirm against the store macros).
 *
 * src, src_stride: source pointer and row pitch; reading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, splatted into filt0..filt3.
 * height:          NOTE(review): never read. The loop count is the constant 4
 *                  and each pass writes four rows, so exactly 16 rows are
 *                  produced regardless of this argument -- confirm callers
 *                  only use this for 16-row blocks.
 */
1030 static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1031  uint8_t *dst, int32_t dst_stride,
1032  const int8_t *filter, int32_t height)
1033 {
1034  uint32_t loop_cnt;
1035  uint32_t out2, out3;
1036  uint64_t out0, out1;
1037  v16u8 tmp0, tmp1, tmp2, tmp3;
1038  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1039  v16i8 filt0, filt1, filt2, filt3;
1040  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1042  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1043  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1044 
1045  src -= (3 * src_stride);
1046 
1047  filt = LD_SH(filter);
1048  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1049 
1050  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1051  src += (7 * src_stride);
1052 
1053  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1054 
      /* Both the ILVR (_r) and ILVL (_l) interleaves of adjacent rows are
       * kept; the _r and _l results are packed together before storing. */
1055  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1056  src54_r, src21_r);
1057  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1058  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1059  src54_l, src21_l);
1060  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1061 
      /* Fixed trip count: 4 passes x 4 rows. */
1062  for (loop_cnt = 4; loop_cnt--;) {
1063  LD_SB4(src, src_stride, src7, src8, src9, src10);
1064  XORI_B4_128_SB(src7, src8, src9, src10);
1065  src += (4 * src_stride);
1066 
1067  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1068  src87_r, src98_r, src109_r);
1069  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1070  src87_l, src98_l, src109_l);
1071  out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1072  filt1, filt2, filt3);
1073  out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1074  filt1, filt2, filt3);
1075  out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1076  filt1, filt2, filt3);
1077  out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1078  filt1, filt2, filt3);
1079  out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1080  filt1, filt2, filt3);
1081  out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1082  filt1, filt2, filt3);
1083  out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1084  filt1, filt2, filt3);
1085  out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1086  filt1, filt2, filt3);
1087  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1088  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1089  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1090  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1091  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1092  out3_r, tmp0, tmp1, tmp2, tmp3);
1093  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1094 
      /* Rows 0 and 1: doubleword element 0 and word element 2 of tmp0/tmp1
       * are extracted and written at dst and dst + 8. */
1095  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1096  out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1097  out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1098  out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1099  SD(out0, dst);
1100  SW(out2, (dst + 8));
1101  dst += dst_stride;
1102  SD(out1, dst);
1103  SW(out3, (dst + 8));
1104  dst += dst_stride;
      /* Rows 2 and 3: same extraction from tmp2/tmp3. */
1105  out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1106  out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1107  out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1108  out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1109  SD(out0, dst);
1110  SW(out2, (dst + 8));
1111  dst += dst_stride;
1112  SD(out1, dst);
1113  SW(out3, (dst + 8));
1114  dst += dst_stride;
1115 
      /* Slide the row window down by four for the next pass. */
1116  src10_r = src54_r;
1117  src32_r = src76_r;
1118  src54_r = src98_r;
1119  src21_r = src65_r;
1120  src43_r = src87_r;
1121  src65_r = src109_r;
1122  src10_l = src54_l;
1123  src32_l = src76_l;
1124  src54_l = src98_l;
1125  src21_l = src65_l;
1126  src43_l = src87_l;
1127  src65_l = src109_l;
1128  src6 = src10;
1129  }
1130 }
1131 
/*
 * Vertical filter for a block one full vector wide: each pass stores four
 * v16u8 vectors, one per row, through ST_UB4.
 *
 * src, src_stride: source pointer and row pitch; reading starts three rows
 *                  above src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, splatted into filt0..filt3.
 * height:          row count. The loop runs height >> 2 times with four rows
 *                  per pass, so a remainder of 1..3 rows is not written.
 */
1132 static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1133  uint8_t *dst, int32_t dst_stride,
1134  const int8_t *filter, int32_t height)
1135 {
1136  uint32_t loop_cnt;
1137  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1138  v16i8 filt0, filt1, filt2, filt3;
1139  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1140  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1141  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1142  v16u8 tmp0, tmp1, tmp2, tmp3;
1143  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1144 
1145  src -= (3 * src_stride);
1146 
1147  filt = LD_SH(filter);
1148  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1149 
      /* Seven rows primed before the loop; _r / _l hold the ILVR / ILVL
       * interleaves of adjacent rows. */
1150  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1151  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1152  src += (7 * src_stride);
1153  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1154  src54_r, src21_r);
1155  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1156  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1157  src54_l, src21_l);
1158  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1159 
1160  for (loop_cnt = (height >> 2); loop_cnt--;) {
1161  LD_SB4(src, src_stride, src7, src8, src9, src10);
1162  XORI_B4_128_SB(src7, src8, src9, src10);
1163  src += (4 * src_stride);
1164 
1165  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1166  src87_r, src98_r, src109_r);
1167  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1168  src87_l, src98_l, src109_l);
1169  out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1170  filt1, filt2, filt3);
1171  out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1172  filt1, filt2, filt3);
1173  out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1174  filt1, filt2, filt3);
1175  out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1176  filt1, filt2, filt3);
1177  out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1178  filt1, filt2, filt3);
1179  out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1180  filt1, filt2, filt3);
1181  out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1182  filt1, filt2, filt3);
1183  out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1184  filt1, filt2, filt3);
1185  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1186  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1187  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1188  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      /* Pack each row's _l and _r results into one vector, then store the
       * four rows. */
1189  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1190  out3_r, tmp0, tmp1, tmp2, tmp3);
1191  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1192  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1193  dst += (4 * dst_stride);
1194 
      /* Slide the row window down by four for the next pass. */
1195  src10_r = src54_r;
1196  src32_r = src76_r;
1197  src54_r = src98_r;
1198  src21_r = src65_r;
1199  src43_r = src87_r;
1200  src65_r = src109_r;
1201  src10_l = src54_l;
1202  src32_l = src76_l;
1203  src54_l = src98_l;
1204  src21_l = src65_l;
1205  src43_l = src87_l;
1206  src65_l = src109_l;
1207  src6 = src10;
1208  }
1209 }
1210 
/*
 * NOTE(review): the line carrying this function's name and its first
 * parameters is absent from this listing. The seven-argument calls in the
 * 24w/32w/48w/64w wrappers below suggest it is
 * common_vt_8t_16w_mult_msa(src, src_stride, ...) -- confirm against the
 * original file.
 *
 * Vertical filter for widths handled as 16-byte column strips.
 *
 * dst, dst_stride: destination pointer and row pitch.
 * filter:          coefficients, splatted into filt0..filt3.
 * height:          row count; the inner loop runs height >> 2 times with four
 *                  rows per pass, so a remainder of 1..3 rows is not written.
 * width:           the outer loop runs width >> 4 times, advancing src and
 *                  dst by 16 each time; a remainder below 16 is not handled
 *                  here.
 */
1212  uint8_t *dst, int32_t dst_stride,
1213  const int8_t *filter, int32_t height,
1214  int32_t width)
1215 {
1216  uint8_t *src_tmp;
1217  uint8_t *dst_tmp;
1218  uint32_t loop_cnt, cnt;
1219  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1220  v16i8 filt0, filt1, filt2, filt3;
1221  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1222  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1223  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1224  v16u8 tmp0, tmp1, tmp2, tmp3;
1225  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1226 
1227  src -= (3 * src_stride);
1228 
1229  filt = LD_SH(filter);
1230  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1231 
      /* One pass per 16-byte column strip. */
1232  for (cnt = (width >> 4); cnt--;) {
      /* Walk this strip with private cursors so src/dst keep pointing at the
       * strip's top for the += 16 at the bottom. */
1233  src_tmp = src;
1234  dst_tmp = dst;
1235 
1236  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1237  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1238  src_tmp += (7 * src_stride);
1239  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1240  src32_r, src54_r, src21_r);
1241  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1242  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1243  src32_l, src54_l, src21_l);
1244  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1245 
1246  for (loop_cnt = (height >> 2); loop_cnt--;) {
1247  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1248  XORI_B4_128_SB(src7, src8, src9, src10);
1249  src_tmp += (4 * src_stride);
1250  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1251  src87_r, src98_r, src109_r);
1252  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1253  src87_l, src98_l, src109_l);
1254  out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
1255  filt0, filt1, filt2, filt3);
1256  out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
1257  filt0, filt1, filt2, filt3);
1258  out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
1259  filt0, filt1, filt2, filt3);
1260  out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
1261  filt0, filt1, filt2, filt3);
1262  out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
1263  filt0, filt1, filt2, filt3);
1264  out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
1265  filt0, filt1, filt2, filt3);
1266  out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
1267  filt0, filt1, filt2, filt3);
1268  out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
1269  filt0, filt1, filt2, filt3);
1270  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1271  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1272  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1273  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1274  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1275  out3_r, tmp0, tmp1, tmp2, tmp3);
1276  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1277  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1278  dst_tmp += (4 * dst_stride);
1279 
      /* Slide the row window down by four for the next pass. */
1280  src10_r = src54_r;
1281  src32_r = src76_r;
1282  src54_r = src98_r;
1283  src21_r = src65_r;
1284  src43_r = src87_r;
1285  src65_r = src109_r;
1286  src10_l = src54_l;
1287  src32_l = src76_l;
1288  src54_l = src98_l;
1289  src21_l = src65_l;
1290  src43_l = src87_l;
1291  src65_l = src109_l;
1292  src6 = src10;
1293  }
1294 
      /* Next 16-byte column strip. */
1295  src += 16;
1296  dst += 16;
1297  }
1298 }
1299 
/*
 * Vertical filter for the 24-wide case, split into two calls: the strip helper
 * with width 16 for columns 0..15, then common_vt_8t_8w_msa on src + 16 /
 * dst + 16 for the remaining columns. All other arguments are forwarded
 * unchanged.
 */
1300 static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1301  uint8_t *dst, int32_t dst_stride,
1302  const int8_t *filter, int32_t height)
1303 {
1304  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1305  16);
1306 
1307  common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
1308  height);
1309 }
1310 
/*
 * Vertical filter for the 32-wide case: forwards every argument to
 * common_vt_8t_16w_mult_msa with a fixed width of 32.
 */
1311 static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1312  uint8_t *dst, int32_t dst_stride,
1313  const int8_t *filter, int32_t height)
1314 {
1315  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1316  32);
1317 }
1318 
/*
 * Vertical filter for the 48-wide case: forwards every argument to
 * common_vt_8t_16w_mult_msa with a fixed width of 48.
 */
1319 static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1320  uint8_t *dst, int32_t dst_stride,
1321  const int8_t *filter, int32_t height)
1322 {
1323  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1324  48);
1325 }
1326 
/*
 * Vertical filter for the 64-wide case: forwards every argument to
 * common_vt_8t_16w_mult_msa with a fixed width of 64.
 */
1327 static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1328  uint8_t *dst, int32_t dst_stride,
1329  const int8_t *filter, int32_t height)
1330 {
1331  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1332  64);
1333 }
1334 
/*
 * NOTE(review): the line carrying this function's name and first parameter is
 * absent from this listing. The body uses `src`, the second ("4 width cases")
 * row of ff_hevc_mask_arr and ST_W8 stores, so this is presumably the 4-wide
 * horizontal+vertical variant -- confirm the name against the original file.
 *
 * Two-pass filter: rows are first filtered with the filter_x coefficients
 * (filt0..filt3), and the intermediate results are then filtered with the
 * filter_y coefficients (filt_h0..filt_h3).
 *
 * src_stride:      source row pitch; reading starts three rows above and
 *                  three bytes left of src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter_x:        first-pass coefficients.
 * filter_y:        second-pass coefficients (widened with UNPCK_R_SB_SH).
 * height:          row count. The loop runs height >> 3 times with eight rows
 *                  per pass, so rows beyond a multiple of 8 are not written.
 */
1336  int32_t src_stride,
1337  uint8_t *dst,
1338  int32_t dst_stride,
1339  const int8_t *filter_x,
1340  const int8_t *filter_y,
1341  int32_t height)
1342 {
1343  uint32_t loop_cnt;
1344  v16u8 out0, out1;
1345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1346  v16i8 src9, src10, src11, src12, src13, src14;
1347  v8i16 filt0, filt1, filt2, filt3;
1348  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1349  v16i8 mask1, mask2, mask3;
1350  v8i16 filter_vec;
1351  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1352  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1353  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1354  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1355  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1356  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
      /* Second row of the mask table (offset 16). */
1357  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1358 
      /* Back up three rows and three columns. */
1359  src -= ((3 * src_stride) + 3);
1360  filter_vec = LD_SH(filter_x);
1361  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1362 
1363  filter_vec = LD_SH(filter_y);
1364  UNPCK_R_SB_SH(filter_vec, filter_vec);
1365 
1366  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1367 
1368  mask1 = mask0 + 2;
1369  mask2 = mask0 + 4;
1370  mask3 = mask0 + 6;
1371 
1372  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373  src += (7 * src_stride);
1374  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1375 
      /* Each shuffle takes two rows three apart (0/3, 1/4, 2/5, 3/6); the
       * dstNM names below follow that pairing. */
1376  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1377  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1378  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1379  vec8, vec9, vec10, vec11);
1380  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1381  vec12, vec13, vec14, vec15);
1382 
1383  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1384  filt3);
1385  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1386  filt3);
1387  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1388  filt3);
1389  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1390  filt3);
1391 
1392  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1393  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1394  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1395 
      /* Replicate doubleword element 1 of dst63 for pairing with the first
       * row loaded inside the loop. */
1396  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1397 
1398  for (loop_cnt = height >> 3; loop_cnt--;) {
1399  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1400  src14);
1401  src += (8 * src_stride);
1402  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1403 
      /* Rows paired four apart this time: 7/11, 8/12, 9/13, 10/14. */
1404  VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1405  vec0, vec1, vec2, vec3);
1406  VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1407  vec4, vec5, vec6, vec7);
1408  VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1409  vec8, vec9, vec10, vec11);
1410  VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1411  vec12, vec13, vec14, vec15);
1412 
1413  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1414  filt3);
1415  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1416  filt3);
1417  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1418  filt2, filt3);
1419  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1420  filt2, filt3);
1421 
1422  dst76_r = __msa_ilvr_h(dst117, dst66);
1423  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1424  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1425  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
      /* dst117 is overwritten with its own doubleword element 1 replicated,
       * then interleaved with dst1410 to form dst1110_r. */
1426  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1427  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1428 
      /* Second pass: eight output rows, each from four consecutive
       * interleaved pairs and filt_h0..filt_h3. */
1429  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1430  filt_h1, filt_h2, filt_h3);
1431  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1432  filt_h1, filt_h2, filt_h3);
1433  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1434  filt_h1, filt_h2, filt_h3);
1435  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1436  filt_h1, filt_h2, filt_h3);
1437  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1438  filt_h1, filt_h2, filt_h3);
1439  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1440  filt_h1, filt_h2, filt_h3);
1441  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1442  filt_h1, filt_h2, filt_h3);
1443  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1444  filt_h0, filt_h1, filt_h2, filt_h3);
1445 
      /* Two scaling steps in sequence: SRA_4V with 6, then SRARI_W4_SW with
       * another 6, followed by saturation with argument 7. */
1446  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1447  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1448  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1449  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1450  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1451  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1452  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1453  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1454  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1455  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1456  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1457  dst += (8 * dst_stride);
1458 
      /* Carry the newest interleaves forward as the oldest ones for the next
       * pass; dst66 is rebuilt from dst1410. */
1459  dst10_r = dst98_r;
1460  dst32_r = dst1110_r;
1461  dst54_r = dst1312_r;
1462  dst21_r = dst109_r;
1463  dst43_r = dst1211_r;
1464  dst65_r = dst1413_r;
1465  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1466  }
1467 }
1468 
/*
 * NOTE(review): two lines of this definition are absent from the listing: the
 * one with the function name and first parameter, and the one that closes the
 * parameter list (the body reads `src`, `height` and `width`, none of which
 * appear below). The eight-argument call later in the file suggests the name
 * hevc_hv_uni_8t_8multx2mult_msa -- confirm against the original file.
 *
 * Two-pass filter for widths handled as 8-column strips and heights handled
 * two rows at a time: the outer loop runs width >> 3 times (src and dst
 * advance by 8 per strip) and the inner loop runs height >> 1 times, writing
 * two rows per pass through ST_D2.
 *
 * src_stride:      source row pitch; reading starts three rows above and
 *                  three bytes left of src.
 * dst, dst_stride: destination pointer and row pitch.
 * filter_x:        first-pass coefficients (filt0..filt3).
 * filter_y:        second-pass coefficients (filt_h0..filt_h3).
 */
1470  int32_t src_stride,
1471  uint8_t *dst,
1472  int32_t dst_stride,
1473  const int8_t *filter_x,
1474  const int8_t *filter_y,
1476 {
1477  uint32_t loop_cnt, cnt;
1478  uint8_t *src_tmp;
1479  uint8_t *dst_tmp;
1480  v16u8 out;
1481  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1482  v8i16 filt0, filt1, filt2, filt3;
1483  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484  v16i8 mask1, mask2, mask3;
1485  v8i16 filter_vec;
1486  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
      /* First row of the mask table ("8 width cases"). */
1494  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1495 
1496  src -= ((3 * src_stride) + 3);
1497 
1498  filter_vec = LD_SH(filter_x);
1499  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500 
1501  filter_vec = LD_SH(filter_y);
1502  UNPCK_R_SB_SH(filter_vec, filter_vec);
1503 
1504  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1505 
1506  mask1 = mask0 + 2;
1507  mask2 = mask0 + 4;
1508  mask3 = mask0 + 6;
1509 
      /* One pass per 8-column strip. */
1510  for (cnt = width >> 3; cnt--;) {
1511  src_tmp = src;
1512  dst_tmp = dst;
1513 
1514  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515  src_tmp += (7 * src_stride);
1516  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1517 
1518  /* row 0 row 1 row 2 row 3 */
1519  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520  vec0, vec1, vec2, vec3);
1521  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522  vec4, vec5, vec6, vec7);
1523  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524  vec8, vec9, vec10, vec11);
1525  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526  vec12, vec13, vec14, vec15);
1527  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1528  filt3);
1529  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1530  filt3);
1531  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1532  filt3);
1533  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1534  filt2, filt3);
1535 
      /* Rows 4..6 complete the seven first-pass results (dst0..dst6) that
       * the inner loop starts from. */
1536  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537  vec0, vec1, vec2, vec3);
1538  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539  vec4, vec5, vec6, vec7);
1540  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541  vec8, vec9, vec10, vec11);
1542  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1543  filt3);
1544  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1545  filt3);
1546  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1547  filt3);
1548 
1549  for (loop_cnt = height >> 1; loop_cnt--;) {
1550  LD_SB2(src_tmp, src_stride, src7, src8);
1551  XORI_B2_128_SB(src7, src8);
1552  src_tmp += 2 * src_stride;
1553 
      /* Interleaves are rebuilt from dst0..dst6 on every pass because
       * those are shifted at the bottom of the loop. */
1554  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555  dst10_r, dst32_r, dst54_r, dst21_r);
1556  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557  dst10_l, dst32_l, dst54_l, dst21_l);
1558  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1560 
      /* First output row of the pair: first pass on src7, then second pass. */
1561  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562  vec0, vec1, vec2, vec3);
1563  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1564  filt2, filt3);
1565 
1566  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1567  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1568  filt_h0, filt_h1, filt_h2, filt_h3);
1569  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1570  filt_h0, filt_h1, filt_h2, filt_h3);
1571  dst0_r >>= 6;
1572  dst0_l >>= 6;
1573 
      /* Second output row of the pair: same steps with src8. */
1574  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575  vec0, vec1, vec2, vec3);
1576  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1577  filt2, filt3);
1578 
1579  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1580  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1581  filt_h0, filt_h1, filt_h2, filt_h3);
1582  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1583  filt_h0, filt_h1, filt_h2, filt_h3);
1584  dst1_r >>= 6;
1585  dst1_l >>= 6;
      /* A further SRARI with 6 follows the plain >> 6 above. */
1586  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1587  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1588 
      /* dst0/dst1 serve as scratch for the packed output here; they are
       * reloaded from dst2/dst3 just below. */
1589  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1590  out = PCKEV_XORI128_UB(dst0, dst1);
1591  ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592  dst_tmp += (2 * dst_stride);
1593 
      /* Slide the seven-row first-pass window down by two rows. */
1594  dst0 = dst2;
1595  dst1 = dst3;
1596  dst2 = dst4;
1597  dst3 = dst5;
1598  dst4 = dst6;
1599  dst5 = dst7;
1600  dst6 = dst8;
1601  }
1602 
      /* Next 8-column strip. */
1603  src += 8;
1604  dst += 8;
1605  }
1606 }
1607 
/*
 * NOTE(review): the line carrying this function's name and first parameter is
 * absent from this listing; since it passes a fixed width of 8 it is
 * presumably the 8-wide entry point -- confirm the name against the original
 * file.
 *
 * Forwards every argument to hevc_hv_uni_8t_8multx2mult_msa and appends a
 * width of 8.
 */
1609  int32_t src_stride,
1610  uint8_t *dst,
1611  int32_t dst_stride,
1612  const int8_t *filter_x,
1613  const int8_t *filter_y,
1614  int32_t height)
1615 {
1616  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1617  filter_x, filter_y, height, 8);
1618 }
1619 
1621  int32_t src_stride,
1622  uint8_t *dst,
1623  int32_t dst_stride,
1624  const int8_t *filter_x,
1625  const int8_t *filter_y,
1626  int32_t height)
1627 {
1628  uint32_t loop_cnt;
1629  uint8_t *src_tmp, *dst_tmp;
1630  v16u8 out0, out1;
1631  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632  v16i8 src11, src12, src13, src14;
1633  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642  v8i16 dst1413_r, dst87_l, filter_vec;
1643  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644  v4i32 dst0_l, dst1_l;
1645 
1646  src -= ((3 * src_stride) + 3);
1647 
1648  filter_vec = LD_SH(filter_x);
1649  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1650 
1651  filter_vec = LD_SH(filter_y);
1652  UNPCK_R_SB_SH(filter_vec, filter_vec);
1653 
1654  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1655 
1656  mask0 = LD_SB(ff_hevc_mask_arr);
1657  mask1 = mask0 + 2;
1658  mask2 = mask0 + 4;
1659  mask3 = mask0 + 6;
1660 
1661  src_tmp = src;
1662  dst_tmp = dst;
1663 
1664  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665  src_tmp += (7 * src_stride);
1666  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1667 
1668  /* row 0 row 1 row 2 row 3 */
1669  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1672  vec11);
1673  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1674  vec15);
1675  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1676  filt3);
1677  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1678  filt3);
1679  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1680  filt3);
1681  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1682  filt2, filt3);
1683 
1684  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1687  vec11);
1688  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1689  filt3);
1690  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1691  filt3);
1692  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1693  filt3);
1694 
1695  for (loop_cnt = 8; loop_cnt--;) {
1696  LD_SB2(src_tmp, src_stride, src7, src8);
1697  XORI_B2_128_SB(src7, src8);
1698  src_tmp += 2 * src_stride;
1699 
1700  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701  dst32_r, dst54_r, dst21_r);
1702  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703  dst32_l, dst54_l, dst21_l);
1704  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1706 
1707  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1708  vec3);
1709  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1710  filt3);
1711 
1712  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1713  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1714  filt_h0, filt_h1, filt_h2, filt_h3);
1715  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1716  filt_h0, filt_h1, filt_h2, filt_h3);
1717  dst0_r >>= 6;
1718  dst0_l >>= 6;
1719 
1720  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1721  vec3);
1722  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1723  filt3);
1724 
1725  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1726  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1727  filt_h0, filt_h1, filt_h2, filt_h3);
1728  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1729  filt_h0, filt_h1, filt_h2, filt_h3);
1730  dst1_r >>= 6;
1731  dst1_l >>= 6;
1732  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1733  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1734 
1735  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1736  out0 = PCKEV_XORI128_UB(dst0, dst1);
1737  ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738  dst_tmp += (2 * dst_stride);
1739 
1740  dst0 = dst2;
1741  dst1 = dst3;
1742  dst2 = dst4;
1743  dst3 = dst5;
1744  dst4 = dst6;
1745  dst5 = dst7;
1746  dst6 = dst8;
1747  }
1748 
1749  src += 8;
1750  dst += 8;
1751 
1752  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1753  mask5 = mask4 + 2;
1754  mask6 = mask4 + 4;
1755  mask7 = mask4 + 6;
1756 
1757  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758  src += (7 * src_stride);
1759  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1760 
1761  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1764  vec11);
1765  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1766  vec15);
1767 
1768  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1769  filt3);
1770  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1771  filt3);
1772  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1773  filt3);
1774  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1775  filt3);
1776 
1777  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1778  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1779  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1780 
1781  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1782 
1783  for (loop_cnt = 2; loop_cnt--;) {
1784  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1785  src14);
1786  src += (8 * src_stride);
1787  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1788 
1789  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1790  vec3);
1791  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1792  vec7);
1793  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1794  vec11);
1795  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1796  vec14, vec15);
1797 
1798  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1799  filt3);
1800  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1801  filt3);
1802  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1803  filt2, filt3);
1804  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1805  filt2, filt3);
1806 
1807  dst76_r = __msa_ilvr_h(dst117, dst66);
1808  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1809  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1810  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1813 
1814  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815  filt_h1, filt_h2, filt_h3);
1816  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817  filt_h1, filt_h2, filt_h3);
1818  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819  filt_h1, filt_h2, filt_h3);
1820  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821  filt_h1, filt_h2, filt_h3);
1822  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823  filt_h1, filt_h2, filt_h3);
1824  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825  filt_h1, filt_h2, filt_h3);
1826  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827  filt_h1, filt_h2, filt_h3);
1828  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1829  filt_h0, filt_h1, filt_h2, filt_h3);
1830 
1831  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1833  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1834  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1839  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1840  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1841  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842  dst += (8 * dst_stride);
1843 
1844  dst10_r = dst98_r;
1845  dst32_r = dst1110_r;
1846  dst54_r = dst1312_r;
1847  dst21_r = dst109_r;
1848  dst43_r = dst1211_r;
1849  dst65_r = dst1413_r;
1850  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1851  }
1852 }
1853 
/*
 * NOTE(review): the opening line of this definition (storage class, return
 * type, function name and the `src` parameter) is absent from this listing;
 * restore it from the upstream file before building.
 *
 * The body only forwards its arguments, unchanged, to
 * hevc_hv_uni_8t_8multx2mult_msa() with a final argument of 16
 * (presumably the block width in pixels - confirm against the callee).
 */
1855  int32_t src_stride,
1856  uint8_t *dst,
1857  int32_t dst_stride,
1858  const int8_t *filter_x,
1859  const int8_t *filter_y,
1860  int32_t height)
1861 {
1862  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1863  filter_x, filter_y, height, 16);
1864 }
1865 
/*
 * NOTE(review): the opening line of this definition (storage class, return
 * type, function name and the `src` parameter) is absent from this listing;
 * restore it from the upstream file before building.
 *
 * The body only forwards its arguments, unchanged, to
 * hevc_hv_uni_8t_8multx2mult_msa() with a final argument of 24
 * (presumably the block width in pixels - confirm against the callee).
 */
1867  int32_t src_stride,
1868  uint8_t *dst,
1869  int32_t dst_stride,
1870  const int8_t *filter_x,
1871  const int8_t *filter_y,
1872  int32_t height)
1873 {
1874  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1875  filter_x, filter_y, height, 24);
1876 }
1877 
/*
 * NOTE(review): the opening line of this definition (storage class, return
 * type, function name and the `src` parameter) is absent from this listing;
 * restore it from the upstream file before building.
 *
 * The body only forwards its arguments, unchanged, to
 * hevc_hv_uni_8t_8multx2mult_msa() with a final argument of 32
 * (presumably the block width in pixels - confirm against the callee).
 */
1879  int32_t src_stride,
1880  uint8_t *dst,
1881  int32_t dst_stride,
1882  const int8_t *filter_x,
1883  const int8_t *filter_y,
1884  int32_t height)
1885 {
1886  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1887  filter_x, filter_y, height, 32);
1888 }
1889 
/*
 * NOTE(review): the opening line of this definition (storage class, return
 * type, function name and the `src` parameter) is absent from this listing;
 * restore it from the upstream file before building.
 *
 * The body only forwards its arguments, unchanged, to
 * hevc_hv_uni_8t_8multx2mult_msa() with a final argument of 48
 * (presumably the block width in pixels - confirm against the callee).
 */
1891  int32_t src_stride,
1892  uint8_t *dst,
1893  int32_t dst_stride,
1894  const int8_t *filter_x,
1895  const int8_t *filter_y,
1896  int32_t height)
1897 {
1898  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1899  filter_x, filter_y, height, 48);
1900 }
1901 
/*
 * NOTE(review): the opening line of this definition (storage class, return
 * type, function name and the `src` parameter) is absent from this listing;
 * restore it from the upstream file before building.
 *
 * The body only forwards its arguments, unchanged, to
 * hevc_hv_uni_8t_8multx2mult_msa() with a final argument of 64
 * (presumably the block width in pixels - confirm against the callee).
 */
1903  int32_t src_stride,
1904  uint8_t *dst,
1905  int32_t dst_stride,
1906  const int8_t *filter_x,
1907  const int8_t *filter_y,
1908  int32_t height)
1909 {
1910  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1911  filter_x, filter_y, height, 64);
1912 }
1913 
/*
 * Horizontal 4-tap filter for a block 4 pixels wide and 2 rows high.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; two 32-bit words of the result vector are
 *                  stored via ST_W2 (presumably one word per output row).
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
1914 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915  uint8_t *dst, int32_t dst_stride,
1916  const int8_t *filter)
1917 {
1918  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919  v16u8 out;
1920  v8i16 filt, res0;
1921 
  /* Offset 16 is the first "4 width cases" row of the shuffle table. */
1922  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923  src -= 1;
1924 
1925  /* rearranging filter */
1926  filt = LD_SH(filter);
1927  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1928 
  /* Same shuffle pattern with every byte index advanced by two. */
1929  mask1 = mask0 + 2;
1930 
1931  LD_SB2(src, src_stride, src0, src1);
1932  XORI_B2_128_SB(src0, src1);
  /* Both rows are shuffled together: (src0, src1) feeds each mask. */
1933  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  /* Rounding right shift by 6, then saturation with bit-position argument 7. */
1935  res0 = __msa_srari_h(res0, 6);
1936  res0 = __msa_sat_s_h(res0, 7);
1937  out = PCKEV_XORI128_UB(res0, res0);
1938  ST_W2(out, 0, 1, dst, dst_stride);
1939 }
1940 
/*
 * Horizontal 4-tap filter for a block 4 pixels wide and 4 rows high.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; four 32-bit words of the result vector are
 *                  stored via ST_W4 (presumably one word per output row).
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
1941 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1942  uint8_t *dst, int32_t dst_stride,
1943  const int8_t *filter)
1944 {
1945  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1946  v8i16 filt, out0, out1;
1947  v16u8 out;
1948 
  /* Offset 16 is the first "4 width cases" row of the shuffle table. */
1949  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1950  src -= 1;
1951 
1952  /* rearranging filter */
1953  filt = LD_SH(filter);
1954  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1955 
  /* Same shuffle pattern with every byte index advanced by two. */
1956  mask1 = mask0 + 2;
1957 
1958  LD_SB4(src, src_stride, src0, src1, src2, src3);
1959  XORI_B4_128_SB(src0, src1, src2, src3);
  /* Four input rows produce two result vectors (out0, out1). */
1960  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1961  filt0, filt1, out0, out1);
  /* Rounding right shift by 6, then saturation with bit-position argument 7. */
1962  SRARI_H2_SH(out0, out1, 6);
1963  SAT_SH2_SH(out0, out1, 7);
1964  out = PCKEV_XORI128_UB(out0, out1);
1965  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1966 }
1967 
/*
 * Horizontal 4-tap filter for a block 4 pixels wide and 8 rows high.
 *
 * The eight rows are loaded and filtered as two groups of four; rounding,
 * saturation and the stores are done once both groups are filtered.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; the first ST_W4 writes at dst, the second at
 *                  dst + 4 * dst_stride.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
1968 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1969  uint8_t *dst, int32_t dst_stride,
1970  const int8_t *filter)
1971 {
1972  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1973  v16u8 out;
1974  v8i16 filt, out0, out1, out2, out3;
1975 
  /* Offset 16 is the first "4 width cases" row of the shuffle table. */
1976  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1977  src -= 1;
1978 
1979  /* rearranging filter */
1980  filt = LD_SH(filter);
1981  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1982 
1983  mask1 = mask0 + 2;
1984 
  /* First group of four rows -> out0, out1. */
1985  LD_SB4(src, src_stride, src0, src1, src2, src3);
1986  src += (4 * src_stride);
1987 
1988  XORI_B4_128_SB(src0, src1, src2, src3);
1989  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1990  filt0, filt1, out0, out1);
  /* Second group of four rows reuses src0..src3 -> out2, out3. */
1991  LD_SB4(src, src_stride, src0, src1, src2, src3);
1992  XORI_B4_128_SB(src0, src1, src2, src3);
1993  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1994  filt0, filt1, out2, out3);
1995  SRARI_H4_SH(out0, out1, out2, out3, 6);
1996  SAT_SH4_SH(out0, out1, out2, out3, 7);
1997  out = PCKEV_XORI128_UB(out0, out1);
1998  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1999  out = PCKEV_XORI128_UB(out2, out3);
2000  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2001 }
2002 
/*
 * Horizontal 4-tap filter for a block 4 pixels wide and 16 rows high.
 *
 * The work is done as two identical passes of eight rows; each pass loads
 * eight rows, filters them as two groups of four and issues two ST_W4
 * stores (at dst and at dst + 4 * dst_stride).
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; advanced by 8 * dst_stride between passes.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
2003 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2004  uint8_t *dst, int32_t dst_stride,
2005  const int8_t *filter)
2006 {
2007  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2008  v16i8 filt0, filt1, mask0, mask1;
2009  v16u8 out;
2010  v8i16 filt, out0, out1, out2, out3;
2011 
  /* Offset 16 is the first "4 width cases" row of the shuffle table. */
2012  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013  src -= 1;
2014 
2015  /* rearranging filter */
2016  filt = LD_SH(filter);
2017  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018 
2019  mask1 = mask0 + 2;
2020 
  /* Pass 1: first eight rows. */
2021  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022  src += (8 * src_stride);
2023  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2024  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2025  filt0, filt1, out0, out1);
2026  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2027  filt0, filt1, out2, out3);
2028  SRARI_H4_SH(out0, out1, out2, out3, 6);
2029  SAT_SH4_SH(out0, out1, out2, out3, 7);
2030  out = PCKEV_XORI128_UB(out0, out1);
2031  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2032  out = PCKEV_XORI128_UB(out2, out3);
2033  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034  dst += (8 * dst_stride);
2035 
  /* Pass 2: next eight rows, same sequence as pass 1. */
2036  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037  src += (8 * src_stride);
2038  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2039  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2040  filt0, filt1, out0, out1);
2041  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2042  filt0, filt1, out2, out3);
2043  SRARI_H4_SH(out0, out1, out2, out3, 6);
2044  SAT_SH4_SH(out0, out1, out2, out3, 7);
2045  out = PCKEV_XORI128_UB(out0, out1);
2046  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2047  out = PCKEV_XORI128_UB(out2, out3);
2048  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2049 }
2050 
/*
 * Horizontal 4-tap filter entry point for 4-pixel-wide blocks.
 *
 * Selects the fixed-height kernel matching `height` (2, 4, 8 or 16 rows)
 * and forwards the remaining arguments unchanged. For any other height no
 * kernel is called and dst is left untouched.
 */
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    switch (height) {
    case 2:
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 4:
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 8:
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 16:
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        /* No kernel for this height: intentionally a no-op. */
        break;
    }
}
2065 
/*
 * Horizontal 4-tap filter for 6-pixel-wide blocks.
 *
 * NOTE: `height` is never read. The function always processes exactly two
 * batches of four rows (eight rows in total).
 *
 * Each batch filters with the 8-wide helper and then stores, per pair of
 * rows, two 32-bit words at dst and two 16-bit halfwords at dst + 4
 * (presumably the leftmost 4 bytes plus the next 2 bytes of each row).
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination rows.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
2066 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2067  uint8_t *dst, int32_t dst_stride,
2068  const int8_t *filter, int32_t height)
2069 {
2070  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2071  v16u8 out4, out5;
2072  v8i16 filt, out0, out1, out2, out3;
2073 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2074  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2075  src -= 1;
2076 
2077  /* rearranging filter */
2078  filt = LD_SH(filter);
2079  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2080 
2081  mask1 = mask0 + 2;
2082 
  /* Batch 1: four rows. */
2083  LD_SB4(src, src_stride, src0, src1, src2, src3);
2084  src += (4 * src_stride);
2085 
2086  XORI_B4_128_SB(src0, src1, src2, src3);
2087  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2088  filt1, out0, out1, out2, out3);
2089  SRARI_H4_SH(out0, out1, out2, out3, 6);
2090  SAT_SH4_SH(out0, out1, out2, out3, 7);
2091  out4 = PCKEV_XORI128_UB(out0, out1);
2092  out5 = PCKEV_XORI128_UB(out2, out3);
  /* Word elements 0 and 2, then halfword elements 2 and 6, of each vector. */
2093  ST_W2(out4, 0, 2, dst, dst_stride);
2094  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097  dst += (4 * dst_stride);
2098 
  /* Batch 2: next four rows, same sequence as batch 1. */
2099  LD_SB4(src, src_stride, src0, src1, src2, src3);
2100  src += (4 * src_stride);
2101 
2102  XORI_B4_128_SB(src0, src1, src2, src3);
2103  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2104  filt1, out0, out1, out2, out3);
2105  SRARI_H4_SH(out0, out1, out2, out3, 6);
2106  SAT_SH4_SH(out0, out1, out2, out3, 7);
2107  out4 = PCKEV_XORI128_UB(out0, out1);
2108  out5 = PCKEV_XORI128_UB(out2, out3);
2109  ST_W2(out4, 0, 2, dst, dst_stride);
2110  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2113 }
2114 
/*
 * Horizontal 4-tap filter for 8-pixel-wide blocks, two rows per iteration.
 *
 * Runs height >> 1 iterations; an odd trailing row, if any, is not
 * processed.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; each iteration stores two 64-bit elements
 *                  via ST_D2 and advances dst by two rows.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 * height:          number of rows to produce.
 */
2115 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2116  uint8_t *dst, int32_t dst_stride,
2117  const int8_t *filter, int32_t height)
2118 {
2119  uint32_t loop_cnt;
2120  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2121  v16u8 out;
2122  v8i16 filt, vec0, vec1, vec2, vec3;
2123 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2124  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2125  src -= 1;
2126 
2127  filt = LD_SH(filter);
2128  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2129 
2130  mask1 = mask0 + 2;
2131 
2132  for (loop_cnt = (height >> 1); loop_cnt--;) {
2133  LD_SB2(src, src_stride, src0, src1);
2134  src += (2 * src_stride);
2135 
2136  XORI_B2_128_SB(src0, src1);
  /* vec0/vec1 first hold the shuffled bytes, then the dot products. */
2137  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
  /* Accumulate the filt1 products onto the filt0 products. */
2140  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2141  SRARI_H2_SH(vec0, vec1, 6);
2142  SAT_SH2_SH(vec0, vec1, 7);
2143  out = PCKEV_XORI128_UB(vec0, vec1);
2144  ST_D2(out, 0, 1, dst, dst_stride);
2145  dst += (2 * dst_stride);
2146  }
2147 }
2148 
/*
 * Horizontal 4-tap filter for 8-pixel-wide blocks, four rows per iteration.
 *
 * Runs height >> 2 iterations; up to three trailing rows are not processed
 * when height is not a multiple of four.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; each iteration stores four 64-bit elements
 *                  via ST_D4 and advances dst by four rows.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 * height:          number of rows to produce.
 */
2149 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2150  uint8_t *dst, int32_t dst_stride,
2151  const int8_t *filter, int32_t height)
2152 {
2153  uint32_t loop_cnt;
2154  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2155  v16u8 tmp0, tmp1;
2156  v8i16 filt, out0, out1, out2, out3;
2157 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2158  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2159  src -= 1;
2160 
2161  /* rearranging filter */
2162  filt = LD_SH(filter);
2163  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2164 
2165  mask1 = mask0 + 2;
2166 
2167  for (loop_cnt = (height >> 2); loop_cnt--;) {
2168  LD_SB4(src, src_stride, src0, src1, src2, src3);
2169  src += (4 * src_stride);
2170 
2171  XORI_B4_128_SB(src0, src1, src2, src3);
  /* One result vector per input row (out0..out3). */
2172  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2173  filt1, out0, out1, out2, out3);
2174  SRARI_H4_SH(out0, out1, out2, out3, 6);
2175  SAT_SH4_SH(out0, out1, out2, out3, 7);
2176  tmp0 = PCKEV_XORI128_UB(out0, out1);
2177  tmp1 = PCKEV_XORI128_UB(out2, out3);
2178  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179  dst += (4 * dst_stride);
2180  }
2181 }
2182 
/*
 * Horizontal 4-tap filter entry point for 8-pixel-wide blocks.
 *
 * Heights 2 and 6 are handled by the kernel that consumes two rows per
 * iteration; every other height goes to the kernel that consumes four rows
 * per iteration. All arguments are forwarded unchanged.
 */
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 != height) && (6 != height)) {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
        return;
    }

    common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                             height);
}
2195 
/*
 * Horizontal 4-tap filter for 12-pixel-wide blocks.
 *
 * NOTE: `height` is never read. The loop always runs four iterations of
 * four rows each (16 rows in total).
 *
 * Each iteration is split in two parts: one result is stored as 32-bit
 * words at dst + 8, the other as 64-bit elements at dst.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination rows.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
2196 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2197  uint8_t *dst, int32_t dst_stride,
2198  const int8_t *filter, int32_t height)
2199 {
2200  uint32_t loop_cnt;
2201  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2203  v16i8 vec10, vec11;
2204  v16u8 tmp0, tmp1;
2205  v8i16 filt, out0, out1, out2, out3, out4, out5;
2206 
  /* mask0: "8 width cases" row; mask2: second "4 width cases" row, whose
   * indices start at byte 8 of each source vector. */
2207  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2208  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2209 
2210  src -= 1;
2211 
2212  /* rearranging filter */
2213  filt = LD_SH(filter);
2214  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2215 
2216  mask1 = mask0 + 2;
2217  mask3 = mask2 + 2;
2218 
2219  for (loop_cnt = 4; loop_cnt--;) {
2220  LD_SB4(src, src_stride, src0, src1, src2, src3);
2221  src += (4 * src_stride);
2222 
2223  XORI_B4_128_SB(src0, src1, src2, src3);
  /* Part stored at dst + 8: rows are shuffled in pairs with mask2/mask3. */
2224  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2227  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2228  SRARI_H2_SH(out0, out1, 6);
2229  SAT_SH2_SH(out0, out1, 7);
2230  tmp0 = PCKEV_XORI128_UB(out0, out1);
2231  ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2232 
  /* Part stored at dst: each row is shuffled on its own with mask0/mask1. */
2233  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236  out2, out3, out4, out5);
2237  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240  out2, out3, out4, out5);
2241  SRARI_H4_SH(out2, out3, out4, out5, 6);
2242  SAT_SH4_SH(out2, out3, out4, out5, 7);
2243  tmp0 = PCKEV_XORI128_UB(out2, out3);
2244  tmp1 = PCKEV_XORI128_UB(out4, out5);
2245  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246  dst += (4 * dst_stride);
2247  }
2248 }
2249 
/*
 * Horizontal 4-tap filter for 16-pixel-wide blocks, four rows per iteration.
 *
 * Runs height >> 2 iterations; up to three trailing rows are not processed
 * when height is not a multiple of four.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; one 16-byte vector is stored per row.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 * height:          number of rows to produce.
 */
2250 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2251  uint8_t *dst, int32_t dst_stride,
2252  const int8_t *filter, int32_t height)
2253 {
2254  uint32_t loop_cnt;
2255  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2256  v16i8 filt0, filt1, mask0, mask1;
2257  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2259  v16u8 out;
2260 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2261  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2262  src -= 1;
2263 
2264  /* rearranging filter */
2265  filt = LD_SH(filter);
2266  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2267 
2268  mask1 = mask0 + 2;
2269 
2270  for (loop_cnt = (height >> 2); loop_cnt--;) {
  /* Even-numbered vectors are loaded at src, odd-numbered ones at src + 8;
   * (src0, src1) belong to the first row, (src2, src3) to the second, etc. */
2271  LD_SB4(src, src_stride, src0, src2, src4, src6);
2272  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273  src += (4 * src_stride);
2274 
2275  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2276 
  /* Rows 0 and 1 of this iteration. */
2277  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280  out0, out1, out2, out3);
2281  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284  out0, out1, out2, out3);
2285  SRARI_H4_SH(out0, out1, out2, out3, 6);
2286  SAT_SH4_SH(out0, out1, out2, out3, 7);
2287  out = PCKEV_XORI128_UB(out0, out1);
2288  ST_UB(out, dst);
2289  dst += dst_stride;
2290  out = PCKEV_XORI128_UB(out2, out3);
2291  ST_UB(out, dst);
2292  dst += dst_stride;
2293 
  /* Rows 2 and 3 of this iteration, same sequence. */
2294  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297  out4, out5, out6, out7);
2298  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301  out4, out5, out6, out7);
2302  SRARI_H4_SH(out4, out5, out6, out7, 6);
2303  SAT_SH4_SH(out4, out5, out6, out7, 7);
2304  out = PCKEV_XORI128_UB(out4, out5);
2305  ST_UB(out, dst);
2306  dst += dst_stride;
2307  out = PCKEV_XORI128_UB(out6, out7);
2308  ST_UB(out, dst);
2309  dst += dst_stride;
2310  }
2311 }
2312 
/*
 * Horizontal 4-tap filter for 24-pixel-wide blocks.
 *
 * NOTE: `height` is never read. The loop always runs eight iterations of
 * four rows each (32 rows in total).
 *
 * Per iteration, four 16-byte vectors are stored row by row at dst, and a
 * further four 64-bit elements are stored through dst1 (= dst + 16).
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination rows.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
2313 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2314  uint8_t *dst, int32_t dst_stride,
2315  const int8_t *filter, int32_t height)
2316 {
2317  uint8_t *dst1 = dst + 16;
2318  uint32_t loop_cnt;
2319  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2320  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322  v8i16 filt, out0, out1, out2, out3;
2323  v16u8 tmp0, tmp1;
2324 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2325  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2326  src -= 1;
2327 
2328  /* rearranging filter */
2329  filt = LD_SH(filter);
2330  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2331 
  /* mask00/mask11 are mask0/mask1 with every index advanced by 8; they are
   * applied to a (left, right) vector pair below. */
2332  mask1 = mask0 + 2;
2333  mask00 = mask0 + 8;
2334  mask11 = mask0 + 10;
2335 
2336  for (loop_cnt = 8; loop_cnt--;) {
  /* Even-numbered vectors are loaded at src, odd-numbered ones at src + 16. */
2337  LD_SB4(src, src_stride, src0, src2, src4, src6);
2338  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339  src += (4 * src_stride);
2340 
2341  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  /* 16-byte stores at dst for the first two rows of this iteration. */
2342  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347  out0, out1, out2, out3);
2348  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349  out0, out1, out2, out3);
2350  SRARI_H4_SH(out0, out1, out2, out3, 6);
2351  SAT_SH4_SH(out0, out1, out2, out3, 7);
2352  tmp0 = PCKEV_XORI128_UB(out0, out1);
2353  ST_UB(tmp0, dst);
2354  dst += dst_stride;
2355  tmp0 = PCKEV_XORI128_UB(out2, out3);
2356  ST_UB(tmp0, dst);
2357  dst += dst_stride;
2358 
  /* 16-byte stores at dst for the last two rows of this iteration. */
2359  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364  out0, out1, out2, out3);
2365  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366  out0, out1, out2, out3);
2367  SRARI_H4_SH(out0, out1, out2, out3, 6);
2368  SAT_SH4_SH(out0, out1, out2, out3, 7);
2369  tmp0 = PCKEV_XORI128_UB(out0, out1);
2370  ST_UB(tmp0, dst);
2371  dst += dst_stride;
2372  tmp0 = PCKEV_XORI128_UB(out2, out3);
2373  ST_UB(tmp0, dst);
2374  dst += dst_stride;
2375 
2376  /* 8 width */
  /* Uses only the vectors loaded at src + 16; results go through dst1. */
2377  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2381 
2382  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383  out0, out1, out2, out3);
2384  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385  out0, out1, out2, out3);
2386 
2387  SRARI_H4_SH(out0, out1, out2, out3, 6);
2388  SAT_SH4_SH(out0, out1, out2, out3, 7);
2389  tmp0 = PCKEV_XORI128_UB(out0, out1);
2390  tmp1 = PCKEV_XORI128_UB(out2, out3);
2391  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392  dst1 += (4 * dst_stride);
2393  }
2394 }
2395 
/*
 * Horizontal 4-tap filter for 32-pixel-wide blocks, two rows per iteration.
 *
 * Runs height >> 1 iterations; an odd trailing row, if any, is not
 * processed.
 *
 * src, src_stride: source rows; loading starts one byte to the left of src.
 * dst, dst_stride: destination; two 16-byte vectors are stored per row, at
 *                  dst and dst + 16.
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 * height:          number of rows to produce.
 */
2396 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2397  uint8_t *dst, int32_t dst_stride,
2398  const int8_t *filter, int32_t height)
2399 {
2400  uint32_t loop_cnt;
2401  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2402  v16i8 filt0, filt1, mask0, mask1;
2403  v16u8 out;
2404  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2406 
  /* Offset 0 is the "8 width cases" row of the shuffle table. */
2407  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2408  src -= 1;
2409 
2410  /* rearranging filter */
2411  filt = LD_SH(filter);
2412  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2413 
2414  mask1 = mask0 + 2;
2415 
2416  for (loop_cnt = (height >> 1); loop_cnt--;) {
  /* First row: four overlapping vector loads, 8 bytes apart. */
2417  src0 = LD_SB(src);
2418  src1 = LD_SB(src + 8);
2419  src2 = LD_SB(src + 16);
2420  src3 = LD_SB(src + 24);
2421  src += src_stride;
  /* Second row: same four loads. */
2422  src4 = LD_SB(src);
2423  src5 = LD_SB(src + 8);
2424  src6 = LD_SB(src + 16);
2425  src7 = LD_SB(src + 24);
2426  src += src_stride;
2427 
2428  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2429 
  /* First row -> out0..out3. */
2430  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433  out0, out1, out2, out3);
2434  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437  out0, out1, out2, out3);
2438 
  /* Second row -> out4..out7. */
2439  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442  out4, out5, out6, out7);
2443  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446  out4, out5, out6, out7);
2447  SRARI_H4_SH(out0, out1, out2, out3, 6);
2448  SRARI_H4_SH(out4, out5, out6, out7, 6);
2449  SAT_SH4_SH(out0, out1, out2, out3, 7);
2450  SAT_SH4_SH(out4, out5, out6, out7, 7);
2451  out = PCKEV_XORI128_UB(out0, out1);
2452  ST_UB(out, dst);
2453  out = PCKEV_XORI128_UB(out2, out3);
2454  ST_UB(out, dst + 16);
2455  dst += dst_stride;
2456  out = PCKEV_XORI128_UB(out4, out5);
2457  ST_UB(out, dst);
2458  out = PCKEV_XORI128_UB(out6, out7);
2459  ST_UB(out, dst + 16);
2460  dst += dst_stride;
2461  }
2462 }
2463 
/*
 * Vertical 4-tap filter for a block 4 pixels wide and 2 rows high.
 *
 * Five source rows are read in total, starting one row above src.
 *
 * src, src_stride: source rows.
 * dst, dst_stride: destination; two 32-bit words of the result vector are
 *                  stored via ST_W2 (presumably one word per output row).
 * filter:          coefficients; loaded as halfwords, elements 0 and 1 are
 *                  splatted into filt0 and filt1.
 */
2464 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2465  uint8_t *dst, int32_t dst_stride,
2466  const int8_t *filter)
2467 {
2468  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469  v16i8 src2110, src4332, filt0, filt1;
2470  v16u8 out;
2471  v8i16 filt, out10;
2472 
  /* Start one row above the first output row. */
2473  src -= src_stride;
2474 
2475  filt = LD_SH(filter);
2476  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2477 
2478  LD_SB3(src, src_stride, src0, src1, src2);
2479  src += (3 * src_stride);
2480 
  /* Interleave adjacent rows (1,0) and (2,1), then join the two results
   * into one vector and XOR every byte with 128. */
2481  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484  LD_SB2(src, src_stride, src3, src4);
  /* Same for row pairs (3,2) and (4,3). */
2485  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2488  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  /* Rounding right shift by 6, then saturation with bit-position argument 7. */
2489  out10 = __msa_srari_h(out10, 6);
2490  out10 = __msa_sat_s_h(out10, 7);
2491  out = PCKEV_XORI128_UB(out10, out10);
2492  ST_W2(out, 0, 1, dst, dst_stride);
2493 }
2494 
2496  uint8_t *dst, int32_t dst_stride,
2497  const int8_t *filter, int32_t height)
2498 {
2499  uint32_t loop_cnt;
2500  v16i8 src0, src1, src2, src3, src4, src5;
2501  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502  v16i8 src2110, src4332, filt0, filt1;
2503  v8i16 filt, out10, out32;
2504  v16u8 out;
2505 
2506  src -= src_stride;
2507 
2508  filt = LD_SH(filter);
2509  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2510 
2511  LD_SB3(src, src_stride, src0, src1, src2);
2512  src += (3 * src_stride);
2513 
2514  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2515 
2516  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2518 
2519  for (loop_cnt = (height >> 2); loop_cnt--;) {
2520  LD_SB3(src, src_stride, src3, src4, src5);
2521  src += (3 * src_stride);
2522  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2525  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2526 
2527  src2 = LD_SB(src);
2528  src += (src_stride);
2529  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2533  SRARI_H2_SH(out10, out32, 6);
2534  SAT_SH2_SH(out10, out32, 7);
2535  out = PCKEV_XORI128_UB(out10, out32);
2536  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537  dst += (4 * dst_stride);
2538  }
2539 }
2540 
/*
 * Vertical 4-tap filter entry point for 4-pixel-wide blocks.
 *
 * A height of 2 uses the dedicated 4x2 kernel; every other height is handed
 * to the kernel that works in groups of four rows. Arguments are forwarded
 * unchanged.
 */
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 != height) {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
        return;
    }

    common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
}
2552 
/*
 * Vertical 4-tap filter for a 6-pixel-wide block, fully unrolled.
 *
 * The body reads 11 source rows (3 + 2 + 2 + 2 + 2, starting one row above
 * src) and writes 8 destination rows in two groups of four. The height
 * argument is never read, so the row count is fixed regardless of the value
 * passed in.
 *
 * Each output row is stored as one 32-bit element at dst (ST_W2) plus one
 * 16-bit element at dst + 4 (ST_H2), i.e. 4 + 2 bytes per row.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2553 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2554  uint8_t *dst, int32_t dst_stride,
2555  const int8_t *filter, int32_t height)
2556 {
2557  v16u8 out0, out1;
2558  v16i8 src0, src1, src2, src3, src4, src5, src6;
2559  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2561 
      /* start one row above the first output row */
2562  src -= src_stride;
2563 
2564  filter_vec = LD_SH(filter);
2565  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2566 
      /* ---- output rows 0..3 ---- */
2567  LD_SB3(src, src_stride, src0, src1, src2);
2568  src += (3 * src_stride);
2569  XORI_B3_128_SB(src0, src1, src2);
2570  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2571 
2572  LD_SB2(src, src_stride, src3, src4);
2573  src += (2 * src_stride);
2574  XORI_B2_128_SB(src3, src4);
2575  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576 
2577  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2578  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2579 
2580  LD_SB2(src, src_stride, src5, src6);
2581  src += (2 * src_stride);
2582  XORI_B2_128_SB(src5, src6);
2583  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2584 
2585  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2586  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2587 
      /* rounded shift by 6, saturate, pack two rows per vector */
2588  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2589  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2590  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2591  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
      /* word elements 0/2 and halfword elements 2/6 pick the same two rows
       * out of each packed vector */
2592  ST_W2(out0, 0, 2, dst, dst_stride);
2593  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596  dst += (4 * dst_stride);
2597 
      /* ---- output rows 4..7 ----
       * src3/src4 are reused for the next two source rows, so src32_r and
       * src43_r now describe newer rows than src54_r/src65_r; that is why
       * the operand order of the next two filter calls is swapped. */
2598  LD_SB2(src, src_stride, src3, src4);
2599  src += (2 * src_stride);
2600  XORI_B2_128_SB(src3, src4);
2601  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2602 
2603  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2604  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2605 
2606  LD_SB2(src, src_stride, src5, src6);
2607  src += (2 * src_stride);
2608  XORI_B2_128_SB(src5, src6);
2609  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2610 
2611  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2613 
2614  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618  ST_W2(out0, 0, 2, dst, dst_stride);
2619  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2622 }
2623 
/*
 * Vertical 4-tap filter for an 8-pixel-wide, 2-row block.
 *
 * Reads five source rows starting one row above src and stores two
 * 64-bit elements (ST_D2), one per output row, at dst and dst + dst_stride.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2624 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2625  uint8_t *dst, int32_t dst_stride,
2626  const int8_t *filter)
2627 {
2628  v16i8 src0, src1, src2, src3, src4;
2629  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2630  v16u8 out;
2631 
      /* start one row above the first output row */
2632  src -= src_stride;
2633 
2634  /* rearranging filter_y */
2635  filt = LD_SH(filter);
2636  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2637 
2638  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2639  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* output row 0 uses source rows 0..3, output row 1 uses rows 1..4 */
2640  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2641  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2642  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2643  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
      /* rounded shift by 6, saturate, pack back to unsigned bytes */
2644  SRARI_H2_SH(tmp0, tmp1, 6);
2645  SAT_SH2_SH(tmp0, tmp1, 7);
2646  out = PCKEV_XORI128_UB(tmp0, tmp1);
2647  ST_D2(out, 0, 1, dst, dst_stride);
2648 }
2649 
/*
 * Vertical 4-tap filter for an 8-pixel-wide, 6-row block.
 *
 * Runs two iterations of three output rows each. Three source rows are
 * loaded up front (starting one row above src) and three more per
 * iteration, nine rows in total. Each output row is written as one 64-bit
 * store.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2650 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2651  uint8_t *dst, int32_t dst_stride,
2652  const int8_t *filter)
2653 {
2654  uint32_t loop_cnt;
2655  uint64_t out0, out1, out2;
2656  v16i8 src0, src1, src2, src3, src4, src5;
2657  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658  v8i16 filt, filt0, filt1;
2659 
      /* start one row above the first output row */
2660  src -= src_stride;
2661 
2662  /* rearranging filter_y */
2663  filt = LD_SH(filter);
2664  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2665 
2666  LD_SB3(src, src_stride, src0, src1, src2);
2667  src += (3 * src_stride);
2668 
2669  XORI_B3_128_SB(src0, src1, src2);
      /* vec0 = rows (1,0), vec2 = rows (2,1) interleaved */
2670  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2671 
2672  for (loop_cnt = 2; loop_cnt--;) {
2673  LD_SB3(src, src_stride, src3, src4, src5);
2674  src += (3 * src_stride);
2675 
2676  XORI_B3_128_SB(src3, src4, src5);
2677  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2678  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2681  SRARI_H2_SH(tmp0, tmp1, 6);
2682  tmp2 = __msa_srari_h(tmp2, 6);
2683  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
      /* after packing, tmp0 holds rows 0 and 1, tmp2 holds row 2 */
2684  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2685  XORI_B2_128_SH(tmp0, tmp2);
2686 
2687  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2690  SD(out0, dst);
2691  dst += dst_stride;
2692  SD(out1, dst);
2693  dst += dst_stride;
2694  SD(out2, dst);
2695  dst += dst_stride;
2696 
      /* slide the window: the last loaded row and the two newest row
       * pairs seed the next iteration */
2697  src2 = src5;
2698  vec0 = vec3;
2699  vec2 = vec4;
2700  }
2701 }
2702 
/*
 * Vertical 4-tap filter for an 8-pixel-wide block, four output rows per
 * loop iteration.
 *
 * The loop runs height >> 2 times, so only (height >> 2) * 4 rows are
 * written. Three source rows are loaded up front (starting one row above
 * src) and four more per iteration. ST_D4 stores four 64-bit elements, one
 * per output row.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2703 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2704  uint8_t *dst, int32_t dst_stride,
2705  const int8_t *filter, int32_t height)
2706 {
2707  uint32_t loop_cnt;
2708  v16i8 src0, src1, src2, src7, src8, src9, src10;
2709  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2710  v16u8 tmp0, tmp1;
2711  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2712 
      /* start one row above the first output row */
2713  src -= src_stride;
2714 
2715  filt = LD_SH(filter);
2716  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2717 
2718  LD_SB3(src, src_stride, src0, src1, src2);
2719  src += (3 * src_stride);
2720 
2721  XORI_B3_128_SB(src0, src1, src2);
2722  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723 
2724  for (loop_cnt = (height >> 2); loop_cnt--;) {
      /* src7..src10 are the four rows that follow src2 */
2725  LD_SB4(src, src_stride, src7, src8, src9, src10);
2726  src += (4 * src_stride);
2727 
2728  XORI_B4_128_SB(src7, src8, src9, src10);
2729  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730  src72_r, src87_r, src98_r, src109_r);
2731  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2732  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2733  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2734  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
      /* rounded shift by 6, saturate, pack back to unsigned bytes */
2735  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2736  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2737  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2738  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2739  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740  dst += (4 * dst_stride);
2741 
      /* the two newest row pairs and the last row seed the next pass */
2742  src10_r = src98_r;
2743  src21_r = src109_r;
2744  src2 = src10;
2745  }
2746 }
2747 
/*
 * Height dispatcher for the 8-wide vertical 4-tap filter: heights 2 and 6
 * have dedicated kernels, every other height is handled by the kernel that
 * emits rows in groups of four.
 */
2748 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2749  uint8_t *dst, int32_t dst_stride,
2750  const int8_t *filter, int32_t height)
2751 {
2752  if ((height != 2) && (height != 6)) {
2753  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2754  filter, height);
2755  } else if (height == 6) {
2756  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2757  } else {
2758  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2759  }
2760 }
2761 
/*
 * Vertical 4-tap filter for a 12-pixel-wide block.
 *
 * The loop count is the constant 4 and each iteration writes four rows, so
 * 16 rows are always produced; the height argument is never read.
 *
 * Columns are split in two: the "_r" vectors (ILVR) feed 64-bit stores at
 * dst (ST_D4), while the "_l" vectors (ILVL) of two consecutive row pairs
 * are joined with __msa_ilvr_d into src2110/src4332/src6554 and feed 32-bit
 * stores at dst + 8 (ST_W4). That is 8 + 4 bytes per row.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2762 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2763  uint8_t *dst, int32_t dst_stride,
2764  const int8_t *filter, int32_t height)
2765 {
2766  uint32_t loop_cnt;
2767  v16i8 src0, src1, src2, src3, src4, src5, src6;
2768  v16u8 out0, out1;
2769  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771  v16i8 src2110, src4332, src6554;
2772  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2773  v8i16 filter_vec;
2774 
      /* start one row above the first output row */
2775  src -= (1 * src_stride);
2776 
2777  filter_vec = LD_SH(filter);
2778  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2779 
2780  LD_SB3(src, src_stride, src0, src1, src2);
2781  src += (3 * src_stride);
2782 
2783  XORI_B3_128_SB(src0, src1, src2);
2784  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2785  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2786  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2787 
2788  for (loop_cnt = 4; loop_cnt--;) {
2789  LD_SB4(src, src_stride, src3, src4, src5, src6);
2790  src += (4 * src_stride);
2791 
2792  XORI_B4_128_SB(src3, src4, src5, src6);
2793  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2799 
      /* dstN_r: one row each; dstN_l: two rows each */
2800  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2801  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2802  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2803  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2804  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2805  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2806 
      /* rounded shift by 6, saturate, pack back to unsigned bytes */
2807  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2808  SRARI_H2_SH(dst0_l, dst1_l, 6);
2809  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2810  SAT_SH2_SH(dst0_l, dst1_l, 7);
2811  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2812  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2813  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2814  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2815  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816  dst += (4 * dst_stride);
2817 
      /* carry the newest rows into the next iteration */
2818  src2 = src6;
2819  src10_r = src54_r;
2820  src21_r = src65_r;
2821  src2110 = src6554;
2822  }
2823 }
2824 
/*
 * Vertical 4-tap filter for a 16-pixel-wide block, four output rows per
 * loop iteration.
 *
 * The loop runs height >> 2 times, so only (height >> 2) * 4 rows are
 * written. Each source row is split into "_r" (ILVR) and "_l" (ILVL)
 * interleaved halves that are filtered separately and re-packed into one
 * full vector per output row (ST_UB4).
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2825 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2826  uint8_t *dst, int32_t dst_stride,
2827  const int8_t *filter, int32_t height)
2828 {
2829  uint32_t loop_cnt;
2830  v16i8 src0, src1, src2, src3, src4, src5, src6;
2831  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2832  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2833  v16u8 tmp0, tmp1, tmp2, tmp3;
2834  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2835 
      /* start one row above the first output row */
2836  src -= src_stride;
2837 
2838  filt = LD_SH(filter);
2839  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2840 
2841  LD_SB3(src, src_stride, src0, src1, src2);
2842  src += (3 * src_stride);
2843 
2844  XORI_B3_128_SB(src0, src1, src2);
2845  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2846  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2847 
2848  for (loop_cnt = (height >> 2); loop_cnt--;) {
2849  LD_SB4(src, src_stride, src3, src4, src5, src6);
2850  src += (4 * src_stride);
2851 
2852  XORI_B4_128_SB(src3, src4, src5, src6);
2853  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2854  src32_r, src43_r, src54_r, src65_r);
2855  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2856  src32_l, src43_l, src54_l, src65_l);
2857  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2858  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2859  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2860  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2861  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2862  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2863  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2864  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
      /* rounded shift by 6, saturate, merge both halves per row */
2865  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2866  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2867  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2868  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2869  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2870  out3_r, tmp0, tmp1, tmp2, tmp3);
2871  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2872  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2873  dst += (4 * dst_stride);
2874 
      /* carry the newest rows into the next iteration */
2875  src10_r = src54_r;
2876  src21_r = src65_r;
2877  src10_l = src54_l;
2878  src21_l = src65_l;
2879  src2 = src6;
2880  }
2881 }
2882 
/*
 * Vertical 4-tap filter for a 24-pixel-wide block.
 *
 * The loop count is the constant 8 and each iteration writes four rows (two
 * in each half of the loop body), so 32 rows are always produced; the
 * height argument is never read.
 *
 * Every row is handled as a 16-byte part at src/dst (both "_r" and "_l"
 * halves) plus an 8-byte part at src + 16 / dst + 16 ("_r" half only).
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2883 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2884  uint8_t *dst, int32_t dst_stride,
2885  const int8_t *filter, int32_t height)
2886 {
2887  uint32_t loop_cnt;
2888  uint64_t out0, out1;
2889  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2890  v16i8 src11, filt0, filt1;
2891  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2892  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2893  v16u8 out;
2894  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2895 
      /* start one row above the first output row */
2896  src -= src_stride;
2897 
2898  filt = LD_SH(filter);
2899  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2900 
2901  /* 16 width */
2902  LD_SB3(src, src_stride, src0, src1, src2);
2903  XORI_B3_128_SB(src0, src1, src2);
2904  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2905  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2906 
2907  /* 8 width */
2908  LD_SB3(src + 16, src_stride, src6, src7, src8);
2909  src += (3 * src_stride);
2910  XORI_B3_128_SB(src6, src7, src8);
2911  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2912 
2913  for (loop_cnt = 8; loop_cnt--;) {
      /* ---- first two rows of this iteration ---- */
2914  /* 16 width */
2915  LD_SB2(src, src_stride, src3, src4);
2916  XORI_B2_128_SB(src3, src4);
2917  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2919 
2920  /* 8 width */
2921  LD_SB2(src + 16, src_stride, src9, src10);
2922  src += (2 * src_stride);
2923  XORI_B2_128_SB(src9, src10);
2924  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2925 
2926  /* 16 width */
2927  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2928  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2929  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2930  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2931 
2932  /* 8 width */
2933  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2934  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2935 
2936  /* 16 + 8 width */
2937  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2938  SRARI_H2_SH(out0_l, out1_l, 6);
2939  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2940  SAT_SH2_SH(out0_l, out1_l, 7);
2941  out = PCKEV_XORI128_UB(out0_r, out0_l);
2942  ST_UB(out, dst);
      /* the 8-wide part is extracted as a 64-bit scalar and stored with SD */
2943  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2944  XORI_B2_128_SH(out2_r, out3_r);
2945  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2946  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2947  SD(out0, dst + 16);
2948  dst += dst_stride;
2949  out = PCKEV_XORI128_UB(out1_r, out1_l);
2950  ST_UB(out, dst);
2951  SD(out1, dst + 16);
2952  dst += dst_stride;
2953 
      /* ---- last two rows of this iteration ----
       * The new rows land in src5/src2 and src11/src8, so src2 and src8
       * already hold the "previous row" for the next iteration and the
       * 10/21/76/87 vectors are rebuilt in place. They now describe newer
       * rows than the 32/43/98/109 vectors, hence the swapped operand
       * order in the filter calls below. */
2954  /* 16 width */
2955  LD_SB2(src, src_stride, src5, src2);
2956  XORI_B2_128_SB(src5, src2);
2957  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2958  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2959 
2960  /* 8 width */
2961  LD_SB2(src + 16, src_stride, src11, src8);
2962  src += (2 * src_stride);
2963  XORI_B2_128_SB(src11, src8);
2964  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2965 
2966  /* 16 width */
2967  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2968  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2969  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2970  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2971 
2972  /* 8 width */
2973  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2974  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2975 
2976  /* 16 + 8 width */
2977  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2978  SRARI_H2_SH(out0_l, out1_l, 6);
2979  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2980  SAT_SH2_SH(out0_l, out1_l, 7);
2981  out = PCKEV_XORI128_UB(out0_r, out0_l);
2982  ST_UB(out, dst);
2983  out = PCKEV_XORI128_UB(out2_r, out2_r);
2984  ST_D1(out, 0, dst + 16);
2985  dst += dst_stride;
2986  out = PCKEV_XORI128_UB(out1_r, out1_l);
2987  ST_UB(out, dst);
2988  out = PCKEV_XORI128_UB(out3_r, out3_r);
2989  ST_D1(out, 0, dst + 16);
2990  dst += dst_stride;
2991  }
2992 }
2993 
/*
 * Vertical 4-tap filter for a 32-pixel-wide block, two output rows per loop
 * iteration.
 *
 * The loop runs height >> 1 times, so an odd final row is not written.
 * Each row is processed as two independent 16-byte halves: the first at
 * src/dst (rows src0..src4), the second at src + 16 / dst + 16 (rows
 * src6..src10). Both halves keep their own sliding window.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
2994 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2995  uint8_t *dst, int32_t dst_stride,
2996  const int8_t *filter, int32_t height)
2997 {
2998  uint32_t loop_cnt;
2999  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3000  v16i8 src10_r, src32_r, src76_r, src98_r;
3001  v16i8 src21_r, src43_r, src87_r, src109_r;
3002  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3003  v16i8 src10_l, src32_l, src76_l, src98_l;
3004  v16i8 src21_l, src43_l, src87_l, src109_l;
3005  v8i16 filt;
3006  v16i8 filt0, filt1;
3007  v16u8 out;
3008 
      /* start one row above the first output row */
3009  src -= src_stride;
3010 
3011  filt = LD_SH(filter);
3012  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3013 
3014  /* 16 width */
3015  LD_SB3(src, src_stride, src0, src1, src2);
3016  XORI_B3_128_SB(src0, src1, src2);
3017 
3018  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3019  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3020 
3021  /* next 16 width */
3022  LD_SB3(src + 16, src_stride, src6, src7, src8);
3023  src += (3 * src_stride);
3024 
3025  XORI_B3_128_SB(src6, src7, src8);
3026  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3027  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3028 
3029  for (loop_cnt = (height >> 1); loop_cnt--;) {
3030  /* 16 width */
3031  LD_SB2(src, src_stride, src3, src4);
3032  XORI_B2_128_SB(src3, src4);
3033  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3034  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3035 
3036  /* 16 width */
3037  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3038  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3039  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3040  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3041 
3042  /* 16 width */
3043  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3044  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3045  out = PCKEV_XORI128_UB(out0_r, out0_l);
3046  ST_UB(out, dst);
3047  out = PCKEV_XORI128_UB(out1_r, out1_l);
3048  ST_UB(out, dst + dst_stride);
3049 
      /* slide the first half's window by two rows */
3050  src10_r = src32_r;
3051  src21_r = src43_r;
3052  src10_l = src32_l;
3053  src21_l = src43_l;
3054  src2 = src4;
3055 
3056  /* next 16 width */
3057  LD_SB2(src + 16, src_stride, src9, src10);
3058  src += (2 * src_stride);
3059  XORI_B2_128_SB(src9, src10);
3060  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3061  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3062 
3063  /* next 16 width */
3064  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3065  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3066  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3067  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3068 
3069  /* next 16 width */
3070  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3071  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3072  out = PCKEV_XORI128_UB(out2_r, out2_l);
3073  ST_UB(out, dst + 16);
3074  out = PCKEV_XORI128_UB(out3_r, out3_l);
3075  ST_UB(out, dst + 16 + dst_stride);
3076 
3077  dst += 2 * dst_stride;
3078 
      /* slide the second half's window by two rows */
3079  src76_r = src98_r;
3080  src87_r = src109_r;
3081  src76_l = src98_l;
3082  src87_l = src109_l;
3083  src8 = src10;
3084  }
3085 }
3086 
/*
 * Combined horizontal + vertical 4-tap filter for a 4-pixel-wide, 2-row
 * block.
 *
 * src, src_stride: source pixels; reading starts one row above and one
 *                  column left of src. Five rows are loaded.
 * dst, dst_stride: destination; ST_W2 stores two 32-bit elements, one per
 *                  output row.
 * filter_x:        horizontal coefficients; halfwords 0 and 1 are splatted
 *                  into filt0/filt1.
 * filter_y:        vertical coefficients; widened with UNPCK_R_SB_SH, then
 *                  words 0 and 1 are splatted into filt_h0/filt_h1.
 *
 * Two rows are shuffled into one vector per horizontal filter call, using
 * the shuffle mask at ff_hevc_mask_arr + 16 (the "4 width cases" row).
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
3087 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3088  int32_t src_stride,
3089  uint8_t *dst,
3090  int32_t dst_stride,
3091  const int8_t *filter_x,
3092  const int8_t *filter_y)
3093 {
3094  v16u8 out;
3095  v16i8 src0, src1, src2, src3, src4;
3096  v8i16 filt0, filt1;
3097  v8i16 filt_h0, filt_h1;
3098  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3099  v16i8 mask1;
3100  v8i16 filter_vec, tmp;
3101  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3102  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3103  v4i32 dst0, dst1;
3104 
      /* one row up, one column left */
3105  src -= (src_stride + 1);
3106 
3107  filter_vec = LD_SH(filter_x);
3108  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3109 
3110  filter_vec = LD_SH(filter_y);
3111  UNPCK_R_SB_SH(filter_vec, filter_vec);
3112 
3113  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3114 
      /* second shuffle mask: every index of mask0 advanced by two */
3115  mask1 = mask0 + 2;
3116 
3117  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3118  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3119 
      /* horizontal pass on row pairs (0,2), (1,3) and (2,4) */
3120  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3121  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3122  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3123 
3124  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3125  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3126  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3127 
      /* interleave the horizontal results of adjacent rows for the
       * vertical pass */
3128  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3129  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3130 
3131  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3132  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
      /* two-stage scaling: plain shift by 6 on the 32-bit sums, then a
       * rounded shift by 6 on the packed 16-bit values */
3133  dst0 >>= 6;
3134  dst1 >>= 6;
3135  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3136  tmp = __msa_srari_h(tmp, 6);
3137  tmp = __msa_sat_s_h(tmp, 7);
3138  out = PCKEV_XORI128_UB(tmp, tmp);
3139  ST_W2(out, 0, 1, dst, dst_stride);
3140 }
3141 
/*
 * Combined horizontal + vertical 4-tap filter for a 4-pixel-wide, 4-row
 * block.
 *
 * src, src_stride: source pixels; reading starts one row above and one
 *                  column left of src. Seven rows are loaded.
 * dst, dst_stride: destination; ST_W4 stores four 32-bit elements, one per
 *                  output row.
 * filter_x:        horizontal coefficients; halfwords 0 and 1 are splatted
 *                  into filt0/filt1.
 * filter_y:        vertical coefficients; widened with UNPCK_R_SB_SH, then
 *                  words 0 and 1 are splatted into filt_h0/filt_h1.
 *
 * Rows three apart are shuffled into one vector per horizontal filter call,
 * using the shuffle mask at ff_hevc_mask_arr + 16 (the "4 width cases" row).
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
3142 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3143  int32_t src_stride,
3144  uint8_t *dst,
3145  int32_t dst_stride,
3146  const int8_t *filter_x,
3147  const int8_t *filter_y)
3148 {
3149  v16u8 out;
3150  v16i8 src0, src1, src2, src3, src4, src5, src6;
3151  v8i16 filt0, filt1;
3152  v8i16 filt_h0, filt_h1;
3153  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3154  v16i8 mask1;
3155  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3156  v8i16 filter_vec, tmp0, tmp1;
3157  v8i16 dst30, dst41, dst52, dst63;
3158  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3159  v4i32 dst0, dst1, dst2, dst3;
3160 
      /* one row up, one column left */
3161  src -= (src_stride + 1);
3162 
3163  filter_vec = LD_SH(filter_x);
3164  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3165 
3166  filter_vec = LD_SH(filter_y);
3167  UNPCK_R_SB_SH(filter_vec, filter_vec);
3168 
3169  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3170 
      /* second shuffle mask: every index of mask0 advanced by two */
3171  mask1 = mask0 + 2;
3172 
3173  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3174  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3175 
      /* horizontal pass on row pairs (0,3), (1,4), (2,5) and (3,6) */
3176  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3177  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3178  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3179  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3180 
3181  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3182  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3183  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3184  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3185 
      /* interleave the horizontal results of adjacent rows for the
       * vertical pass */
3186  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3187  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3188  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3189  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3190  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3191  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3192  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
      /* two-stage scaling: shift by 6 on the 32-bit sums, then a rounded
       * shift by 6 on the packed 16-bit values */
3193  SRA_4V(dst0, dst1, dst2, dst3, 6);
3194  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3195  SRARI_H2_SH(tmp0, tmp1, 6);
3196  SAT_SH2_SH(tmp0, tmp1, 7);
3197  out = PCKEV_XORI128_UB(tmp0, tmp1);
3198  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3199 }
3200 
/*
 * Combined horizontal + vertical 4-tap filter for a 4-pixel-wide block,
 * eight output rows per loop iteration.
 *
 * src, src_stride: source pixels; reading starts one row above and one
 *                  column left of src. Three rows are loaded up front and
 *                  eight more per iteration.
 * dst, dst_stride: destination; ST_W8 stores eight 32-bit elements, one per
 *                  output row.
 * filter_x:        horizontal coefficients; halfwords 0 and 1 are splatted
 *                  into filt0/filt1.
 * filter_y:        vertical coefficients; widened with UNPCK_R_SB_SH, then
 *                  words 0 and 1 are splatted into filt_h0/filt_h1.
 * height:          number of output rows; the loop runs height >> 3 times,
 *                  so only (height >> 3) * 8 rows are written.
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
3201 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3202  int32_t src_stride,
3203  uint8_t *dst,
3204  int32_t dst_stride,
3205  const int8_t *filter_x,
3206  const int8_t *filter_y,
3207  int32_t height)
3208 {
3209  uint32_t loop_cnt;
3210  v16u8 out0, out1;
3211  v16i8 src0, src1, src2, src3, src4, src5;
3212  v16i8 src6, src7, src8, src9, src10;
3213  v8i16 filt0, filt1;
3214  v8i16 filt_h0, filt_h1;
3215  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3216  v16i8 mask1;
3217  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3218  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3219  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3220  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3221  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3222  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3223  v8i16 dst98_r, dst109_r;
3224 
      /* one row up, one column left */
3225  src -= (src_stride + 1);
3226 
3227  filter_vec = LD_SH(filter_x);
3228  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3229 
3230  filter_vec = LD_SH(filter_y);
3231  UNPCK_R_SB_SH(filter_vec, filter_vec);
3232 
3233  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3234 
      /* second shuffle mask: every index of mask0 advanced by two */
3235  mask1 = mask0 + 2;
3236 
      /* prologue: horizontal pass on the first three rows */
3237  LD_SB3(src, src_stride, src0, src1, src2);
3238  src += (3 * src_stride);
3239 
3240  XORI_B3_128_SB(src0, src1, src2);
3241 
3242  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3243  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3244  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3245  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3246  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
      /* dst22: upper 64 bits of dst21 replicated, i.e. the result for the
       * second row of the (1,2) pair */
3247  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3248 
3249  for (loop_cnt = height >> 3; loop_cnt--;) {
3250  LD_SB8(src, src_stride,
3251  src3, src4, src5, src6, src7, src8, src9, src10);
3252  src += (8 * src_stride);
3253 
3254  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3255 
      /* horizontal pass on row pairs (3,7), (4,8), (5,9) and (6,10) */
3256  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3257  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3258  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3259  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3260 
3261  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3262  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3263  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3264  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3265 
      /* build the adjacent-row interleaves for the vertical pass; dst22
       * supplies the carried row for dst32_r and is then reloaded from
       * the upper half of dst73 to pair with dst106 for dst76_r */
3266  dst32_r = __msa_ilvr_h(dst73, dst22);
3267  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3268  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3269  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3270  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3271  dst76_r = __msa_ilvr_h(dst22, dst106);
3272 
3273  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3274  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3275  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3276  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3277  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3278  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3279  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3280  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
      /* two-stage scaling: shift by 6 on the 32-bit sums, then a rounded
       * shift by 6 on the packed 16-bit values */
3281  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3282  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3283  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3284  dst5_r, dst4_r, dst7_r, dst6_r,
3285  tmp0, tmp1, tmp2, tmp3);
3286  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3287  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3288  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3289  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3290  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3291  dst += (8 * dst_stride);
3292 
      /* carry the newest row interleaves into the next iteration */
3293  dst10_r = dst98_r;
3294  dst21_r = dst109_r;
3295  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3296  }
3297 }
3298 
/*
 * Height dispatcher for the 4-wide horizontal + vertical 4-tap kernels.
 *
 * NOTE(review): the line carrying the return type, the function name and
 * the first parameter is not present in this listing (the embedded
 * numbering skips 3299). By analogy with common_vt_4t_4w_msa this is
 * presumably "static void hevc_hv_uni_4t_4w_msa(uint8_t *src," - confirm
 * against the complete file.
 *
 *   height == 2       -> hevc_hv_uni_4t_4x2_msa
 *   height == 4       -> hevc_hv_uni_4t_4x4_msa
 *   height % 8 == 0   -> hevc_hv_uni_4t_4multx8mult_msa
 *
 * Any other height matches no branch: no kernel is called and dst is left
 * untouched.
 */
3300  int32_t src_stride,
3301  uint8_t *dst,
3302  int32_t dst_stride,
3303  const int8_t *filter_x,
3304  const int8_t *filter_y,
3305  int32_t height)
3306 {
3307  if (2 == height) {
3308  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3309  filter_x, filter_y);
3310  } else if (4 == height) {
3311  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3312  filter_x, filter_y);
3313  } else if (0 == (height % 8)) {
3314  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3315  filter_x, filter_y, height);
3316  }
3317 }
3318 
/*
 * Combined horizontal + vertical 4-tap filter for a 6-pixel-wide block,
 * fully unrolled.
 *
 * NOTE(review): the line carrying the return type, the function name and
 * the first parameter is not present in this listing (the embedded
 * numbering skips 3319). From the store pattern (4 + 2 bytes per row) and
 * the sibling names this is presumably
 * "static void hevc_hv_uni_4t_6w_msa(uint8_t *src," - confirm against the
 * complete file.
 *
 * The body reads 11 source rows (3 + 8, starting one row above and one
 * column left of src) and writes 8 destination rows. The height argument is
 * never read, so the row count is fixed. Note that src is not advanced
 * after the LD_SB8 because nothing is loaded afterwards.
 *
 * Each row is filtered horizontally on its own, using the shuffle mask at
 * ff_hevc_mask_arr (the "8 width cases" row). For the vertical pass the "_r"
 * interleaves feed the 32-bit stores at dst (ST_W8); the "_l" interleaves
 * of two consecutive rows are merged with PCKEV_D2_SH / __msa_pckev_d and
 * feed the 16-bit stores at dst + 4 (ST_H8).
 *
 * Helper macros are defined outside this chunk; notes go by their names.
 */
3320  int32_t src_stride,
3321  uint8_t *dst,
3322  int32_t dst_stride,
3323  const int8_t *filter_x,
3324  const int8_t *filter_y,
3325  int32_t height)
3326 {
3327  v16u8 out0, out1, out2;
3328  v16i8 src0, src1, src2, src3, src4, src5, src6;
3329  v16i8 src7, src8, src9, src10;
3330  v8i16 filt0, filt1;
3331  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3332  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3333  v16i8 mask1;
3334  v8i16 filt_h0, filt_h1, filter_vec;
3335  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3336  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3337  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3338  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3339  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3340  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3341  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3342  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3343  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3344 
      /* one row up, one column left */
3345  src -= (src_stride + 1);
3346 
3347  filter_vec = LD_SH(filter_x);
3348  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3349 
3350  filter_vec = LD_SH(filter_y);
3351  UNPCK_R_SB_SH(filter_vec, filter_vec);
3352 
3353  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3354 
      /* second shuffle mask: every index of mask0 advanced by two */
3355  mask1 = mask0 + 2;
3356 
      /* horizontal pass, rows 0..2 */
3357  LD_SB3(src, src_stride, src0, src1, src2);
3358  src += (3 * src_stride);
3359 
3360  XORI_B3_128_SB(src0, src1, src2);
3361 
3362  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3363  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3364  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3365 
3366  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3367  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3368  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3369 
3370  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3371  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3372 
      /* horizontal pass, rows 3..10 */
3373  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3374  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3375 
3376  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3377  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3378  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3379  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3380 
3381  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3383  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3384  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3385 
3386  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3387  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3388  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3389  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3390 
3391  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3392  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3393  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3394  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3395 
      /* interleave the horizontal results of adjacent rows */
3396  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3397  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3398  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3399  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3400  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3401  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3402  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3403  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3404 
      /* merge the "_l" interleaves of two consecutive row pairs so one
       * vertical filter call covers two output rows */
3405  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3406  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3407  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3408 
      /* vertical pass */
3409  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3410  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3411  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3412  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3413  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3414  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3415  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3416  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3417  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3418  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3419  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3420  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
      /* two-stage scaling: shift by 6 on the 32-bit sums, then a rounded
       * shift by 6 on the packed 16-bit values */
3421  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3422  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3423  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3424  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3425  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3426  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3427  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3428  SRARI_H2_SH(tmp4, tmp5, 6);
3429  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7);
3430  SAT_SH2_SH(tmp4, tmp5,7);
3431  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3432  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3433  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
      /* 4 bytes per row at dst, 2 more bytes per row at dst + 4 */
3434  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3435  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3436 }
3437 
3439  int32_t src_stride,
3440  uint8_t *dst,
3441  int32_t dst_stride,
3442  const int8_t *filter_x,
3443  const int8_t *filter_y)
3444 {
    /*
     * 4-tap horizontal + vertical filter for one 8-pixel-wide block that
     * produces two output rows (five input rows are loaded, two rows are
     * stored). Called in this file as
     * hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
     *                        filter_x, filter_y).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     *
     * src_stride / dst_stride are added to the row pointers between rows;
     * filter_x / filter_y supply the horizontal / vertical coefficients.
     */
3445  v16u8 out;
3446  v16i8 src0, src1, src2, src3, src4;
3447  v8i16 filt0, filt1;
3448  v8i16 filt_h0, filt_h1, filter_vec;
3449  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3450  v16i8 mask1;
3451  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452  v8i16 dst0, dst1, dst2, dst3, dst4;
3453  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456  v8i16 out0_r, out1_r;
3457 
    /* Start one row above and one column left of the block. */
3458  src -= (src_stride + 1);
3459 
    /* Horizontal coefficients: elements 0 and 1 are splatted to filt0/filt1. */
3460  filter_vec = LD_SH(filter_x);
3461  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3462 
    /* Vertical coefficients: UNPCK_R_SB_SH presumably widens the bytes to
     * halfwords before they are splatted to filt_h0/filt_h1. */
3463  filter_vec = LD_SH(filter_y);
3464  UNPCK_R_SB_SH(filter_vec, filter_vec);
3465 
3466  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3467 
    /* Second shuffle mask: every index of mask0 advanced by 2. */
3468  mask1 = mask0 + 2;
3469 
    /* Five input rows; XORI_*_128 presumably flips the sign bit so pixels
     * can be processed as signed bytes. */
3470  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3471  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3472 
    /* Horizontal pass: one shuffled pair and one filtered vector per row. */
3473  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3475  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3478 
3479  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3480  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3481  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3482  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3483  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    /* Interleave vertically adjacent filtered rows (right and left halves). */
3484  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3485  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3486  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3487  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    /* Vertical pass: output row 0 uses rows 0..3, output row 1 uses rows 1..4. */
3488  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3489  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3490  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3491  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3492  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3494  SRARI_H2_SH(out0_r, out1_r, 6);
3495  SAT_SH2_SH(out0_r, out1_r, 7);
    /* Pack both rows into one byte vector (the macro name suggests the
     * xor-128 bias is undone here) and store one doubleword per row. */
3496  out = PCKEV_XORI128_UB(out0_r, out1_r);
3497  ST_D2(out, 0, 1, dst, dst_stride);
3498 }
3499 
3501  int32_t src_stride,
3502  uint8_t *dst,
3503  int32_t dst_stride,
3504  const int8_t *filter_x,
3505  const int8_t *filter_y,
3506  int32_t width8mult)
3507 {
    /*
     * 4-tap horizontal + vertical filter producing four output rows for
     * width8mult adjacent 8-pixel-wide columns. Called in this file as
     * hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
     *                            filter_x, filter_y, width8mult).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     *
     * width8mult: number of 8-pixel columns processed; src and dst both
     * advance by 8 per column.
     */
3508  uint32_t cnt;
3509  v16u8 out0, out1;
3510  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3511  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3512  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3513  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3514  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3516  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3517 
    /* Start one row above and one column left of the block. */
3518  src -= (src_stride + 1);
3519 
    /* Horizontal coefficients -> filt0/filt1. */
3520  filter_vec = LD_SH(filter_x);
3521  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3522 
    /* Vertical coefficients (presumably widened to halfwords) ->
     * filt_h0/filt_h1. */
3523  filter_vec = LD_SH(filter_y);
3524  UNPCK_R_SB_SH(filter_vec, filter_vec);
3525 
3526  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3527 
    /* First mask-table entry, plus a copy with every index advanced by 2. */
3528  mask0 = LD_SB(ff_hevc_mask_arr);
3529  mask1 = mask0 + 2;
3530 
    /* One iteration per 8-pixel column; runs width8mult times. */
3531  for (cnt = width8mult; cnt--;) {
    /* Seven input rows feed four output rows of a 4-tap vertical filter. */
3532  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3533  src += 8;
3534  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3535 
    /* Horizontal pass, rows 0..2. */
3536  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3537  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3538  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3539 
3540  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3541  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3542  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3543 
3544  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3545  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3546 
    /* Horizontal pass, rows 3..6 (vec0..vec7 are reused). */
3547  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3548  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3549  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3550  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3551 
3552  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3553  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3554  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3555  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3556 
3557  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3558  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3559  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3560  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3561 
    /* Vertical pass: four output rows, each as right/left halves. */
3562  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3563  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3564  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3565  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3566  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3567  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3568  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3569  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3570 
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3571  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3572  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3573 
3574  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3575  dst3_r, tmp0, tmp1, tmp2, tmp3);
3576  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3577  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    /* Two rows per byte vector; one doubleword stored per output row. */
3578  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3579  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3580  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3581  dst += 8;
3582  }
3583 }
3584 
3586  int32_t src_stride,
3587  uint8_t *dst,
3588  int32_t dst_stride,
3589  const int8_t *filter_x,
3590  const int8_t *filter_y)
3591 {
    /*
     * 4-tap horizontal + vertical filter for one 8-pixel-wide block that
     * produces six output rows (nine input rows are loaded). Called in this
     * file as hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
     *                                filter_x, filter_y).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     */
3592  v16u8 out0, out1, out2;
3593  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3594  v8i16 filt0, filt1;
3595  v8i16 filt_h0, filt_h1, filter_vec;
3596  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3597  v16i8 mask1;
3598  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3599  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3600  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3601  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3602  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3603  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3604  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3605  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3606  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3607  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3608 
    /* Start one row above and one column left of the block. */
3609  src -= (src_stride + 1);
3610 
    /* Horizontal coefficients -> filt0/filt1. */
3611  filter_vec = LD_SH(filter_x);
3612  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3613 
    /* Vertical coefficients (presumably widened to halfwords) ->
     * filt_h0/filt_h1. */
3614  filter_vec = LD_SH(filter_y);
3615  UNPCK_R_SB_SH(filter_vec, filter_vec);
3616 
3617  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3618 
    /* Second shuffle mask: every index of mask0 advanced by 2. */
3619  mask1 = mask0 + 2;
3620 
    /* Nine input rows are fetched as a group of five followed by four. */
3621  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3622  src += (5 * src_stride);
3623  LD_SB4(src, src_stride, src5, src6, src7, src8);
3624 
3625  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3626  XORI_B4_128_SB(src5, src6, src7, src8);
3627 
    /* Horizontal pass: one shuffled pair and one filtered vector per row. */
3628  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3629  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3630  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3631  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3632  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3633  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3634  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3635  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3636  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3637 
3638  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3639  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3640  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3641  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3642  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3643  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3644  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3645  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3646  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3647 
    /* Interleave vertically adjacent filtered rows (right and left halves). */
3648  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3649  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3650  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3651  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3652  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3653  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3654  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3655  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3656 
    /* Vertical pass: six output rows, each as right/left halves. */
3657  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3658  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3659  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3660  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3661  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3662  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3663  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3664  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3665  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3666  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3667  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3668  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3669 
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3670  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3671  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3672  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3673  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3674  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3675  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3676  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3677  SRARI_H2_SH(out4_r, out5_r, 6);
3678  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3679  SAT_SH2_SH(out4_r, out5_r, 7);
3680  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3681  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3682  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3683 
    /* Rows 0..3 first, then rows 4..5 starting at dst + 4 * dst_stride. */
3684  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3685  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3686 }
3687 
3689  int32_t src_stride,
3690  uint8_t *dst,
3691  int32_t dst_stride,
3692  const int8_t *filter_x,
3693  const int8_t *filter_y,
3694  int32_t height,
3695  int32_t width8mult)
3696 {
    /*
     * 4-tap horizontal + vertical filter for width8mult adjacent
     * 8-pixel-wide columns, processed four output rows at a time. Called in
     * this file as hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst,
     * dst_stride, filter_x, filter_y, height, width8mult).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     *
     * height: number of output rows; the inner loop runs (height >> 2)
     * times, so any remainder rows beyond a multiple of four are not
     * written.
     * width8mult: number of 8-pixel columns; src and dst advance by 8 per
     * column.
     */
3697  uint32_t loop_cnt, cnt;
3698  uint8_t *src_tmp;
3699  uint8_t *dst_tmp;
3700  v16u8 out0, out1;
3701  v16i8 src0, src1, src2, src3, src4, src5, src6;
3702  v8i16 filt0, filt1;
3703  v8i16 filt_h0, filt_h1, filter_vec;
3704  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3705  v16i8 mask1;
3706  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3707  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3708  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3709  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3710  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3711  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3712  v8i16 out0_r, out1_r, out2_r, out3_r;
3713 
    /* Start one row above and one column left of the block. */
3714  src -= (src_stride + 1);
3715 
    /* Horizontal coefficients -> filt0/filt1. */
3716  filter_vec = LD_SH(filter_x);
3717  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3718 
    /* Vertical coefficients (presumably widened to halfwords) ->
     * filt_h0/filt_h1. */
3719  filter_vec = LD_SH(filter_y);
3720  UNPCK_R_SB_SH(filter_vec, filter_vec);
3721 
3722  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3723 
    /* Second shuffle mask: every index of mask0 advanced by 2. */
3724  mask1 = mask0 + 2;
3725 
    /* Outer loop: one iteration per 8-pixel column. */
3726  for (cnt = width8mult; cnt--;) {
3727  src_tmp = src;
3728  dst_tmp = dst;
3729 
    /* Prime the vertical filter with the first three rows of the column. */
3730  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3731  src_tmp += (3 * src_stride);
3732 
3733  XORI_B3_128_SB(src0, src1, src2);
3734 
3735  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3736  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3737  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3738 
3739  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3740  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3741  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3742 
3743  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3744  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3745 
    /* Inner loop: four new input rows yield four output rows. */
3746  for (loop_cnt = (height >> 2); loop_cnt--;) {
3747  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3748  src_tmp += (4 * src_stride);
3749 
3750  XORI_B4_128_SB(src3, src4, src5, src6);
3751 
    /* Horizontal pass on the four new rows. */
3752  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3753  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3754  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3755  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3756 
3757  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3758  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3759  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3760  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3761 
3762  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3763  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3764  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3765  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3766 
    /* Vertical pass: four output rows, each as right/left halves. */
3767  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3768  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3769  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3770  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3771  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3772  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3773  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3774  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3775 
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3776  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3777  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3778 
3779  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3780  dst2_l, dst2_r, dst3_l, dst3_r,
3781  out0_r, out1_r, out2_r, out3_r);
3782 
3783  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3784  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3785  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3786  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3787  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3788  dst_tmp += (4 * dst_stride);
3789 
    /* Carry the last interleaved row pairs and the last filtered row into
     * the next group of four rows. */
3790  dst10_r = dst54_r;
3791  dst10_l = dst54_l;
3792  dst21_r = dst65_r;
3793  dst21_l = dst65_l;
3794  dst2 = dst6;
3795  }
3796 
    /* Next 8-pixel column. */
3797  src += 8;
3798  dst += 8;
3799  }
3800 }
3801 
3803  int32_t src_stride,
3804  uint8_t *dst,
3805  int32_t dst_stride,
3806  const int8_t *filter_x,
3807  const int8_t *filter_y,
3808  int32_t height)
3809 {
    /*
     * Height dispatcher for the 8-pixel-wide 4-tap hv filter: heights 2, 4
     * and 6 go to dedicated routines, any other multiple of four goes to the
     * generic 4-rows-at-a-time routine with a single 8-pixel column.
     * No branch handles the remaining heights (for example 10 or odd
     * values); in that case nothing is written.
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     */
3810  if (2 == height) {
3811  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3812  filter_x, filter_y);
3813  } else if (4 == height) {
3814  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3815  filter_x, filter_y, 1);
3816  } else if (6 == height) {
3817  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3818  filter_x, filter_y);
3819  } else if (0 == (height % 4)) {
3820  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3821  filter_x, filter_y, height, 1);
3822  }
3823 }
3824 
3826  int32_t src_stride,
3827  uint8_t *dst,
3828  int32_t dst_stride,
3829  const int8_t *filter_x,
3830  const int8_t *filter_y,
3831  int32_t height)
3832 {
    /*
     * 4-tap horizontal + vertical filter for a 12-pixel-wide block, done in
     * two stages: the left 8 columns (4 iterations x 4 rows), then the right
     * 4 columns (2 iterations x 8 rows).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     *
     * height is never read in this body; both loop counts are constants, so
     * exactly 16 output rows are produced.
     * NOTE(review): presumably callers only use this with height == 16 -
     * confirm.
     */
3833  uint32_t loop_cnt;
3834  uint8_t *src_tmp, *dst_tmp;
3835  v16u8 out0, out1;
3836  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3837  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3838  v16i8 mask0, mask1, mask2, mask3;
3839  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3840  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3841  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3842  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3843  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3844  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3845  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3846  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3847 
    /* Start one row above and one column left of the block. */
3848  src -= (src_stride + 1);
3849 
    /* Horizontal coefficients -> filt0/filt1. */
3850  filter_vec = LD_SH(filter_x);
3851  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3852 
    /* Vertical coefficients (presumably widened to halfwords) ->
     * filt_h0/filt_h1. */
3853  filter_vec = LD_SH(filter_y);
3854  UNPCK_R_SB_SH(filter_vec, filter_vec);
3855 
3856  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3857 
    /* Masks for the 8-wide stage: first table entry and the same + 2. */
3858  mask0 = LD_SB(ff_hevc_mask_arr);
3859  mask1 = mask0 + 2;
3860 
    /* ---- Stage 1: left 8 columns, via temporary row pointers. ---- */
3861  src_tmp = src;
3862  dst_tmp = dst;
3863 
    /* Prime the vertical filter with the first three rows. */
3864  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3865  src_tmp += (3 * src_stride);
3866 
3867  XORI_B3_128_SB(src0, src1, src2);
3868 
3869  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3870  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3871  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3872 
3873  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3874  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3875  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3876 
3877  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3878  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3879 
    /* Fixed 4 iterations, 4 output rows each. */
3880  for (loop_cnt = 4; loop_cnt--;) {
3881  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3882  src_tmp += (4 * src_stride);
3883  XORI_B4_128_SB(src3, src4, src5, src6);
3884 
    /* Horizontal pass on the four new rows. */
3885  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3886  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3887  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3888  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3889 
3890  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3891  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3892  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3893  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3894 
3895  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3896  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3897  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3898  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3899 
    /* Vertical pass: four output rows, each as right/left halves. */
3900  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3901  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3902  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3903  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3904  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3905  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3906  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3907  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3908 
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3909  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3910  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3911 
3912  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3913  dst3_r, tmp0, tmp1, tmp2, tmp3);
3914  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3915  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3916  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3917  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3918  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3919  dst_tmp += (4 * dst_stride);
3920 
    /* Carry the last row pairs and last filtered row into the next group. */
3921  dst10_r = dst54_r;
3922  dst10_l = dst54_l;
3923  dst21_r = dst65_r;
3924  dst21_l = dst65_l;
3925  dsth2 = dsth6;
3926  }
3927 
    /* ---- Stage 2: remaining 4 columns, starting 8 pixels to the right. ---- */
3928  src += 8;
3929  dst += 8;
3930 
    /* Second mask-table entry (labelled "4 width cases" in the table): its
     * indices 16..20 refer to the second shuffle source, so each shuffled
     * vector presumably packs 4-pixel windows from two different rows. */
3931  mask2 = LD_SB(ff_hevc_mask_arr + 16);
3932  mask3 = mask2 + 2;
3933 
    /* Prime with three rows, paired as (0,1) and (1,2). */
3934  LD_SB3(src, src_stride, src0, src1, src2);
3935  src += (3 * src_stride);
3936  XORI_B3_128_SB(src0, src1, src2);
3937  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3938  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3939 
3940  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3941  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3942 
    /* Note the "left" interleave result is kept in dst21_r here; dst22 is
     * doubleword 1 of dst21 replicated. */
3943  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3944  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3945 
    /* Fixed 2 iterations, 8 output rows each. */
3946  for (loop_cnt = 2; loop_cnt--;) {
3947  LD_SB8(src, src_stride,
3948  src3, src4, src5, src6, src7, src8, src9, src10);
3949  src += (8 * src_stride);
3950  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
    /* Rows are paired four apart: (3,7), (4,8), (5,9), (6,10). */
3951  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3952  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3953  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3954  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3955 
3956  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3957  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3958  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3959  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3960 
    /* Build the interleaved row pairs needed by the vertical pass; dst22 is
     * consumed for dst32_r first and then reloaded from dst73 for dst76_r. */
3961  dst32_r = __msa_ilvr_h(dst73, dst22);
3962  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3963  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3964  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3965  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3966  dst76_r = __msa_ilvr_h(dst22, dst106);
3967 
    /* Vertical pass: eight output rows, one vector each. */
3968  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3969  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3970  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3971  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3972  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3973  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3974  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3975  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    /* Shift by 6, pack to halfwords, rounding shift by 6, saturate (arg 7). */
3976  SRA_4V(dst0, dst1, dst2, dst3, 6);
3977  SRA_4V(dst4, dst5, dst6, dst7, 6);
3978  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3979  tmp0, tmp1, tmp2, tmp3);
3980  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3981  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    /* Four rows per byte vector; one word stored per output row. */
3982  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3983  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3984  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3985  dst += (8 * dst_stride);
3986 
    /* Carry the last row pairs into the next group of eight rows. */
3987  dst10_r = dst98_r;
3988  dst21_r = dst109_r;
3989  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3990  }
3991 }
3992 
3994  int32_t src_stride,
3995  uint8_t *dst,
3996  int32_t dst_stride,
3997  const int8_t *filter_x,
3998  const int8_t *filter_y,
3999  int32_t height)
4000 {
    /*
     * 16-pixel-wide 4-tap hv filter, handled as two 8-pixel columns
     * (width8mult = 2): the fixed 4-row routine when height is 4, otherwise
     * the generic 4-rows-at-a-time routine.
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     */
4001  if (4 == height) {
4002  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4003  filter_y, 2);
4004  } else {
4005  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4006  filter_x, filter_y, height, 2);
4007  }
4008 }
4009 
4011  int32_t src_stride,
4012  uint8_t *dst,
4013  int32_t dst_stride,
4014  const int8_t *filter_x,
4015  const int8_t *filter_y,
4016  int32_t height)
4017 {
    /*
     * 24-pixel-wide 4-tap hv filter: forwards to the generic
     * 4-rows-at-a-time routine with three 8-pixel columns (width8mult = 3).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     */
4018  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4019  filter_x, filter_y, height, 3);
4020 }
4021 
4023  int32_t src_stride,
4024  uint8_t *dst,
4025  int32_t dst_stride,
4026  const int8_t *filter_x,
4027  const int8_t *filter_y,
4028  int32_t height)
4029 {
    /*
     * 32-pixel-wide 4-tap hv filter: forwards to the generic
     * 4-rows-at-a-time routine with four 8-pixel columns (width8mult = 4).
     * NOTE(review): the line holding the function name and its first
     * parameter is not present in this extract.
     */
4030  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4031  filter_x, filter_y, height, 4);
4032 }
4033 
/*
 * UNI_MC_COPY(WIDTH) defines the public entry point
 * ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa(), which forwards to
 * copy_width<WIDTH>_msa(src, src_stride, dst, dst_stride, height).
 * Note the argument order swap: the entry point takes dst first, the helper
 * takes src first. The mx, my and width parameters are accepted but not used
 * by the generated body.
 */
4034 #define UNI_MC_COPY(WIDTH) \
4035 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4036  ptrdiff_t dst_stride, \
4037  uint8_t *src, \
4038  ptrdiff_t src_stride, \
4039  int height, \
4040  intptr_t mx, \
4041  intptr_t my, \
4042  int width) \
4043 { \
4044  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4045 }
4046 
/* Instantiate the copy entry points for widths 8 through 64. */
4047 UNI_MC_COPY(8);
4048 UNI_MC_COPY(12);
4049 UNI_MC_COPY(16);
4050 UNI_MC_COPY(24);
4051 UNI_MC_COPY(32);
4052 UNI_MC_COPY(48);
4053 UNI_MC_COPY(64);
4054 
4055 #undef UNI_MC_COPY
4056 
/*
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) defines the public entry
 * point ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa() for a
 * single-direction filter.
 *
 *   PEL       table selector: the filter is taken from ff_hevc_<PEL>_filters
 *   DIR       direction letter used in the public name (h or v below)
 *   WIDTH     block width used in both the public and the helper name
 *   TAP       tap count used in the helper name (8 or 4 below)
 *   DIR1      direction tag used in the helper name (hz or vt below)
 *   FILT_DIR  which of the mx / my parameters indexes the filter table
 *
 * The generated body looks up ff_hevc_<PEL>_filters[FILT_DIR - 1] and calls
 * common_<DIR1>_<TAP>t_<WIDTH>w_msa(src, src_stride, dst, dst_stride,
 * filter, height).
 * NOTE(review): the "- 1" index presumably relies on callers passing a
 * non-zero mx/my - confirm against the call sites.
 */
4057 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4058 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4059  ptrdiff_t dst_stride, \
4060  uint8_t *src, \
4061  ptrdiff_t src_stride, \
4062  int height, \
4063  intptr_t mx, \
4064  intptr_t my, \
4065  int width) \
4066 { \
4067  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4068  \
4069  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4070  filter, height); \
4071 }
4072 
/* qpel, horizontal, 8-tap: filter indexed by mx. */
4073 UNI_MC(qpel, h, 4, 8, hz, mx);
4074 UNI_MC(qpel, h, 8, 8, hz, mx);
4075 UNI_MC(qpel, h, 12, 8, hz, mx);
4076 UNI_MC(qpel, h, 16, 8, hz, mx);
4077 UNI_MC(qpel, h, 24, 8, hz, mx);
4078 UNI_MC(qpel, h, 32, 8, hz, mx);
4079 UNI_MC(qpel, h, 48, 8, hz, mx);
4080 UNI_MC(qpel, h, 64, 8, hz, mx);
4081 
/* qpel, vertical, 8-tap: filter indexed by my. */
4082 UNI_MC(qpel, v, 4, 8, vt, my);
4083 UNI_MC(qpel, v, 8, 8, vt, my);
4084 UNI_MC(qpel, v, 12, 8, vt, my);
4085 UNI_MC(qpel, v, 16, 8, vt, my);
4086 UNI_MC(qpel, v, 24, 8, vt, my);
4087 UNI_MC(qpel, v, 32, 8, vt, my);
4088 UNI_MC(qpel, v, 48, 8, vt, my);
4089 UNI_MC(qpel, v, 64, 8, vt, my);
4090 
/* epel, horizontal, 4-tap: filter indexed by mx. */
4091 UNI_MC(epel, h, 4, 4, hz, mx);
4092 UNI_MC(epel, h, 6, 4, hz, mx);
4093 UNI_MC(epel, h, 8, 4, hz, mx);
4094 UNI_MC(epel, h, 12, 4, hz, mx);
4095 UNI_MC(epel, h, 16, 4, hz, mx);
4096 UNI_MC(epel, h, 24, 4, hz, mx);
4097 UNI_MC(epel, h, 32, 4, hz, mx);
4098 
/* epel, vertical, 4-tap: filter indexed by my. */
4099 UNI_MC(epel, v, 4, 4, vt, my);
4100 UNI_MC(epel, v, 6, 4, vt, my);
4101 UNI_MC(epel, v, 8, 4, vt, my);
4102 UNI_MC(epel, v, 12, 4, vt, my);
4103 UNI_MC(epel, v, 16, 4, vt, my);
4104 UNI_MC(epel, v, 24, 4, vt, my);
4105 UNI_MC(epel, v, 32, 4, vt, my);
4106 
4107 #undef UNI_MC
4108 
/*
 * UNI_MC_HV(PEL, WIDTH, TAP) defines the public entry point
 * ff_hevc_put_hevc_uni_<PEL>_hv<WIDTH>_8_msa() for the combined
 * horizontal + vertical filter.
 *
 * The generated body takes filter_x from ff_hevc_<PEL>_filters[mx - 1] and
 * filter_y from ff_hevc_<PEL>_filters[my - 1], then calls
 * hevc_hv_uni_<TAP>t_<WIDTH>w_msa(src, src_stride, dst, dst_stride,
 * filter_x, filter_y, height). The width parameter is accepted but not used
 * by the generated body.
 * NOTE(review): the "- 1" indices presumably rely on callers passing
 * non-zero mx and my - confirm against the call sites.
 */
4109 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4110 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4111  ptrdiff_t dst_stride, \
4112  uint8_t *src, \
4113  ptrdiff_t src_stride, \
4114  int height, \
4115  intptr_t mx, \
4116  intptr_t my, \
4117  int width) \
4118 { \
4119  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4120  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4121  \
4122  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4123  filter_x, filter_y, height); \
4124 }
4125 
/* qpel entry points use the 8-tap hv helpers. */
4126 UNI_MC_HV(qpel, 4, 8);
4127 UNI_MC_HV(qpel, 8, 8);
4128 UNI_MC_HV(qpel, 12, 8);
4129 UNI_MC_HV(qpel, 16, 8);
4130 UNI_MC_HV(qpel, 24, 8);
4131 UNI_MC_HV(qpel, 32, 8);
4132 UNI_MC_HV(qpel, 48, 8);
4133 UNI_MC_HV(qpel, 64, 8);
4134 
/* epel entry points use the 4-tap hv helpers. */
4135 UNI_MC_HV(epel, 4, 4);
4136 UNI_MC_HV(epel, 6, 4);
4137 UNI_MC_HV(epel, 8, 4);
4138 UNI_MC_HV(epel, 12, 4);
4139 UNI_MC_HV(epel, 16, 4);
4140 UNI_MC_HV(epel, 24, 4);
4141 UNI_MC_HV(epel, 32, 4);
4142 
4143 #undef UNI_MC_HV
#define VSHF_B4_SB(...)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
static void hevc_hv_uni_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
#define SRARI_W4_SW(...)
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ST_D2(in, idx0, idx1, pdst, stride)
static void hevc_hv_uni_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ILVR_H4_SH(...)
#define PCKEV_B2_SH(...)
static void copy_width24_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define XORI_B2_128_SB(...)
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
#define PCKEV_XORI128_UB(in0, in1)
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define SD
Definition: ccaption_dec.c:819
#define LD_SB(...)
#define XORI_B3_128_SB(...)
static void hevc_hv_uni_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define UNPCK_R_SB_SH(in, out)
#define LD_UB4(...)
#define DPADD_SB4_SH(...)
#define ILVR_B2_SB(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define src
Definition: vp8dsp.c:254
static void copy_width48_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define ILVL_H2_SH(...)
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD_SB2(...)
#define ILVL_H4_SH(...)
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B2_128_SH(...)
#define VSHF_B2_SB(...)
#define SRA_4V(in0, in1, in2, in3, shift)
#define XORI_B4_128_UB(...)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
#define ILVR_D2_SB(...)
uint8_t
#define LD4(psrc, stride, out0, out1, out2, out3)
#define SPLATI_W2_SH(...)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define SRARI_H4_SH(...)
#define SPLATI_H4_SH(...)
static void hevc_hv_uni_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ILVL_B2_SB(...)
#define height
#define ST_D1(in, idx, pdst)
#define LD_SH(...)
static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ILVRL_H2_SH(...)
#define ST_H2(in, idx0, idx1, pdst, stride)
#define ILVR_D3_SB(...)
#define LD_SB8(...)
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,idx6, idx7, pdst, stride)
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static int aligned(int val)
Definition: dashdec.c:178
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one per link. The additional frames can be left buffered in the filter.
#define VSHF_B2_SH(...)
#define SPLATI_H2_SB(...)
static void hevc_hv_uni_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define XORI_B7_128_SB(...)
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ILVR_B2_SH(...)
#define XORI_B4_128_SB(...)
static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define SPLATI_W4_SH(...)
#define DPADD_SB2_SH(...)
#define SRARI_H2_SH(...)
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD_UB8(...)
#define width
#define PCKEV_D2_SH(...)
#define UNI_MC_HV(PEL, WIDTH, TAP)
#define SAT_SW4_SW(...)
#define PCKEV_H2_SW(...)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
int32_t
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define PCKEV_H2_SH(...)
#define LD_SB3(...)
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width12_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST_UB(...)
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define SAT_SH4_SH(...)
#define SPLATI_H4_SB(...)
#define LD_SB4(...)
#define PCKEV_B4_UB(...)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
#define ST_UB8(...)
#define ST_UB4(...)
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define src1
Definition: h264pred.c:139
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ILVL_B4_SB(...)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
#define SAT_SH2_SH(...)
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void copy_width64_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width8mult)
static void hevc_hv_uni_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define DOTP_SB4_SH(...)
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
#define SAT_SH3_SH(...)
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define src0
Definition: h264pred.c:138
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SW(val, pdst)
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD_SB7(...)
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD_SB5(...)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ILVR_H2_SH(...)
static const uint8_t ff_hevc_mask_arr[16 *3]
#define UNI_MC_COPY(WIDTH)
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
#define LD_UB(...)
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ILVR_B4_SB(...)
static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
FILE * out
Definition: movenc.c:54
#define ILVR_B3_SH(...)
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST_W2(in, idx0, idx1, pdst, stride)
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define PCKEV_H4_SH(...)
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD2(psrc, stride, out0, out1)
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
static void copy_width32_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define DOTP_SB2_SH(...)
static uint8_t tmp[11]
Definition: aes_ctr.c:26
#define VSHF_B3_SB(...)
#define DOTP_SB3_SH(...)