/*
 * FFmpeg -- hevc_mc_uniw_msa.c (HEVC uni-directional weighted MC, MIPS MSA)
 */
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

/* Byte-shuffle control masks for VSHF-based 8-tap filtering.
 * First 16 bytes: overlapping pair indices for the 8-pixel-wide cases;
 * last 16 bytes: 4-pixel-wide cases -- indices >= 16 select bytes from
 * the second source operand of the shuffle. */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
31 
/* Uni-directional weighting of two vectors of eight 16-bit samples:
 * widen to 32 bits (interleave with self), multiply by the weight wgt_w,
 * shift right with rounding by rnd_w, pack back to 16 bits, add the
 * offset with saturation, and clip the results to [0, 255]. */
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
                                       out0_h, out1_h)                       \
{                                                                            \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                \
                                                                             \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                             \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                             \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,     \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                  \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                   \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);         \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);         \
    CLIP_SH2_0_255(out0_h, out1_h);                                          \
}
46 
/* Four-vector variant: applies HEVC_UNIW_RND_CLIP2_MAX_SATU_H to two
 * pairs of input vectors with the same weight/offset/rounding values. */
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
                                       offset_h, rnd_w, out0_h, out1_h,    \
                                       out2_h, out3_h)                     \
{                                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
                                   out0_h, out1_h);                        \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
                                   out2_h, out3_h);                        \
}
56 
58  int32_t src_stride,
59  uint8_t *dst,
60  int32_t dst_stride,
64  int32_t rnd_val)
65 {
66  uint32_t loop_cnt, tp0, tp1, tp2, tp3;
67  v16i8 zero = { 0 };
68  v16u8 out0, out1;
69  v16i8 src0 = { 0 }, src1 = { 0 };
70  v8i16 dst0, dst1, dst2, dst3, offset_vec;
71  v4i32 weight_vec, rnd_vec;
72 
73  weight = weight & 0x0000FFFF;
74  weight_vec = __msa_fill_w(weight);
75  offset_vec = __msa_fill_h(offset);
76  rnd_vec = __msa_fill_w(rnd_val);
77 
78  if (2 == height) {
79  v4i32 dst0_r, dst0_l;
80 
81  LW2(src, src_stride, tp0, tp1);
82  INSERT_W2_SB(tp0, tp1, src0);
83  dst0 = (v8i16) __msa_ilvr_b(zero, src0);
84  dst0 <<= 6;
85 
86  ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
87  DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
88  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
89  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
90  dst0 += offset_vec;
91  CLIP_SH_0_255(dst0);
92  out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
93  ST_W2(out0, 0, 1, dst, dst_stride);
94  } else if (4 == height) {
95  LW4(src, src_stride, tp0, tp1, tp2, tp3);
96  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
97  ILVRL_B2_SH(zero, src0, dst0, dst1);
98  SLLI_2V(dst0, dst1, 6);
99  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
100  rnd_vec, dst0, dst1);
101  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
102  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
103  } else if (0 == (height % 8)) {
104  for (loop_cnt = (height >> 3); loop_cnt--;) {
105  LW4(src, src_stride, tp0, tp1, tp2, tp3);
106  src += 4 * src_stride;
107  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
108  LW4(src, src_stride, tp0, tp1, tp2, tp3);
109  src += 4 * src_stride;
110  INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
111  ILVRL_B2_SH(zero, src0, dst0, dst1);
112  ILVRL_B2_SH(zero, src1, dst2, dst3);
113  SLLI_4V(dst0, dst1, dst2, dst3, 6);
114  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
115  offset_vec, rnd_vec, dst0, dst1,
116  dst2, dst3);
117  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
118  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
119  dst += 8 * dst_stride;
120  }
121  }
122 }
123 
125  int32_t src_stride,
126  uint8_t *dst,
127  int32_t dst_stride,
128  int32_t height,
129  int32_t weight,
130  int32_t offset,
131  int32_t rnd_val)
132 {
133  uint32_t loop_cnt;
134  uint64_t tp0, tp1, tp2, tp3;
135  v16i8 zero = { 0 };
136  v16u8 out0, out1, out2, out3;
137  v16i8 src0, src1, src2, src3;
138  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
139  v4i32 weight_vec, rnd_vec;
140 
141  weight = weight & 0x0000FFFF;
142  weight_vec = __msa_fill_w(weight);
143  offset_vec = __msa_fill_h(offset);
144  rnd_vec = __msa_fill_w(rnd_val);
145 
146  for (loop_cnt = (height >> 3); loop_cnt--;) {
147  LD4(src, src_stride, tp0, tp1, tp2, tp3);
148  src += (4 * src_stride);
149  INSERT_D2_SB(tp0, tp1, src0);
150  INSERT_D2_SB(tp2, tp3, src1);
151  LD4(src, src_stride, tp0, tp1, tp2, tp3);
152  src += (4 * src_stride);
153  INSERT_D2_SB(tp0, tp1, src2);
154  INSERT_D2_SB(tp2, tp3, src3);
155 
156  ILVRL_B2_SH(zero, src0, dst0, dst1);
157  ILVRL_B2_SH(zero, src1, dst2, dst3);
158  ILVRL_B2_SH(zero, src2, dst4, dst5);
159  ILVRL_B2_SH(zero, src3, dst6, dst7);
160 
161  SLLI_4V(dst0, dst1, dst2, dst3, 6);
162  SLLI_4V(dst4, dst5, dst6, dst7, 6);
163 
164  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
165  offset_vec, rnd_vec, dst0, dst1, dst2,
166  dst3);
167  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
168  offset_vec, rnd_vec, dst4, dst5, dst6,
169  dst7);
170  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
171  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
172 
173  ST_W2(out0, 0, 2, dst, dst_stride);
174  ST_H2(out0, 2, 6, dst + 4, dst_stride);
175  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
176  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
177  dst += (4 * dst_stride);
178  ST_W2(out2, 0, 2, dst, dst_stride);
179  ST_H2(out2, 2, 6, dst + 4, dst_stride);
180  ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
181  ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
182  dst += (4 * dst_stride);
183  }
184 }
185 
187  int32_t src_stride,
188  uint8_t *dst,
189  int32_t dst_stride,
190  int32_t height,
191  int32_t weight,
192  int32_t offset,
193  int32_t rnd_val)
194 {
195  uint32_t loop_cnt;
196  uint64_t tp0, tp1, tp2, tp3;
197  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
198  v16i8 zero = { 0 };
199  v16u8 out0, out1, out2, out3;
200  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
201  v4i32 weight_vec, rnd_vec;
202 
203  weight = weight & 0x0000FFFF;
204  weight_vec = __msa_fill_w(weight);
205  offset_vec = __msa_fill_h(offset);
206  rnd_vec = __msa_fill_w(rnd_val);
207 
208  if (2 == height) {
209  LD2(src, src_stride, tp0, tp1);
210  INSERT_D2_SB(tp0, tp1, src0);
211  ILVRL_B2_SH(zero, src0, dst0, dst1);
212  SLLI_2V(dst0, dst1, 6);
213  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
214  rnd_vec, dst0, dst1);
215  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
216  ST_D2(out0, 0, 1, dst, dst_stride);
217  } else if (4 == height) {
218  LD4(src, src_stride, tp0, tp1, tp2, tp3);
219  INSERT_D2_SB(tp0, tp1, src0);
220  INSERT_D2_SB(tp2, tp3, src1);
221  ILVRL_B2_SH(zero, src0, dst0, dst1);
222  ILVRL_B2_SH(zero, src1, dst2, dst3);
223  SLLI_4V(dst0, dst1, dst2, dst3, 6);
224  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
225  offset_vec, rnd_vec, dst0, dst1, dst2,
226  dst3);
227  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
228  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
229  } else if (6 == height) {
230  LD4(src, src_stride, tp0, tp1, tp2, tp3);
231  src += 4 * src_stride;
232  INSERT_D2_SB(tp0, tp1, src0);
233  INSERT_D2_SB(tp2, tp3, src1);
234  LD2(src, src_stride, tp0, tp1);
235  INSERT_D2_SB(tp0, tp1, src2);
236  ILVRL_B2_SH(zero, src0, dst0, dst1);
237  ILVRL_B2_SH(zero, src1, dst2, dst3);
238  ILVRL_B2_SH(zero, src2, dst4, dst5);
239  SLLI_4V(dst0, dst1, dst2, dst3, 6);
240  SLLI_2V(dst4, dst5, 6);
241  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
242  offset_vec, rnd_vec, dst0, dst1, dst2,
243  dst3);
244  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
245  rnd_vec, dst4, dst5);
246  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
247  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
248  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
249  } else if (0 == height % 8) {
250  for (loop_cnt = (height >> 3); loop_cnt--;) {
251  LD4(src, src_stride, tp0, tp1, tp2, tp3);
252  src += 4 * src_stride;
253  INSERT_D2_SB(tp0, tp1, src0);
254  INSERT_D2_SB(tp2, tp3, src1);
255  LD4(src, src_stride, tp0, tp1, tp2, tp3);
256  src += 4 * src_stride;
257  INSERT_D2_SB(tp0, tp1, src2);
258  INSERT_D2_SB(tp2, tp3, src3);
259 
260  ILVRL_B2_SH(zero, src0, dst0, dst1);
261  ILVRL_B2_SH(zero, src1, dst2, dst3);
262  ILVRL_B2_SH(zero, src2, dst4, dst5);
263  ILVRL_B2_SH(zero, src3, dst6, dst7);
264  SLLI_4V(dst0, dst1, dst2, dst3, 6);
265  SLLI_4V(dst4, dst5, dst6, dst7, 6);
266  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
267  offset_vec, rnd_vec, dst0, dst1,
268  dst2, dst3);
269  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
270  offset_vec, rnd_vec, dst4, dst5,
271  dst6, dst7);
272  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
273  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
274  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
275  dst, dst_stride);
276  dst += (8 * dst_stride);
277  }
278  }
279 }
280 
282  int32_t src_stride,
283  uint8_t *dst,
284  int32_t dst_stride,
285  int32_t height,
286  int32_t weight,
287  int32_t offset,
288  int32_t rnd_val)
289 {
290  uint32_t loop_cnt;
291  v16u8 out0, out1, out2;
292  v16i8 src0, src1, src2, src3;
293  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
294  v8i16 offset_vec;
295  v16i8 zero = { 0 };
296  v4i32 weight_vec, rnd_vec;
297 
298  weight = weight & 0x0000FFFF;
299  weight_vec = __msa_fill_w(weight);
300  offset_vec = __msa_fill_h(offset);
301  rnd_vec = __msa_fill_w(rnd_val);
302 
303  for (loop_cnt = 4; loop_cnt--;) {
304  LD_SB4(src, src_stride, src0, src1, src2, src3);
305  src += (4 * src_stride);
306  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
307  dst0, dst1, dst2, dst3);
308 
309  ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
310  ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
311  SLLI_4V(dst0, dst1, dst2, dst3, 6);
312  SLLI_2V(dst4, dst5, 6);
313  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
314  offset_vec, rnd_vec, dst0, dst1, dst2,
315  dst3);
316  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
317  rnd_vec, dst4, dst5);
318 
319  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
320  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
321  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
322  dst += (4 * dst_stride);
323  }
324 }
325 
327  int32_t src_stride,
328  uint8_t *dst,
329  int32_t dst_stride,
330  int32_t height,
331  int32_t weight,
332  int32_t offset,
333  int32_t rnd_val)
334 {
335  uint32_t loop_cnt;
336  v16u8 out0, out1, out2, out3;
337  v16i8 src0, src1, src2, src3;
338  v16i8 zero = { 0 };
339  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
340  v4i32 weight_vec, rnd_vec;
341 
342  weight = weight & 0x0000FFFF;
343  weight_vec = __msa_fill_w(weight);
344  offset_vec = __msa_fill_h(offset);
345  rnd_vec = __msa_fill_w(rnd_val);
346 
347  for (loop_cnt = height >> 2; loop_cnt--;) {
348  LD_SB4(src, src_stride, src0, src1, src2, src3);
349  src += (4 * src_stride);
350  ILVRL_B2_SH(zero, src0, dst0, dst1);
351  ILVRL_B2_SH(zero, src1, dst2, dst3);
352  ILVRL_B2_SH(zero, src2, dst4, dst5);
353  ILVRL_B2_SH(zero, src3, dst6, dst7);
354  SLLI_4V(dst0, dst1, dst2, dst3, 6);
355  SLLI_4V(dst4, dst5, dst6, dst7, 6);
356  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
357  offset_vec, rnd_vec, dst0, dst1, dst2,
358  dst3);
359  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
360  offset_vec, rnd_vec, dst4, dst5, dst6,
361  dst7);
362  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
363  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
364  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
365  dst += (4 * dst_stride);
366  }
367 }
368 
370  int32_t src_stride,
371  uint8_t *dst,
372  int32_t dst_stride,
373  int32_t height,
374  int32_t weight,
375  int32_t offset,
376  int32_t rnd_val)
377 {
378  uint32_t loop_cnt;
379  v16u8 out0, out1, out2, out3, out4, out5;
380  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
381  v16i8 zero = { 0 };
382  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
383  v8i16 dst8, dst9, dst10, dst11;
384  v4i32 weight_vec, rnd_vec;
385 
386  weight = weight & 0x0000FFFF;
387  weight_vec = __msa_fill_w(weight);
388  offset_vec = __msa_fill_h(offset);
389  rnd_vec = __msa_fill_w(rnd_val);
390 
391  for (loop_cnt = (height >> 2); loop_cnt--;) {
392  LD_SB4(src, src_stride, src0, src1, src4, src5);
393  LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
394  src += (4 * src_stride);
395 
396  ILVRL_B2_SH(zero, src0, dst0, dst1);
397  ILVRL_B2_SH(zero, src1, dst2, dst3);
398  ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
399  ILVRL_B2_SH(zero, src4, dst6, dst7);
400  ILVRL_B2_SH(zero, src5, dst8, dst9);
401  ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
402  SLLI_4V(dst0, dst1, dst2, dst3, 6);
403  SLLI_4V(dst4, dst5, dst6, dst7, 6);
404  SLLI_4V(dst8, dst9, dst10, dst11, 6);
405  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
406  offset_vec, rnd_vec, dst0, dst1, dst2,
407  dst3);
408  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
409  offset_vec, rnd_vec, dst4, dst5, dst6,
410  dst7);
411  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
412  offset_vec, rnd_vec, dst8, dst9, dst10,
413  dst11);
414  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
415  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
416  ST_UB4(out0, out1, out3, out4, dst, dst_stride);
417  ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
418  dst += (4 * dst_stride);
419  }
420 }
421 
423  int32_t src_stride,
424  uint8_t *dst,
425  int32_t dst_stride,
426  int32_t height,
427  int32_t weight,
428  int32_t offset,
429  int32_t rnd_val)
430 {
431  uint32_t loop_cnt;
432  v16u8 out0, out1, out2, out3;
433  v16i8 src0, src1, src2, src3;
434  v16i8 zero = { 0 };
435  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
436  v4i32 weight_vec, rnd_vec;
437 
438  weight = weight & 0x0000FFFF;
439  weight_vec = __msa_fill_w(weight);
440  offset_vec = __msa_fill_h(offset);
441  rnd_vec = __msa_fill_w(rnd_val);
442 
443  for (loop_cnt = (height >> 1); loop_cnt--;) {
444  LD_SB2(src, src_stride, src0, src1);
445  LD_SB2(src + 16, src_stride, src2, src3);
446  src += (2 * src_stride);
447 
448  ILVRL_B2_SH(zero, src0, dst0, dst1);
449  ILVRL_B2_SH(zero, src1, dst2, dst3);
450  ILVRL_B2_SH(zero, src2, dst4, dst5);
451  ILVRL_B2_SH(zero, src3, dst6, dst7);
452  SLLI_4V(dst0, dst1, dst2, dst3, 6);
453  SLLI_4V(dst4, dst5, dst6, dst7, 6);
454  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
455  offset_vec, rnd_vec, dst0, dst1, dst2,
456  dst3);
457  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
458  offset_vec, rnd_vec, dst4, dst5, dst6,
459  dst7);
460  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
461  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
462  ST_UB2(out0, out1, dst, dst_stride);
463  ST_UB2(out2, out3, dst + 16, dst_stride);
464  dst += (2 * dst_stride);
465  }
466 }
467 
469  int32_t src_stride,
470  uint8_t *dst,
471  int32_t dst_stride,
472  int32_t height,
473  int32_t weight,
474  int32_t offset,
475  int32_t rnd_val)
476 {
477  uint32_t loop_cnt;
478  v16u8 out0, out1, out2, out3, out4, out5;
479  v16i8 src0, src1, src2, src3, src4, src5;
480  v16i8 zero = { 0 };
481  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
482  v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
483  v4i32 weight_vec, rnd_vec;
484 
485  weight = weight & 0x0000FFFF;
486  weight_vec = __msa_fill_w(weight);
487  offset_vec = __msa_fill_h(offset);
488  rnd_vec = __msa_fill_w(rnd_val);
489 
490  for (loop_cnt = (height >> 1); loop_cnt--;) {
491  LD_SB3(src, 16, src0, src1, src2);
492  src += src_stride;
493  LD_SB3(src, 16, src3, src4, src5);
494  src += src_stride;
495 
496  ILVRL_B2_SH(zero, src0, dst0, dst1);
497  ILVRL_B2_SH(zero, src1, dst2, dst3);
498  ILVRL_B2_SH(zero, src2, dst4, dst5);
499  ILVRL_B2_SH(zero, src3, dst6, dst7);
500  ILVRL_B2_SH(zero, src4, dst8, dst9);
501  ILVRL_B2_SH(zero, src5, dst10, dst11);
502  SLLI_4V(dst0, dst1, dst2, dst3, 6);
503  SLLI_4V(dst4, dst5, dst6, dst7, 6);
504  SLLI_4V(dst8, dst9, dst10, dst11, 6);
505  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
506  offset_vec, rnd_vec, dst0, dst1, dst2,
507  dst3);
508  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
509  offset_vec, rnd_vec, dst4, dst5, dst6,
510  dst7);
511  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
512  offset_vec, rnd_vec, dst8, dst9, dst10,
513  dst11);
514  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
515  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
516  ST_UB2(out0, out1, dst, 16);
517  ST_UB(out2, dst + 32);
518  dst += dst_stride;
519  ST_UB2(out3, out4, dst, 16);
520  ST_UB(out5, dst + 32);
521  dst += dst_stride;
522  }
523 }
524 
526  int32_t src_stride,
527  uint8_t *dst,
528  int32_t dst_stride,
529  int32_t height,
530  int32_t weight,
531  int32_t offset,
532  int32_t rnd_val)
533 {
534  uint32_t loop_cnt;
535  v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
536  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
537  v16i8 zero = { 0 };
538  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
539  v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
540  v4i32 weight_vec, rnd_vec;
541 
542  weight = weight & 0x0000FFFF;
543  weight_vec = __msa_fill_w(weight);
544  offset_vec = __msa_fill_h(offset);
545  rnd_vec = __msa_fill_w(rnd_val);
546 
547  for (loop_cnt = (height >> 1); loop_cnt--;) {
548  LD_SB4(src, 16, src0, src1, src2, src3);
549  src += src_stride;
550  LD_SB4(src, 16, src4, src5, src6, src7);
551  src += src_stride;
552 
553  ILVRL_B2_SH(zero, src0, dst0, dst1);
554  ILVRL_B2_SH(zero, src1, dst2, dst3);
555  ILVRL_B2_SH(zero, src2, dst4, dst5);
556  ILVRL_B2_SH(zero, src3, dst6, dst7);
557  ILVRL_B2_SH(zero, src4, dst8, dst9);
558  ILVRL_B2_SH(zero, src5, dst10, dst11);
559  ILVRL_B2_SH(zero, src6, dst12, dst13);
560  ILVRL_B2_SH(zero, src7, dst14, dst15);
561  SLLI_4V(dst0, dst1, dst2, dst3, 6);
562  SLLI_4V(dst4, dst5, dst6, dst7, 6);
563  SLLI_4V(dst8, dst9, dst10, dst11, 6);
564  SLLI_4V(dst12, dst13, dst14, dst15, 6);
565  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
566  offset_vec, rnd_vec, dst0, dst1, dst2,
567  dst3);
568  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
569  offset_vec, rnd_vec, dst4, dst5, dst6,
570  dst7);
571  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
572  offset_vec, rnd_vec, dst8, dst9, dst10,
573  dst11);
574  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
575  offset_vec, rnd_vec, dst12, dst13, dst14,
576  dst15);
577  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
578  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
579  PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
580  PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
581  ST_UB4(out0, out1, out2, out3, dst, 16);
582  dst += dst_stride;
583  ST_UB4(out4, out5, out6, out7, dst, 16);
584  dst += dst_stride;
585  }
586 }
587 
589  int32_t src_stride,
590  uint8_t *dst,
591  int32_t dst_stride,
592  const int8_t *filter,
593  int32_t height,
594  int32_t weight,
595  int32_t offset,
596  int32_t rnd_val)
597 {
598  uint32_t loop_cnt;
599  v16u8 out0, out1;
600  v8i16 filt0, filt1, filt2, filt3;
601  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
602  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
603  v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
604  v8i16 filter_vec, dst01, dst23, dst45, dst67;
605  v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
606  v4i32 weight_vec, rnd_vec;
607 
608  src -= 3;
609  weight = weight & 0x0000FFFF;
610 
611  weight_vec = __msa_fill_w(weight);
612  rnd_vec = __msa_fill_w(rnd_val);
613 
614  weight *= 128;
615  rnd_val -= 6;
616 
617  weight_vec_h = __msa_fill_h(weight);
618  offset_vec = __msa_fill_h(offset);
619  denom_vec = __msa_fill_h(rnd_val);
620 
621  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
622  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
623 
624  filter_vec = LD_SH(filter);
625  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
626 
627  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
628  mask1 = mask0 + 2;
629  mask2 = mask0 + 4;
630  mask3 = mask0 + 6;
631 
632  for (loop_cnt = (height >> 3); loop_cnt--;) {
633  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
634  src += (8 * src_stride);
635  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
636 
637  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
638  vec0, vec1, vec2, vec3);
639  VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
640  vec4, vec5, vec6, vec7);
641  VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
642  vec8, vec9, vec10, vec11);
643  VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
644  vec12, vec13, vec14, vec15);
645  dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
646  filt3);
647  dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
648  filt3);
649  dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
650  filt3);
651  dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
652  filt2, filt3);
653 
654  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
655  offset_vec, rnd_vec, dst0, dst1, dst2,
656  dst3);
657 
658  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
659  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
660  dst += (8 * dst_stride);
661  }
662 }
663 
665  int32_t src_stride,
666  uint8_t *dst,
667  int32_t dst_stride,
668  const int8_t *filter,
669  int32_t height,
670  int32_t weight,
671  int32_t offset,
672  int32_t rnd_val)
673 {
674  uint32_t loop_cnt;
675  v16u8 out0, out1;
676  v16i8 src0, src1, src2, src3;
677  v8i16 filt0, filt1, filt2, filt3;
678  v16i8 mask0, mask1, mask2, mask3;
679  v8i16 filter_vec;
680  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
681  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
682  v8i16 dst0, dst1, dst2, dst3;
683  v8i16 weight_vec_h, offset_vec, denom_vec;
684  v4i32 weight_vec, rnd_vec;
685 
686  src -= 3;
687  weight = weight & 0x0000FFFF;
688 
689  weight_vec = __msa_fill_w(weight);
690  rnd_vec = __msa_fill_w(rnd_val);
691 
692  weight *= 128;
693  rnd_val -= 6;
694 
695  weight_vec_h = __msa_fill_h(weight);
696  offset_vec = __msa_fill_h(offset);
697  denom_vec = __msa_fill_h(rnd_val);
698 
699  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
700  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
701 
702  filter_vec = LD_SH(filter);
703  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
704 
705  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
706  mask1 = mask0 + 2;
707  mask2 = mask0 + 4;
708  mask3 = mask0 + 6;
709 
710  for (loop_cnt = (height >> 2); loop_cnt--;) {
711  LD_SB4(src, src_stride, src0, src1, src2, src3);
712  src += (4 * src_stride);
713  XORI_B4_128_SB(src0, src1, src2, src3);
714 
715  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
716  vec0, vec1, vec2, vec3);
717  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
718  vec4, vec5, vec6, vec7);
719  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
720  vec8, vec9, vec10, vec11);
721  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
722  vec12, vec13, vec14, vec15);
723  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
724  filt3);
725  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
726  filt3);
727  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
728  filt3);
729  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
730  filt2, filt3);
731 
732  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
733  offset_vec, rnd_vec, dst0, dst1, dst2,
734  dst3);
735 
736  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
737  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
738  dst += (4 * dst_stride);
739  }
740 }
741 
743  int32_t src_stride,
744  uint8_t *dst,
745  int32_t dst_stride,
746  const int8_t *filter,
747  int32_t height,
748  int32_t weight,
749  int32_t offset,
750  int32_t rnd_val)
751 {
752  uint32_t loop_cnt;
753  v16u8 out0, out1, out2;
754  v8i16 filt0, filt1, filt2, filt3;
755  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
756  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
757  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
758  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
759  v8i16 filter_vec;
760  v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
761  v8i16 weight_vec_h, offset_vec, denom_vec;
762  v4i32 weight_vec, rnd_vec;
763 
764  src -= 3;
765  weight = weight & 0x0000FFFF;
766 
767  weight_vec = __msa_fill_w(weight);
768  rnd_vec = __msa_fill_w(rnd_val);
769 
770  weight *= 128;
771  rnd_val -= 6;
772 
773  weight_vec_h = __msa_fill_h(weight);
774  offset_vec = __msa_fill_h(offset);
775  denom_vec = __msa_fill_h(rnd_val);
776 
777  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
778  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
779 
780  filter_vec = LD_SH(filter);
781  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
782 
783  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
784  mask1 = mask0 + 2;
785  mask2 = mask0 + 4;
786  mask3 = mask0 + 6;
787  mask4 = LD_SB(&ff_hevc_mask_arr[16]);
788  mask5 = mask4 + 2;
789  mask6 = mask4 + 4;
790  mask7 = mask4 + 6;
791 
792  for (loop_cnt = (height >> 2); loop_cnt--;) {
793  LD_SB4(src, src_stride, src0, src1, src2, src3);
794  LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
795  src += (4 * src_stride);
796  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
797 
798  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
799  vec0, vec1, vec2, vec3);
800  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
801  vec4, vec5, vec6, vec7);
802  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
803  vec8, vec9, vec10, vec11);
804  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
805  vec12, vec13, vec14, vec15);
806  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
807  filt3);
808  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
809  filt3);
810  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
811  filt3);
812  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
813  filt2, filt3);
814  VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
815  vec0, vec1, vec2, vec3);
816  VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
817  vec4, vec5, vec6, vec7);
818  dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
819  filt3);
820  dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
821  filt3);
822 
823  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
824  offset_vec, rnd_vec, dst0, dst1, dst2,
825  dst3);
826  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
827  rnd_vec, dst4, dst5);
828 
829  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
830  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
831  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
832  dst += (4 * dst_stride);
833  }
834 }
835 
837  int32_t src_stride,
838  uint8_t *dst,
839  int32_t dst_stride,
840  const int8_t *filter,
841  int32_t height,
842  int32_t weight,
843  int32_t offset,
844  int32_t rnd_val)
845 {
846  uint32_t loop_cnt;
847  v16u8 out0, out1;
848  v16i8 src0, src1, src2, src3;
849  v8i16 filt0, filt1, filt2, filt3;
850  v16i8 mask0, mask1, mask2, mask3;
851  v8i16 filter_vec;
852  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
853  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
854  v8i16 dst0, dst1, dst2, dst3;
855  v8i16 weight_vec_h, offset_vec, denom_vec;
856  v4i32 weight_vec, rnd_vec;
857 
858  src -= 3;
859 
860  weight_vec = __msa_fill_w(weight);
861  rnd_vec = __msa_fill_w(rnd_val);
862 
863  weight *= 128;
864  rnd_val -= 6;
865 
866  weight_vec_h = __msa_fill_h(weight);
867  offset_vec = __msa_fill_h(offset);
868  denom_vec = __msa_fill_h(rnd_val);
869 
870  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
871  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
872 
873  filter_vec = LD_SH(filter);
874  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
875 
876  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
877  mask1 = mask0 + 2;
878  mask2 = mask0 + 4;
879  mask3 = mask0 + 6;
880 
881  for (loop_cnt = (height >> 1); loop_cnt--;) {
882  LD_SB2(src, src_stride, src0, src2);
883  LD_SB2(src + 8, src_stride, src1, src3);
884  src += (2 * src_stride);
885  XORI_B4_128_SB(src0, src1, src2, src3);
886 
887  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
888  vec0, vec1, vec2, vec3);
889  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
890  vec4, vec5, vec6, vec7);
891  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
892  vec8, vec9, vec10, vec11);
893  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
894  vec12, vec13, vec14, vec15);
895  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
896  filt3);
897  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
898  filt3);
899  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
900  filt3);
901  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
902  filt2, filt3);
903 
904  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
905  offset_vec, rnd_vec, dst0, dst1, dst2,
906  dst3);
907 
908  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
909  ST_UB2(out0, out1, dst, dst_stride);
910  dst += (2 * dst_stride);
911  }
912 }
913 
/* Horizontal 8-tap uni-weighted HEVC MC kernel, 24 pixels wide: each
 * iteration filters two rows and stores 16 + 8 bytes per row.
 * NOTE(review): the opening signature line was lost in extraction;
 * from the store pattern this is presumably hevc_hz_uniwgt_8t_24w_msa
 * — confirm against the original file.
 * NOTE(review): 'height' is never read; the loop below runs a fixed
 * 16 iterations (32 rows) — verify callers only use height 32. */
915  int32_t src_stride,
916  uint8_t *dst,
917  int32_t dst_stride,
918  const int8_t *filter,
919  int32_t height,
920  int32_t weight,
921  int32_t offset,
922  int32_t rnd_val)
923 {
924  uint32_t loop_cnt;
925  v16u8 out0, out1, out2;
926  v16i8 src0, src1, src2, src3;
927  v8i16 filt0, filt1, filt2, filt3;
928  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
929  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
930  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
931  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
932  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
933  v4i32 weight_vec, rnd_vec;
934 
     /* back up 3 columns so the 8-tap window is centered on the output */
935  src -= 3;
936 
937  weight_vec = __msa_fill_w(weight);
938  rnd_vec = __msa_fill_w(rnd_val);
939 
     /* fold the source bias into the offset: sources are XORed with 128
      * below, so offset_vec gains (128 * weight) >> (rnd_val - 6) */
940  weight *= 128;
941  rnd_val -= 6;
942 
943  weight_vec_h = __msa_fill_h(weight);
944  offset_vec = __msa_fill_h(offset);
945  denom_vec = __msa_fill_h(rnd_val);
946 
947  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
948  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
949 
950  filter_vec = LD_SH(filter);
951  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
952 
     /* sliding byte-pair shuffle masks; mask4..7 straddle the 16-byte
      * boundary between the two loaded source vectors */
953  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
954  mask1 = mask0 + 2;
955  mask2 = mask0 + 4;
956  mask3 = mask0 + 6;
957  mask4 = mask0 + 8;
958  mask5 = mask0 + 10;
959  mask6 = mask0 + 12;
960  mask7 = mask0 + 14;
961 
962  for (loop_cnt = 16; loop_cnt--;) {
963  LD_SB2(src, 16, src0, src1);
964  src += src_stride;
965  LD_SB2(src, 16, src2, src3);
966  src += src_stride;
967  XORI_B4_128_SB(src0, src1, src2, src3);
     /* row 0: columns 0-7, 8-15, 16-23; row 1: columns 0-7 */
968  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
969  vec0, vec1, vec2, vec3);
970  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
971  vec4, vec5, vec6, vec7);
972  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
973  vec8, vec9, vec10, vec11);
974  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
975  vec12, vec13, vec14, vec15);
976  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
977  filt3);
978  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
979  filt3);
980  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
981  filt3);
982  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
983  filt2, filt3);
984 
     /* row 1: columns 8-15 and 16-23 */
985  VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
986  vec0, vec1, vec2, vec3);
987  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
988  vec4, vec5, vec6, vec7);
989  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
990  filt3);
991  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
992  filt3);
993 
994  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
995  offset_vec, rnd_vec, dst0, dst1, dst2,
996  dst3);
997  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
998  rnd_vec, dst4, dst5);
999 
     /* 16 bytes per row via ST_UB2 plus the trailing 8 bytes via ST_D2 */
1000  PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
1001  ST_UB2(out0, out1, dst, dst_stride);
1002  ST_D2(out2, 0, 1, dst + 16, dst_stride);
1003  dst += (2 * dst_stride);
1004  }
1005 }
1006 
/* Horizontal 8-tap uni-weighted HEVC MC kernel, 32 pixels wide: each
 * iteration filters two rows, loading four overlapping 16-byte vectors
 * (8 bytes apart) per row so every 8-column group has its taps in one
 * vector.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_hz_uniwgt_8t_32w_msa — confirm in the original. */
1008  int32_t src_stride,
1009  uint8_t *dst,
1010  int32_t dst_stride,
1011  const int8_t *filter,
1012  int32_t height,
1013  int32_t weight,
1014  int32_t offset,
1015  int32_t rnd_val)
1016 {
1017  uint32_t loop_cnt;
1018  v16u8 out0, out1, out2, out3;
1019  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1020  v8i16 filt0, filt1, filt2, filt3;
1021  v16i8 mask0, mask1, mask2, mask3;
1022  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1023  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1024  v8i16 filter_vec;
1025  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1026  v8i16 weight_vec_h, offset_vec, denom_vec;
1027  v4i32 weight_vec, rnd_vec;
1028 
      /* back up 3 columns for the 8-tap window */
1029  src -= 3;
1030 
1031  weight_vec = __msa_fill_w(weight);
1032  rnd_vec = __msa_fill_w(rnd_val);
1033 
      /* fold the XORI-by-128 source bias into offset_vec:
       * offset += (128 * weight) >> (rnd_val - 6) */
1034  weight *= 128;
1035  rnd_val -= 6;
1036 
1037  weight_vec_h = __msa_fill_h(weight);
1038  offset_vec = __msa_fill_h(offset);
1039  denom_vec = __msa_fill_h(rnd_val);
1040 
1041  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1042  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1043 
1044  filter_vec = LD_SH(filter);
1045  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1046 
1047  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1048  mask1 = mask0 + 2;
1049  mask2 = mask0 + 4;
1050  mask3 = mask0 + 6;
1051 
      /* two rows per iteration */
1052  for (loop_cnt = height >> 1; loop_cnt--;) {
1053  LD_SB4(src, 8, src0, src1, src2, src3);
1054  src += src_stride;
1055  LD_SB4(src, 8, src4, src5, src6, src7);
1056  src += src_stride;
1057  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1058 
1059  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1060  vec0, vec1, vec2, vec3);
1061  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1062  vec4, vec5, vec6, vec7);
1063  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1064  vec8, vec9, vec10, vec11);
1065  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1066  vec12, vec13, vec14, vec15);
1067  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1068  filt3);
1069  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1070  filt3);
1071  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1072  filt3);
1073  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1074  filt2, filt3);
1075 
1076  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1077  vec0, vec1, vec2, vec3);
1078  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1079  vec4, vec5, vec6, vec7);
1080  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1081  vec8, vec9, vec10, vec11);
1082  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1083  vec12, vec13, vec14, vec15);
1084  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1085  filt3);
1086  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1087  filt3);
1088  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1089  filt3);
1090  dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1091  filt2, filt3);
1092 
1093  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1094  offset_vec, rnd_vec, dst0, dst1, dst2,
1095  dst3);
1096  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1097  offset_vec, rnd_vec, dst4, dst5, dst6,
1098  dst7);
1099 
      /* 32 bytes (two 16-byte stores) per row */
1100  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1101  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1102  ST_UB2(out0, out1, dst, 16);
1103  dst += dst_stride;
1104  ST_UB2(out2, out3, dst, 16);
1105  dst += dst_stride;
1106  }
1107 }
1108 
/* Horizontal 8-tap uni-weighted HEVC MC kernel, 48 pixels wide: one row
 * per iteration, stored as 16 + 16 + 16 bytes.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_hz_uniwgt_8t_48w_msa — confirm in the original.
 * NOTE(review): 'height' is never read; the loop runs a fixed 64
 * iterations (64 rows) — verify callers only use height 64. */
1110  int32_t src_stride,
1111  uint8_t *dst,
1112  int32_t dst_stride,
1113  const int8_t *filter,
1114  int32_t height,
1115  int32_t weight,
1116  int32_t offset,
1117  int32_t rnd_val)
1118 {
1119  uint32_t loop_cnt;
1120  v16u8 out0, out1, out2;
1121  v16i8 src0, src1, src2, src3;
1122  v8i16 filt0, filt1, filt2, filt3;
1123  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1124  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1125  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1126  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1127  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1128  v4i32 weight_vec, rnd_vec;
1129 
      /* back up 3 columns for the 8-tap window */
1130  src -= 3;
1131 
      /* NOTE(review): only this width and 12w mask off the upper bits of
       * weight before splatting — presumably to keep the later
       * weight *= 128 within 16-bit range; confirm intent. */
1132  weight = weight & 0x0000FFFF;
1133  weight_vec = __msa_fill_w(weight);
1134  rnd_vec = __msa_fill_w(rnd_val);
1135 
      /* fold the XORI-by-128 source bias into offset_vec */
1136  weight *= 128;
1137  rnd_val -= 6;
1138 
1139  weight_vec_h = __msa_fill_h(weight);
1140  offset_vec = __msa_fill_h(offset);
1141  denom_vec = __msa_fill_h(rnd_val);
1142 
1143  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1144  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1145 
1146  filter_vec = LD_SH(filter);
1147  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1148 
1149  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1150  mask1 = mask0 + 2;
1151  mask2 = mask0 + 4;
1152  mask3 = mask0 + 6;
1153  mask4 = mask0 + 8;
1154  mask5 = mask0 + 10;
1155  mask6 = mask0 + 12;
1156  mask7 = mask0 + 14;
1157 
1158  for (loop_cnt = 64; loop_cnt--;) {
      /* 48 outputs need bytes 0..50: three 16-byte loads plus one at +40 */
1159  LD_SB3(src, 16, src0, src1, src2);
1160  src3 = LD_SB(src + 40);
1161  src += src_stride;
1162  XORI_B4_128_SB(src0, src1, src2, src3);
1163 
1164  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1165  vec0, vec1, vec2, vec3);
1166  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1167  vec4, vec5, vec6, vec7);
1168  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1169  vec8, vec9, vec10, vec11);
1170  VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1171  vec12, vec13, vec14, vec15);
1172  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1173  filt3);
1174  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1175  filt3);
1176  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1177  filt3);
1178  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1179  filt2, filt3);
1180 
1181  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1182  vec0, vec1, vec2, vec3);
1183  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1184  vec4, vec5, vec6, vec7);
1185  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1186  filt3);
1187  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1188  filt3);
1189 
1190  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1191  offset_vec, rnd_vec, dst0, dst1, dst2,
1192  dst3);
1193  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1194  rnd_vec, dst4, dst5);
1195 
1196  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1197  ST_UB2(out0, out1, dst, 16);
1198  ST_UB(out2, dst + 32);
1199  dst += dst_stride;
1200  }
1201 }
1202 
/* Horizontal 8-tap uni-weighted HEVC MC kernel, 64 pixels wide: one row
 * per iteration, processed as two 32-pixel halves (inner cnt loop).
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_hz_uniwgt_8t_64w_msa — confirm in the original. */
1204  int32_t src_stride,
1205  uint8_t *dst,
1206  int32_t dst_stride,
1207  const int8_t *filter,
1208  int32_t height,
1209  int32_t weight,
1210  int32_t offset,
1211  int32_t rnd_val)
1212 {
1213  uint8_t *src_tmp;
1214  uint8_t *dst_tmp;
1215  uint32_t loop_cnt, cnt;
1216  v16u8 out0, out1;
1217  v16i8 src0, src1, src2;
1218  v8i16 filt0, filt1, filt2, filt3;
1219  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1220  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1221  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1222  v8i16 dst0, dst1, dst2, dst3;
1223  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1224  v4i32 weight_vec, rnd_vec;
1225 
      /* back up 3 columns for the 8-tap window */
1226  src -= 3;
1227 
1228  weight_vec = __msa_fill_w(weight);
1229  rnd_vec = __msa_fill_w(rnd_val);
1230 
      /* fold the XORI-by-128 source bias into offset_vec:
       * offset += (128 * weight) >> (rnd_val - 6) */
1231  weight *= 128;
1232  rnd_val -= 6;
1233 
1234  weight_vec_h = __msa_fill_h(weight);
1235  offset_vec = __msa_fill_h(offset);
1236  denom_vec = __msa_fill_h(rnd_val);
1237 
1238  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1239  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1240 
1241  filter_vec = LD_SH(filter);
1242  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1243 
1244  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1245  mask1 = mask0 + 2;
1246  mask2 = mask0 + 4;
1247  mask3 = mask0 + 6;
1248  mask4 = mask0 + 8;
1249  mask5 = mask0 + 10;
1250  mask6 = mask0 + 12;
1251  mask7 = mask0 + 14;
1252 
1253  for (loop_cnt = height; loop_cnt--;) {
1254  src_tmp = src;
1255  dst_tmp = dst;
1256 
      /* two 32-pixel halves make up the 64-wide row */
1257  for (cnt = 2; cnt--;) {
1258  LD_SB2(src_tmp, 16, src0, src1);
1259  src2 = LD_SB(src_tmp + 24);
1260  src_tmp += 32;
1261  XORI_B3_128_SB(src0, src1, src2);
1262 
1263  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1264  vec0, vec1, vec2, vec3);
1265  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1266  vec4, vec5, vec6, vec7);
1267  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1268  vec8, vec9, vec10, vec11);
1269  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1270  vec12, vec13, vec14, vec15);
1271  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1272  filt2, filt3);
1273  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
1274  filt2, filt3);
1275  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1276  filt2, filt3);
1277  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1278  filt2, filt3);
1279 
1280  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1281  offset_vec, rnd_vec, dst0, dst1,
1282  dst2, dst3);
1283 
1284  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1285  ST_UB2(out0, out1, dst_tmp, 16);
1286  dst_tmp += 32;
1287  }
1288 
1289  src += src_stride;
1290  dst += dst_stride;
1291  }
1292 }
1293 
/* Vertical 8-tap uni-weighted HEVC MC kernel, 4 pixels wide: eight rows
 * per iteration.  Pairs of 4-wide rows are interleaved and packed into
 * one vector (src2110 etc.) so each 8-tap filter call produces two rows.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_4w_msa — confirm in the original. */
1295  int32_t src_stride,
1296  uint8_t *dst,
1297  int32_t dst_stride,
1298  const int8_t *filter,
1299  int32_t height,
1300  int32_t weight,
1301  int32_t offset,
1302  int32_t rnd_val)
1303 {
1304  int32_t loop_cnt;
1305  v16u8 out0, out1;
1306  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1307  v16i8 src9, src10, src11, src12, src13, src14;
1308  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1309  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1310  v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1311  v16i8 src2110, src4332, src6554, src8776, src10998;
1312  v16i8 src12111110, src14131312;
1313  v8i16 filter_vec, dst01, dst23, dst45, dst67;
1314  v8i16 filt0, filt1, filt2, filt3;
1315  v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1316  v4i32 weight_vec, rnd_vec;
1317 
      /* back up 3 rows for the vertical 8-tap window */
1318  src -= (3 * src_stride);
1319 
1320 
1321  weight_vec = __msa_fill_w(weight);
1322  rnd_vec = __msa_fill_w(rnd_val);
1323 
      /* fold the XORI-by-128 source bias into offset_vec */
1324  weight *= 128;
1325  rnd_val -= 6;
1326 
1327  weight_vec_h = __msa_fill_h(weight);
1328  offset_vec = __msa_fill_h(offset);
1329  denom_vec = __msa_fill_h(rnd_val);
1330 
1331  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1332  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1333 
1334  filter_vec = LD_SH(filter);
1335  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1336 
      /* prologue: load the first 7 rows of vertical history */
1337  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1338  src += (7 * src_stride);
1339 
1340  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1341  src10_r, src32_r, src54_r, src21_r);
1342 
1343  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1344 
1345  ILVR_D3_SB(src21_r, src10_r, src43_r,
1346  src32_r, src65_r, src54_r, src2110, src4332, src6554);
1347 
1348  XORI_B3_128_SB(src2110, src4332, src6554);
1349 
1350  for (loop_cnt = (height >> 3); loop_cnt--;) {
1351  LD_SB8(src, src_stride,
1352  src7, src8, src9, src10, src11, src12, src13, src14);
1353  src += (8 * src_stride);
1354  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1355  src76_r, src87_r, src98_r, src109_r);
1356  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1357  src1110_r, src1211_r, src1312_r, src1413_r);
1358  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1359  src1413_r, src1312_r,
1360  src8776, src10998, src12111110, src14131312);
1361  XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
      /* each call filters two 4-wide rows at once */
1362  dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1363  filt1, filt2, filt3);
1364  dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1365  filt1, filt2, filt3);
1366  dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
1367  filt0, filt1, filt2, filt3);
1368  dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
1369  filt0, filt1, filt2, filt3);
1370 
1371  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
1372  offset_vec, rnd_vec, dst0, dst1, dst2,
1373  dst3);
1374 
1375  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1376  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1377  dst += (8 * dst_stride);
1378 
      /* slide the 8-row history window forward */
1379  src2110 = src10998;
1380  src4332 = src12111110;
1381  src6554 = src14131312;
1382  src6 = src14;
1383  }
1384 }
1385 
/* Vertical 8-tap uni-weighted HEVC MC kernel, 8 pixels wide: four rows
 * per iteration using right-interleaved (low-half) byte pairs only.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_8w_msa (a function of that name is
 * called by the 24w wrapper below) — confirm in the original. */
1387  int32_t src_stride,
1388  uint8_t *dst,
1389  int32_t dst_stride,
1390  const int8_t *filter,
1391  int32_t height,
1392  int32_t weight,
1393  int32_t offset,
1394  int32_t rnd_val)
1395 {
1396  int32_t loop_cnt;
1397  v16u8 out0, out1;
1398  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1399  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1400  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1401  v8i16 filt0, filt1, filt2, filt3;
1402  v8i16 filter_vec;
1403  v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1404  v4i32 weight_vec, rnd_vec;
1405 
      /* back up 3 rows for the vertical 8-tap window */
1406  src -= (3 * src_stride);
1407 
1408  weight_vec = __msa_fill_w(weight);
1409  rnd_vec = __msa_fill_w(rnd_val);
1410 
      /* fold the XORI-by-128 source bias into offset_vec */
1411  weight *= 128;
1412  rnd_val -= 6;
1413 
1414  weight_vec_h = __msa_fill_h(weight);
1415  offset_vec = __msa_fill_h(offset);
1416  denom_vec = __msa_fill_h(rnd_val);
1417 
1418  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1419  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1420 
1421  filter_vec = LD_SH(filter);
1422  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1423 
      /* prologue: first 7 rows of history, interleaved as byte pairs */
1424  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1425  src += (7 * src_stride);
1426  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1427 
1428  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1429  src10_r, src32_r, src54_r, src21_r);
1430  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1431 
1432  for (loop_cnt = (height >> 2); loop_cnt--;) {
1433  LD_SB4(src, src_stride, src7, src8, src9, src10);
1434  src += (4 * src_stride);
1435  XORI_B4_128_SB(src7, src8, src9, src10);
1436  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1437  src76_r, src87_r, src98_r, src109_r);
1438  dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1439  filt1, filt2, filt3);
1440  dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1441  filt1, filt2, filt3);
1442  dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1443  filt1, filt2, filt3);
1444  dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1445  filt1, filt2, filt3);
1446 
1447  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1448  offset_vec, rnd_vec, dst0, dst1, dst2,
1449  dst3);
1450 
1451  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1452  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1453  dst += (4 * dst_stride);
1454 
      /* slide the vertical history window forward by four rows */
1455  src10_r = src54_r;
1456  src32_r = src76_r;
1457  src54_r = src98_r;
1458  src21_r = src65_r;
1459  src43_r = src87_r;
1460  src65_r = src109_r;
1461  src6 = src10;
1462  }
1463 }
1464 
/* Vertical 8-tap uni-weighted HEVC MC kernel, 12 pixels wide: four rows
 * per iteration; columns 0-7 use the right-interleaved pairs, columns
 * 8-11 come from the left-interleaved halves packed two rows per vector.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_12w_msa — confirm in the original.
 * NOTE(review): 'height' is never read; the loop runs a fixed 4
 * iterations (16 rows) — verify callers only use height 16. */
1466  int32_t src_stride,
1467  uint8_t *dst,
1468  int32_t dst_stride,
1469  const int8_t *filter,
1470  int32_t height,
1471  int32_t weight,
1472  int32_t offset,
1473  int32_t rnd_val)
1474 {
1475  int32_t loop_cnt;
1476  v16u8 out0, out1, out2;
1477  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1478  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1479  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1480  v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1481  v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1482  v16i8 src2110, src4332, src6554, src8776, src10998;
1483  v8i16 filt0, filt1, filt2, filt3;
1484  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1485  v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
1486  v4i32 weight_vec, rnd_vec;
1487 
      /* back up 3 rows for the vertical 8-tap window */
1488  src -= (3 * src_stride);
1489 
      /* NOTE(review): only this width and 48w mask off the upper bits of
       * weight before splatting — presumably to keep the later
       * weight *= 128 within 16-bit range; confirm intent. */
1490  weight = weight & 0x0000FFFF;
1491  weight_vec = __msa_fill_w(weight);
1492  rnd_vec = __msa_fill_w(rnd_val);
1493 
      /* fold the XORI-by-128 source bias into offset_vec */
1494  weight *= 128;
1495  rnd_val -= 6;
1496 
1497  weight_vec_h = __msa_fill_h(weight);
1498  offset_vec = __msa_fill_h(offset);
1499  denom_vec = __msa_fill_h(rnd_val);
1500 
1501  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1502  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1503 
1504  filter_vec = LD_SH(filter);
1505  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1506 
      /* prologue: 7 rows of history, both low (r) and high (l) halves */
1507  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1508  src += (7 * src_stride);
1509  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1510 
1511  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1512  src10_r, src32_r, src54_r, src21_r);
1513  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1514  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1515  src10_l, src32_l, src54_l, src21_l);
1516  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1517  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1518  src2110, src4332, src6554);
1519 
1520  for (loop_cnt = 4; loop_cnt--;) {
1521  LD_SB4(src, src_stride, src7, src8, src9, src10);
1522  src += (4 * src_stride);
1523  XORI_B4_128_SB(src7, src8, src9, src10);
1524 
1525  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1526  src76_r, src87_r, src98_r, src109_r);
1527  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1528  src76_l, src87_l, src98_l, src109_l);
1529  ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1530 
      /* dst0..3: columns 0-7; dst4..5: columns 8-11, two rows each */
1531  dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1532  filt1, filt2, filt3);
1533  dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1534  filt1, filt2, filt3);
1535  dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1536  filt1, filt2, filt3);
1537  dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1538  filt1, filt2, filt3);
1539  dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1540  filt1, filt2, filt3);
1541  dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1542  filt1, filt2, filt3);
1543 
1544  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1545  offset_vec, rnd_vec, dst0, dst1, dst2,
1546  dst3);
1547  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1548  rnd_vec, dst4, dst5);
1549 
      /* 8 bytes per row via ST_D4 plus the trailing 4 bytes via ST_W4 */
1550  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1551  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1552  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
1553  dst += (4 * dst_stride);
1554 
      /* slide the vertical history window forward by four rows */
1555  src10_r = src54_r;
1556  src32_r = src76_r;
1557  src54_r = src98_r;
1558  src21_r = src65_r;
1559  src43_r = src87_r;
1560  src65_r = src109_r;
1561  src2110 = src6554;
1562  src4332 = src8776;
1563  src6554 = src10998;
1564  src6 = src10;
1565  }
1566 }
1567 
/* Vertical 8-tap uni-weighted HEVC MC kernel for widths that are a
 * multiple of 16: processes weightmul16 column-strips of 16 pixels,
 * four rows per inner iteration (name confirmed by the wrapper calls
 * below; the opening signature line was lost in extraction).
 * weightmul16 = width / 16. */
1569  int32_t src_stride,
1570  uint8_t *dst,
1571  int32_t dst_stride,
1572  const int8_t *filter,
1573  int32_t height,
1574  int32_t weight,
1575  int32_t offset,
1576  int32_t rnd_val,
1577  int32_t weightmul16)
1578 {
1579  uint8_t *src_tmp;
1580  uint8_t *dst_tmp;
1581  int32_t loop_cnt, cnt;
1582  v16u8 out0, out1, out2, out3;
1583  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1584  v16i8 src10_r, src32_r, src54_r, src76_r;
1585  v16i8 src21_r, src43_r, src65_r, src87_r;
1586  v16i8 src10_l, src32_l, src54_l, src76_l;
1587  v16i8 src21_l, src43_l, src65_l, src87_l;
1588  v16i8 src98_r, src109_r, src98_l, src109_l;
1589  v8i16 filt0, filt1, filt2, filt3;
1590  v8i16 filter_vec;
1591  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1592  v8i16 weight_vec_h, offset_vec, denom_vec;
1593  v4i32 weight_vec, rnd_vec;
1594 
      /* back up 3 rows for the vertical 8-tap window */
1595  src -= (3 * src_stride);
1596 
1597  weight_vec = __msa_fill_w(weight);
1598  rnd_vec = __msa_fill_w(rnd_val);
1599 
      /* fold the XORI-by-128 source bias into offset_vec */
1600  weight *= 128;
1601  rnd_val -= 6;
1602 
1603  weight_vec_h = __msa_fill_h(weight);
1604  offset_vec = __msa_fill_h(offset);
1605  denom_vec = __msa_fill_h(rnd_val);
1606 
1607  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1608  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1609 
1610  filter_vec = LD_SH(filter);
1611  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1612 
      /* one pass per 16-pixel-wide column strip */
1613  for (cnt = weightmul16; cnt--;) {
1614  src_tmp = src;
1615  dst_tmp = dst;
1616 
1617  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1618  src_tmp += (7 * src_stride);
1619  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1620 
1621  for (loop_cnt = (height >> 2); loop_cnt--;) {
1622  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1623  src_tmp += (4 * src_stride);
1624  XORI_B4_128_SB(src7, src8, src9, src10);
1625 
      /* build low (r) and high (l) byte-pair interleaves of the history */
1626  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1627  src10_r, src32_r, src54_r, src21_r);
1628  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1629  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1630  src10_l, src32_l, src54_l, src21_l);
1631  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1632  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1633  src76_r, src87_r, src98_r, src109_r);
1634  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1635  src76_l, src87_l, src98_l, src109_l);
1636 
      /* four rows, each split into low (r) and high (l) 8-pixel halves */
1637  dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1638  filt1, filt2, filt3);
1639  dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1640  filt1, filt2, filt3);
1641  dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1642  filt1, filt2, filt3);
1643  dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1644  filt1, filt2, filt3);
1645  dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1646  filt1, filt2, filt3);
1647  dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1648  filt1, filt2, filt3);
1649  dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1650  filt1, filt2, filt3);
1651  dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1652  filt1, filt2, filt3);
1653 
1654  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1655  offset_vec, rnd_vec, dst0, dst1,
1656  dst2, dst3);
1657  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1658  offset_vec, rnd_vec, dst4, dst5,
1659  dst6, dst7);
1660  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1661  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1662  ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1663  dst_tmp += (4 * dst_stride);
1664 
      /* slide the 7-row history window forward by four rows */
1665  src0 = src4;
1666  src1 = src5;
1667  src2 = src6;
1668  src3 = src7;
1669  src4 = src8;
1670  src5 = src9;
1671  src6 = src10;
1672  }
1673 
      /* advance to the next 16-pixel column strip */
1674  src += 16;
1675  dst += 16;
1676  }
1677 }
1678 
/* 16-pixel-wide vertical 8-tap uni-weighted MC: one 16-wide strip.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_16w_msa — confirm in the original. */
1680  int32_t src_stride,
1681  uint8_t *dst,
1682  int32_t dst_stride,
1683  const int8_t *filter,
1684  int32_t height,
1685  int32_t weight,
1686  int32_t offset,
1687  int32_t rnd_val)
1688 {
1689  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1690  filter, height, weight,
1691  offset, rnd_val, 1);
1692 }
1693 
/* 24-pixel-wide vertical 8-tap uni-weighted MC: a 16-wide strip plus an
 * 8-wide strip at column 16.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_24w_msa — confirm in the original.
 * NOTE(review): 'height' is ignored and 32 is passed to both helpers —
 * verify callers only reach this path with height 32. */
1695  int32_t src_stride,
1696  uint8_t *dst,
1697  int32_t dst_stride,
1698  const int8_t *filter,
1699  int32_t height,
1700  int32_t weight,
1701  int32_t offset,
1702  int32_t rnd_val)
1703 {
1704  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1705  filter, 32, weight,
1706  offset, rnd_val, 1);
1707 
1708  hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1709  filter, 32, weight, offset, rnd_val);
1710 }
1711 
/* 32-pixel-wide vertical 8-tap uni-weighted MC: two 16-wide strips.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_32w_msa — confirm in the original. */
1713  int32_t src_stride,
1714  uint8_t *dst,
1715  int32_t dst_stride,
1716  const int8_t *filter,
1717  int32_t height,
1718  int32_t weight,
1719  int32_t offset,
1720  int32_t rnd_val)
1721 {
1722  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1723  filter, height, weight,
1724  offset, rnd_val, 2);
1725 }
1726 
/* 48-pixel-wide vertical 8-tap uni-weighted MC: three 16-wide strips.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_48w_msa — confirm in the original.
 * NOTE(review): 'height' is ignored and 64 is passed to the helper —
 * verify callers only reach this path with height 64. */
1728  int32_t src_stride,
1729  uint8_t *dst,
1730  int32_t dst_stride,
1731  const int8_t *filter,
1732  int32_t height,
1733  int32_t weight,
1734  int32_t offset,
1735  int32_t rnd_val)
1736 {
1737  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1738  filter, 64, weight,
1739  offset, rnd_val, 3);
1740 }
1741 
/* 64-pixel-wide vertical 8-tap uni-weighted MC: four 16-wide strips.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_vt_uniwgt_8t_64w_msa — confirm in the original. */
1743  int32_t src_stride,
1744  uint8_t *dst,
1745  int32_t dst_stride,
1746  const int8_t *filter,
1747  int32_t height,
1748  int32_t weight,
1749  int32_t offset,
1750  int32_t rnd_val)
1751 {
1752  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1753  filter, height, weight,
1754  offset, rnd_val, 4);
1755 }
1756 
/* Combined horizontal+vertical (hv) 8-tap uni-weighted HEVC MC kernel,
 * 4 pixels wide: horizontal pass produces 16-bit intermediates for pairs
 * of rows (dst30 holds rows 3 and 0, etc.), then the vertical pass runs
 * on interleaved intermediates in 32-bit precision.  Four rows per
 * iteration.
 * NOTE(review): the opening signature line was lost in extraction;
 * presumably hevc_hv_uniwgt_8t_4w_msa — confirm in the original. */
1758  int32_t src_stride,
1759  uint8_t *dst,
1760  int32_t dst_stride,
1761  const int8_t *filter_x,
1762  const int8_t *filter_y,
1763  int32_t height,
1764  int32_t weight,
1765  int32_t offset,
1766  int32_t rnd_val)
1767 {
1768  uint32_t loop_cnt;
1769  v16u8 out;
1770  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1771  v8i16 filt0, filt1, filt2, filt3;
1772  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1773  v16i8 mask1, mask2, mask3;
1774  v8i16 filter_vec;
1775  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1776  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1777  v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1778  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1779  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1780  v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1781  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
      /* second half of the mask table: 4-width shuffle patterns */
1782  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1783 
      /* back up 3 rows and 3 columns for the two 8-tap windows */
1784  src -= ((3 * src_stride) + 3);
1785  filter_vec = LD_SH(filter_x);
1786  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1787 
1788  filter_vec = LD_SH(filter_y);
1789  UNPCK_R_SB_SH(filter_vec, filter_vec);
1790 
1791  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1792 
1793  mask1 = mask0 + 2;
1794  mask2 = mask0 + 4;
1795  mask3 = mask0 + 6;
1796 
1797  weight_vec = __msa_fill_w(weight);
1798  offset_vec = __msa_fill_w(offset);
1799  rnd_vec = __msa_fill_w(rnd_val);
1800  denom_vec = rnd_vec - 6;
1801 
      /* fold the XORI-by-128 source bias into offset_vec:
       * offset += (128 * weight) >> (rnd_val - 6), in 32-bit lanes here */
1802  const_128 = __msa_ldi_w(128);
1803  const_128 *= weight_vec;
1804  offset_vec += __msa_srar_w(const_128, denom_vec);
1805 
1806  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1807  src += (7 * src_stride);
1808  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1809 
1810  /* row 0 row 1 row 2 row 3 */
      /* horizontal pass on row pairs: each dstNM holds rows N and M */
1811  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1812  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1813  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1814  vec8, vec9, vec10, vec11);
1815  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1816  vec12, vec13, vec14, vec15);
1817  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1818  filt3);
1819  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1820  filt3);
1821  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1822  filt3);
1823  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1824  filt3);
1825 
      /* interleave intermediates into vertical-pair history vectors */
1826  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1827  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1828  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1829 
1830  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1831 
1832  for (loop_cnt = height >> 2; loop_cnt--;) {
1833  LD_SB4(src, src_stride, src7, src8, src9, src10);
1834  src += (4 * src_stride);
1835  XORI_B4_128_SB(src7, src8, src9, src10);
1836 
1837  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1838  vec0, vec1, vec2, vec3);
1839  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1840  vec4, vec5, vec6, vec7);
1841  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1842  filt3);
1843  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1844  filt3);
1845 
1846  dst76_r = __msa_ilvr_h(dst97, dst66);
1847  ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1848  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1849  dst98_r = __msa_ilvr_h(dst66, dst108);
1850 
      /* vertical pass in 32-bit lanes */
1851  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1852  filt_h1, filt_h2, filt_h3);
1853  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1854  filt_h1, filt_h2, filt_h3);
1855  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1856  filt_h1, filt_h2, filt_h3);
1857  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1858  filt_h1, filt_h2, filt_h3);
1859 
      /* >>6 intermediate scaling, then weight, round, offset, clip */
1860  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1861  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1862  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
1863  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
1864  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1865  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
1866  CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
1867  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1868  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
1869  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1870  dst += (4 * dst_stride);
1871 
      /* slide the vertical intermediate history forward by four rows */
1872  dst10_r = dst54_r;
1873  dst32_r = dst76_r;
1874  dst54_r = dst98_r;
1875  dst21_r = dst65_r;
1876  dst43_r = dst87_r;
1877  dst65_r = dst109_r;
1878  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1879  }
1880 }
1881 
/* Uni-weighted 8-tap 2-D (horizontal+vertical) luma interpolation, width-
 * generic worker: processes the block in 8-column stripes, producing two
 * output rows per inner-loop iteration.  The callers below use this routine
 * as hevc_hv_uniwgt_8t_8multx2mult_msa with width = 8/16/24/32/48/64.
 * Each stripe: 8-tap horizontal filtering yields 16-bit intermediates which
 * feed an 8-tap vertical filter; the 32-bit result is scaled by weight_vec,
 * rounded by rnd_vec, offset and clipped to [0, 255] before byte stores. */
1883  int32_t src_stride,
1884  uint8_t *dst,
1885  int32_t dst_stride,
1886  const int8_t *filter_x,
1887  const int8_t *filter_y,
1888  int32_t height,
1889  int32_t weight,
1890  int32_t offset,
1891  int32_t rnd_val,
1892  int32_t width)
1893 {
1894  uint32_t loop_cnt, cnt;
1895  uint8_t *src_tmp;
1896  uint8_t *dst_tmp;
1897  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1898  v8i16 filt0, filt1, filt2, filt3;
1899  v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1900  v16i8 mask1, mask2, mask3;
1901  v8i16 filter_vec;
1902  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1903  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1904  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1905  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1906  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1907  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1908  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1909  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1910  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1911  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1912 
/* step back 3 rows and 3 columns: the 8-tap window reaches 3 taps before
 * the current sample in both dimensions */
1913  src -= ((3 * src_stride) + 3);
1914 
1915  weight_vec = __msa_fill_w(weight);
1916  offset_vec = __msa_fill_w(offset);
1917  rnd_vec = __msa_fill_w(rnd_val);
1918  denom_vec = rnd_vec - 6;
1919 
/* fold 128*weight >> (rnd_val - 6) into the offset — presumably this
 * cancels the -128 bias the XORI_*_128 flips below put on the source
 * bytes; confirm against the scalar reference */
1920  const_128 = __msa_ldi_w(128);
1921  const_128 *= weight_vec;
1922  offset_vec += __msa_srar_w(const_128, denom_vec);
1923 
1924  filter_vec = LD_SH(filter_x);
1925  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1926 
1927  filter_vec = LD_SH(filter_y);
1928  UNPCK_R_SB_SH(filter_vec, filter_vec);
1929  SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1930 
1931  mask1 = mask0 + 2;
1932  mask2 = mask0 + 4;
1933  mask3 = mask0 + 6;
1934 
/* one 8-column stripe per iteration */
1935  for (cnt = width >> 3; cnt--;) {
1936  src_tmp = src;
1937  dst_tmp = dst;
1938 
/* horizontal pass over the 7 "priming" rows of this stripe */
1939  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1940  src_tmp += (7 * src_stride);
1941  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1942 
1943  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1944  vec0, vec1, vec2, vec3);
1945  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1946  vec4, vec5, vec6, vec7);
1947  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1948  vec8, vec9, vec10, vec11);
1949  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1950  vec12, vec13, vec14, vec15);
1951  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1952  filt3);
1953  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1954  filt3);
1955  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1956  filt3);
1957  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1958  filt2, filt3);
1959 
1960  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1961  vec0, vec1, vec2, vec3);
1962  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1963  vec4, vec5, vec6, vec7);
1964  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1965  vec8, vec9, vec10, vec11);
1966  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1967  filt3);
1968  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1969  filt3);
1970  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1971  filt3);
1972 
/* interleave adjacent filtered rows into the vertical-filter operand
 * pairs (right/low and left/high halves) */
1973  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1974  dst10_r, dst32_r, dst54_r, dst21_r);
1975  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1976  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1977  dst10_l, dst32_l, dst54_l, dst21_l);
1978  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1979 
/* steady state: two new input rows -> two output rows per iteration */
1980  for (loop_cnt = height >> 1; loop_cnt--;) {
1981  LD_SB2(src_tmp, src_stride, src7, src8);
1982  src_tmp += 2 * src_stride;
1983  XORI_B2_128_SB(src7, src8);
1984 
1985  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1986  vec0, vec1, vec2, vec3);
1987  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1988  filt2, filt3);
1989 
1990  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1991  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1992  filt_h0, filt_h1, filt_h2, filt_h3);
1993  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1994  filt_h0, filt_h1, filt_h2, filt_h3);
1995  dst0_r >>= 6;
1996  dst0_l >>= 6;
1997 
1998  /* row 8 */
1999  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2000  vec0, vec1, vec2, vec3);
2001  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2002  filt2, filt3);
2003 
2004  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2005  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2006  filt_h0, filt_h1, filt_h2, filt_h3);
2007  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2008  filt_h0, filt_h1, filt_h2, filt_h3);
2009  dst1_r >>= 6;
2010  dst1_l >>= 6;
2011 
/* weight, round, offset, clip, then pack 32->16->8 bits and store
 * two 8-byte rows */
2012  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2013  MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2014  SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2015  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2016  ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2017  CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);
2018 
2019  PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2020  dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2021  ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
2022  dst_tmp += (2 * dst_stride);
2023 
/* slide the vertical 8-tap window down by two rows */
2024  dst10_r = dst32_r;
2025  dst32_r = dst54_r;
2026  dst54_r = dst76_r;
2027  dst10_l = dst32_l;
2028  dst32_l = dst54_l;
2029  dst54_l = dst76_l;
2030  dst21_r = dst43_r;
2031  dst43_r = dst65_r;
2032  dst65_r = dst87_r;
2033  dst21_l = dst43_l;
2034  dst43_l = dst65_l;
2035  dst65_l = dst87_l;
2036  dst6 = dst8;
2037  }
2038 
/* advance to the next 8-column stripe */
2039  src += 8;
2040  dst += 8;
2041  }
2042 }
2043 
/* Uni-weighted 8-tap HV interpolation, 8-pixel-wide block: thin wrapper
 * forwarding to the width-generic worker with width = 8. */
2045  int32_t src_stride,
2046  uint8_t *dst,
2047  int32_t dst_stride,
2048  const int8_t *filter_x,
2049  const int8_t *filter_y,
2050  int32_t height,
2051  int32_t weight,
2052  int32_t offset,
2053  int32_t rnd_val)
2054 {
2055  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2056  filter_x, filter_y, height, weight,
2057  offset, rnd_val, 8);
2058 }
2059 
/* Uni-weighted 8-tap HV interpolation for a 12-pixel-wide block.
 * The left 8 columns are filtered one output row per iteration; the
 * remaining 4 columns are then filtered four output rows per iteration
 * using the packed two-rows-per-vector 4-wide masks (ff_hevc_mask_arr+16).
 * NOTE(review): both loop counts are hard-coded (16 and 4), i.e. this body
 * assumes height == 16 and never reads the height argument — confirm all
 * callers only pass 16. */
2061  int32_t src_stride,
2062  uint8_t *dst,
2063  int32_t dst_stride,
2064  const int8_t *filter_x,
2065  const int8_t *filter_y,
2066  int32_t height,
2067  int32_t weight,
2068  int32_t offset,
2069  int32_t rnd_val)
2070 {
2071  uint32_t loop_cnt;
2072  uint8_t *src_tmp, *dst_tmp;
2073  v16u8 out;
2074  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2075  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2076  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2077  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2078  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2079  v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
2080  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2081  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2082  v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2083  v8i16 dst76_l, filter_vec;
2084  v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2085  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
2086 
/* step back 3 rows and 3 columns for the 8-tap window */
2087  src -= ((3 * src_stride) + 3);
2088 
2089  filter_vec = LD_SH(filter_x);
2090  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2091 
2092  filter_vec = LD_SH(filter_y);
2093  UNPCK_R_SB_SH(filter_vec, filter_vec);
2094 
2095  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2096 
2097  weight_vec = __msa_fill_w(weight);
2098  offset_vec = __msa_fill_w(offset);
2099  rnd_vec = __msa_fill_w(rnd_val);
2100  denom_vec = rnd_vec - 6;
2101 
/* pre-add 128*weight >> (rnd_val - 6) to the offset — presumably to
 * cancel the -128 source bias from the XORI_*_128 flips; verify against
 * the scalar reference */
2102  const_128 = __msa_ldi_w(128);
2103  const_128 *= weight_vec;
2104  offset_vec += __msa_srar_w(const_128, denom_vec);
2105 
2106  mask0 = LD_SB(ff_hevc_mask_arr);
2107  mask1 = mask0 + 2;
2108  mask2 = mask0 + 4;
2109  mask3 = mask0 + 6;
2110 
/* ---- left 8 columns: one output row per iteration ---- */
2111  src_tmp = src;
2112  dst_tmp = dst;
2113 
2114  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2115  src_tmp += (7 * src_stride);
2116  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2117 
2118  /* row 0 row 1 row 2 row 3 */
2119  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2120  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2121  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2122  vec11);
2123  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2124  vec15);
2125  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2126  filt3);
2127  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2128  filt3);
2129  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2130  filt3);
2131  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2132  filt2, filt3);
2133  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2134  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2135  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2136  vec11);
2137  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2138  filt3);
2139  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2140  filt3);
2141  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2142  filt3);
2143 
/* 16 output rows, one per iteration (height assumed 16 — see NOTE) */
2144  for (loop_cnt = 16; loop_cnt--;) {
2145  src7 = LD_SB(src_tmp);
2146  src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
2147  src_tmp += src_stride;
2148 
2149  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2150  vec3);
2151  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2152  filt3);
2153  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
2154  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
2155  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
2156  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2157 
2158  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2159  filt_h0, filt_h1, filt_h2, filt_h3);
2160  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2161  filt_h0, filt_h1, filt_h2, filt_h3);
2162  dst0_r >>= 6;
2163  dst0_l >>= 6;
2164 
/* weight, round, offset, clip, pack and store one 8-byte row */
2165  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2166  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2167  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2168  CLIP_SW2_0_255(dst0_r, dst0_l);
2169  dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2170  out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2171  ST_D1(out, 0, dst_tmp);
2172  dst_tmp += dst_stride;
2173 
/* slide the vertical window down one row */
2174  dst0 = dst1;
2175  dst1 = dst2;
2176  dst2 = dst3;
2177  dst3 = dst4;
2178  dst4 = dst5;
2179  dst5 = dst6;
2180  dst6 = dst7;
2181  }
2182 
/* ---- right 4 columns: packed two-rows-per-vector processing ---- */
2183  src += 8;
2184  dst += 8;
2185 
/* 4-width shuffle masks (second half of ff_hevc_mask_arr) */
2186  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2187  mask5 = mask4 + 2;
2188  mask6 = mask4 + 4;
2189  mask7 = mask4 + 6;
2190 
2191  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2192  src += (7 * src_stride);
2193  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2194 
/* each VSHF pairs two rows 3 apart, so dst30 holds rows 3|0, etc. */
2195  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2196  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2197  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2198  vec11);
2199  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2200  vec15);
2201  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2202  filt3);
2203  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2204  filt3);
2205  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2206  filt3);
2207  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2208  filt3);
2209  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
2210  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
2211  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
2212 
2213  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2214 
/* 4 iterations x 4 rows = 16 output rows (height assumed 16) */
2215  for (loop_cnt = 4; loop_cnt--;) {
2216  LD_SB4(src, src_stride, src7, src8, src9, src10);
2217  src += (4 * src_stride);
2218  XORI_B4_128_SB(src7, src8, src9, src10);
2219 
2220  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2221  vec3);
2222  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2223  vec7);
2224  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2225  filt3);
2226  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2227  filt3);
2228 
2229  dst76_r = __msa_ilvr_h(dst97, dst66);
2230  ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
2231  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2232  dst98_r = __msa_ilvr_h(dst66, dst108);
2233 
2234  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2235  filt_h1, filt_h2, filt_h3);
2236  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2237  filt_h1, filt_h2, filt_h3);
2238  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2239  filt_h1, filt_h2, filt_h3);
2240  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2241  filt_h1, filt_h2, filt_h3);
2242 
/* weight, round, offset, clip, pack and store four 4-byte rows */
2243  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2244  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2245  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2246  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2247  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2248  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2249  CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
2250  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2251  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2252  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2253  dst += (4 * dst_stride);
2254 
/* slide the vertical window down four rows */
2255  dst10_r = dst54_r;
2256  dst32_r = dst76_r;
2257  dst54_r = dst98_r;
2258  dst21_r = dst65_r;
2259  dst43_r = dst87_r;
2260  dst65_r = dst109_r;
2261  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2262  }
2263 }
2264 
/* Uni-weighted 8-tap HV interpolation, 16-pixel-wide block: wrapper
 * forwarding to the width-generic worker with width = 16. */
2266  int32_t src_stride,
2267  uint8_t *dst,
2268  int32_t dst_stride,
2269  const int8_t *filter_x,
2270  const int8_t *filter_y,
2271  int32_t height,
2272  int32_t weight,
2273  int32_t offset,
2274  int32_t rnd_val)
2275 {
2276  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2277  filter_x, filter_y, height, weight,
2278  offset, rnd_val, 16);
2279 }
2280 
/* Uni-weighted 8-tap HV interpolation, 24-pixel-wide block: wrapper
 * forwarding to the width-generic worker with width = 24. */
2282  int32_t src_stride,
2283  uint8_t *dst,
2284  int32_t dst_stride,
2285  const int8_t *filter_x,
2286  const int8_t *filter_y,
2287  int32_t height,
2288  int32_t weight,
2289  int32_t offset,
2290  int32_t rnd_val)
2291 {
2292  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2293  filter_x, filter_y, height, weight,
2294  offset, rnd_val, 24);
2295 }
2296 
/* Uni-weighted 8-tap HV interpolation, 32-pixel-wide block: wrapper
 * forwarding to the width-generic worker with width = 32. */
2298  int32_t src_stride,
2299  uint8_t *dst,
2300  int32_t dst_stride,
2301  const int8_t *filter_x,
2302  const int8_t *filter_y,
2303  int32_t height,
2304  int32_t weight,
2305  int32_t offset,
2306  int32_t rnd_val)
2307 {
2308  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2309  filter_x, filter_y, height, weight,
2310  offset, rnd_val, 32);
2311 }
2312 
/* Uni-weighted 8-tap HV interpolation, 48-pixel-wide block: wrapper
 * forwarding to the width-generic worker with width = 48. */
2314  int32_t src_stride,
2315  uint8_t *dst,
2316  int32_t dst_stride,
2317  const int8_t *filter_x,
2318  const int8_t *filter_y,
2319  int32_t height,
2320  int32_t weight,
2321  int32_t offset,
2322  int32_t rnd_val)
2323 {
2324  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2325  filter_x, filter_y, height, weight,
2326  offset, rnd_val, 48);
2327 }
2328 
/* Uni-weighted 8-tap HV interpolation, 64-pixel-wide block: wrapper
 * forwarding to the width-generic worker with width = 64. */
2330  int32_t src_stride,
2331  uint8_t *dst,
2332  int32_t dst_stride,
2333  const int8_t *filter_x,
2334  const int8_t *filter_y,
2335  int32_t height,
2336  int32_t weight,
2337  int32_t offset,
2338  int32_t rnd_val)
2339 {
2340  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2341  filter_x, filter_y, height, weight,
2342  offset, rnd_val, 64);
2343 }
2344 
/* Uni-weighted 4-tap horizontal interpolation, 4x2 block: two rows are
 * packed into one vector (4-width mask), filtered, weighted, rounded,
 * offset, clipped to [0, 255] and stored as two 4-byte words. */
2346  int32_t src_stride,
2347  uint8_t *dst,
2348  int32_t dst_stride,
2349  const int8_t *filter,
2350  int32_t weight,
2351  int32_t offset,
2352  int32_t rnd_val)
2353 {
2354  v16u8 out;
2355  v8i16 filt0, filt1;
2356  v16i8 src0, src1, vec0, vec1;
2357  v16i8 mask1;
2358  v8i16 dst0;
2359  v4i32 dst0_r, dst0_l;
2360  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2361  v4i32 weight_vec, rnd_vec;
/* 4-width shuffle mask lives in the second half of ff_hevc_mask_arr */
2362  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2363 
/* 4-tap window reaches one sample left of the current pixel */
2364  src -= 1;
2365 
2366  filter_vec = LD_SH(filter);
2367  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2368 
2369  mask1 = mask0 + 2;
2370 
2371  weight = weight & 0x0000FFFF;
2372 
2373  weight_vec = __msa_fill_w(weight);
2374  rnd_vec = __msa_fill_w(rnd_val);
2375 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2376  weight *= 128;
2377  rnd_val -= 6;
2378 
2379  weight_vec_h = __msa_fill_h(weight);
2380  offset_vec = __msa_fill_h(offset);
2381  denom_vec = __msa_fill_h(rnd_val);
2382 
2383  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2384  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2385 
2386  LD_SB2(src, src_stride, src0, src1);
2387  XORI_B2_128_SB(src0, src1);
2388 
2389  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2390  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2391 
2392  ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
2393  DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2394  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2395  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2396  dst0 = __msa_adds_s_h(dst0, offset_vec);
2397  CLIP_SH_0_255(dst0);
2398  out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
2399  ST_W2(out, 0, 1, dst, dst_stride);
/* NOTE(review): dead store — only 2 rows were written above and dst is a
 * local copy of the parameter, so this increment has no effect */
2400  dst += (4 * dst_stride);
2401 }
2402 
/* Uni-weighted 4-tap horizontal interpolation, 4x4 block: four rows are
 * packed pairwise into two vectors, filtered, then weighted/rounded/
 * offset/clipped by HEVC_UNIW_RND_CLIP2_MAX_SATU_H and stored as four
 * 4-byte words. */
2404  int32_t src_stride,
2405  uint8_t *dst,
2406  int32_t dst_stride,
2407  const int8_t *filter,
2408  int32_t weight,
2409  int32_t offset,
2410  int32_t rnd_val)
2411 {
2412  v16u8 out;
2413  v8i16 filt0, filt1;
2414  v16i8 src0, src1, src2, src3;
2415  v16i8 mask1, vec0, vec1, vec2, vec3;
2416  v8i16 dst0, dst1;
2417  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2418  v4i32 weight_vec, rnd_vec;
/* 4-width shuffle mask (second half of ff_hevc_mask_arr) */
2419  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2420 
2421  src -= 1;
2422 
2423  /* rearranging filter */
2424  filter_vec = LD_SH(filter);
2425  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2426 
2427  mask1 = mask0 + 2;
2428 
2429  weight = weight & 0x0000FFFF;
2430 
2431  weight_vec = __msa_fill_w(weight);
2432  rnd_vec = __msa_fill_w(rnd_val);
2433 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2434  weight *= 128;
2435  rnd_val -= 6;
2436 
2437  weight_vec_h = __msa_fill_h(weight);
2438  offset_vec = __msa_fill_h(offset);
2439  denom_vec = __msa_fill_h(rnd_val);
2440 
2441  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2442  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2443 
2444  LD_SB4(src, src_stride, src0, src1, src2, src3);
2445  XORI_B4_128_SB(src0, src1, src2, src3);
2446 
2447  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2448  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2449  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2450  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2451 
2452  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2453  dst0, dst1);
2454 
2455  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
/* NOTE(review): dead store — dst is a local parameter copy and nothing
 * is written after this point */
2457  dst += (4 * dst_stride);
2458 }
2459 
/* Uni-weighted 4-tap horizontal interpolation, 4-wide blocks whose height
 * is a multiple of 8: processes 8 rows (packed pairwise into 4 vectors)
 * per loop iteration, weighting/rounding/offsetting/clipping via
 * HEVC_UNIW_RND_CLIP4_MAX_SATU_H and storing eight 4-byte words. */
2461  int32_t src_stride,
2462  uint8_t *dst,
2463  int32_t dst_stride,
2464  const int8_t *filter,
2465  int32_t height,
2466  int32_t weight,
2467  int32_t offset,
2468  int32_t rnd_val)
2469 {
2470  uint32_t loop_cnt;
2471  v16u8 out0, out1;
2472  v8i16 filt0, filt1;
2473  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2474  v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2475  v8i16 dst0, dst1, dst2, dst3;
2476  v8i16 filter_vec;
2477  v8i16 weight_vec_h, offset_vec, denom_vec;
2478  v4i32 weight_vec, rnd_vec;
/* 4-width shuffle mask (second half of ff_hevc_mask_arr) */
2479  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2480 
2481  src -= 1;
2482 
2483  filter_vec = LD_SH(filter);
2484  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2485 
2486  weight = weight & 0x0000FFFF;
2487 
2488  weight_vec = __msa_fill_w(weight);
2489  rnd_vec = __msa_fill_w(rnd_val);
2490 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2491  weight *= 128;
2492  rnd_val -= 6;
2493 
2494  weight_vec_h = __msa_fill_h(weight);
2495  offset_vec = __msa_fill_h(offset);
2496  denom_vec = __msa_fill_h(rnd_val);
2497 
2498  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2499  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2500 
2501  mask1 = mask0 + 2;
2502 
2503  for (loop_cnt = (height >> 3); loop_cnt--;) {
2504  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2505  src += (8 * src_stride);
2506 
2507  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2508 
2509  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2510  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2511  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2512  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2513  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2514  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2515  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2516  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2517 
2518  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2519  weight_vec, offset_vec, rnd_vec,
2520  dst0, dst1, dst2, dst3);
2521 
2522  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2523  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2524  dst += (8 * dst_stride);
2525  }
2526 }
2527 
/* Uni-weighted 4-tap horizontal interpolation, 4-pixel-wide dispatcher:
 * routes to the specialized 4x2 / 4x4 / 4x8-multiple kernels by height.
 * NOTE(review): heights other than 2, 4, 8 or 16 silently do nothing —
 * presumably the callers guarantee only these heights; confirm. */
2529  int32_t src_stride,
2530  uint8_t *dst,
2531  int32_t dst_stride,
2532  const int8_t *filter,
2533  int32_t height,
2534  int32_t weight,
2535  int32_t offset,
2536  int32_t rnd_val)
2537 {
2538  if (2 == height) {
2539  hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
2540  filter, weight, offset, rnd_val);
2541  } else if (4 == height) {
2542  hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
2543  filter, weight, offset, rnd_val);
2544  } else if (8 == height || 16 == height) {
2545  hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
2546  filter, height, weight,
2547  offset, rnd_val);
2548  }
2549 }
2550 
/* Uni-weighted 4-tap horizontal interpolation, 6-pixel-wide block:
 * filters 8 rows at full 8-wide vectors, then stores each row as a
 * 4-byte word plus a 2-byte halfword (ST_W2 + ST_H2) to write exactly
 * 6 pixels per row.
 * NOTE(review): the height argument is not read in this body — 8 rows
 * are always processed; confirm callers only pass height == 8. */
2552  int32_t src_stride,
2553  uint8_t *dst,
2554  int32_t dst_stride,
2555  const int8_t *filter,
2556  int32_t height,
2557  int32_t weight,
2558  int32_t offset,
2559  int32_t rnd_val)
2560 {
2561  v16u8 out0, out1, out2, out3;
2562  v8i16 filt0, filt1;
2563  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
/* 8-width shuffle mask (first half of ff_hevc_mask_arr) */
2564  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2565  v16i8 mask1;
2566  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2567  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2568  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2569  v4i32 weight_vec, rnd_vec;
2570 
2571  src -= 1;
2572 
2573  filter_vec = LD_SH(filter);
2574  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2575 
2576  weight = weight & 0x0000FFFF;
2577 
2578  weight_vec = __msa_fill_w(weight);
2579  rnd_vec = __msa_fill_w(rnd_val);
2580 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2581  weight *= 128;
2582  rnd_val -= 6;
2583 
2584  weight_vec_h = __msa_fill_h(weight);
2585  offset_vec = __msa_fill_h(offset);
2586  denom_vec = __msa_fill_h(rnd_val);
2587 
2588  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2589  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2590 
2591  mask1 = mask0 + 2;
2592 
2593  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2594  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2595  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2596  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2597  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2598  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2599  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2600  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2601  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2602  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2603  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2604  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2605  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2606  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2607  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2608  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2609  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2610  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2611 
2612  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2613  weight_vec, offset_vec, rnd_vec,
2614  dst0, dst1, dst2, dst3);
2615  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2616  weight_vec, offset_vec, rnd_vec,
2617  dst4, dst5, dst6, dst7);
2618 
/* each row is written as 4 bytes + 2 bytes = 6 pixels */
2619  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2620  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2621  ST_W2(out0, 0, 2, dst, dst_stride);
2622  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2623  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2624  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2625  dst += (4 * dst_stride);
2626  ST_W2(out2, 0, 2, dst, dst_stride);
2627  ST_H2(out2, 2, 6, dst + 4, dst_stride);
2628  ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
2629  ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2630 }
2631 
/* Uni-weighted 4-tap horizontal interpolation, 8x2 block: two rows are
 * filtered with the 8-wide mask, weighted/rounded/offset/clipped via
 * HEVC_UNIW_RND_CLIP2_MAX_SATU_H and stored as two 8-byte rows. */
2634  int32_t src_stride,
2635  uint8_t *dst,
2636  int32_t dst_stride,
2637  const int8_t *filter,
2638  int32_t weight,
2639  int32_t offset,
2640  int32_t rnd_val)
2641 {
2642  v16u8 out;
2643  v8i16 filt0, filt1, dst0, dst1;
2644  v16i8 src0, src1;
/* 8-width shuffle mask (first half of ff_hevc_mask_arr) */
2645  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2646  v16i8 mask1;
2647  v16i8 vec0, vec1, vec2, vec3;
2648  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2649  v4i32 weight_vec, rnd_vec;
2650 
2651  src -= 1;
2652 
2653  filter_vec = LD_SH(filter);
2654  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2655 
2656  weight = weight & 0x0000FFFF;
2657 
2658  weight_vec = __msa_fill_w(weight);
2659  rnd_vec = __msa_fill_w(rnd_val);
2660 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2661  weight *= 128;
2662  rnd_val -= 6;
2663 
2664  weight_vec_h = __msa_fill_h(weight);
2665  offset_vec = __msa_fill_h(offset);
2666  denom_vec = __msa_fill_h(rnd_val);
2667 
2668  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2669  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2670 
2671  mask1 = mask0 + 2;
2672 
2673  LD_SB2(src, src_stride, src0, src1);
2674  XORI_B2_128_SB(src0, src1);
2675 
2676  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2677  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2678  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680 
2681  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2682  dst0, dst1);
2683 
2684  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2685  ST_D2(out, 0, 1, dst, dst_stride);
2686 }
2686 
/* Uni-weighted 4-tap horizontal interpolation, 8x4 block: four rows are
 * filtered with the 8-wide mask, weighted/rounded/offset/clipped via
 * HEVC_UNIW_RND_CLIP4_MAX_SATU_H and stored as four 8-byte rows. */
2689  int32_t src_stride,
2690  uint8_t *dst,
2691  int32_t dst_stride,
2692  const int8_t *filter,
2693  int32_t weight,
2694  int32_t offset,
2695  int32_t rnd_val)
2696 {
2697  v16u8 out0, out1;
2698  v16i8 src0, src1, src2, src3;
2699  v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2700  v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
2701  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2702  v4i32 weight_vec, rnd_vec;
2703 
2704  src -= 1;
2705 
2706  filter_vec = LD_SH(filter);
2707  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2708 
2709  weight = weight & 0x0000FFFF;
2710  weight_vec = __msa_fill_w(weight);
2711  rnd_vec = __msa_fill_w(rnd_val);
2712 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2713  weight *= 128;
2714  rnd_val -= 6;
2715 
2716  weight_vec_h = __msa_fill_h(weight);
2717  offset_vec = __msa_fill_h(offset);
2718  denom_vec = __msa_fill_h(rnd_val);
2719 
2720  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2721  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2722 
/* 8-width shuffle mask (first half of ff_hevc_mask_arr) */
2723  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2724  mask1 = mask0 + 2;
2725 
2726  LD_SB4(src, src_stride, src0, src1, src2, src3);
2727  XORI_B4_128_SB(src0, src1, src2, src3);
2728  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2729  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2730  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2731  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2732  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2733  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2734  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2735  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2736 
2737  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2738  weight_vec, offset_vec, rnd_vec,
2739  dst0, dst1, dst2, dst3);
2740 
2741  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2742  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2743 }
2743 
/* Uni-weighted 4-tap horizontal interpolation, 8x6 block: six rows are
 * filtered with the 8-wide mask, the first four rows weighted/rounded/
 * offset/clipped with the 4-vector macro and the last two with the
 * 2-vector macro, then stored as six 8-byte rows. */
2745  int32_t src_stride,
2746  uint8_t *dst,
2747  int32_t dst_stride,
2748  const int8_t *filter,
2749  int32_t weight,
2750  int32_t offset,
2751  int32_t rnd_val)
2752 {
2753  v16u8 out0, out1, out2;
2754  v8i16 filt0, filt1;
2755  v16i8 src0, src1, src2, src3, src4, src5;
/* 8-width shuffle mask (first half of ff_hevc_mask_arr) */
2756  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2757  v16i8 mask1;
2758  v16i8 vec11;
2759  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2760  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2761  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2762  v4i32 weight_vec, rnd_vec;
2763 
2764  src -= 1;
2765 
2766  filter_vec = LD_SH(filter);
2767  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2768 
2769  weight = weight & 0x0000FFFF;
2770 
2771  weight_vec = __msa_fill_w(weight);
2772  rnd_vec = __msa_fill_w(rnd_val);
2773 
/* fold 128*weight >> (rnd_val - 6) into the halfword offset — presumably
 * cancelling the -128 bias from the XORI below; verify against scalar */
2774  weight *= 128;
2775  rnd_val -= 6;
2776 
2777  weight_vec_h = __msa_fill_h(weight);
2778  offset_vec = __msa_fill_h(offset);
2779  denom_vec = __msa_fill_h(rnd_val);
2780 
2781  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2782  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2783 
2784  mask1 = mask0 + 2;
2785 
2786  LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2787  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2788 
2789  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2790  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2791  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2792  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2793  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
2794  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
2795  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2796  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2797  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2798  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2799  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2800  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2801 
2802  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2803  weight_vec, offset_vec, rnd_vec,
2804  dst0, dst1, dst2, dst3);
2805 
2806  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2807  dst4, dst5);
2808 
2809  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2810  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2811  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
2813 
2815  int32_t src_stride,
2816  uint8_t *dst,
2817  int32_t dst_stride,
2818  const int8_t *filter,
2819  int32_t height,
2820  int32_t weight,
2821  int32_t offset,
2822  int32_t rnd_val)
2823 {
    /* Horizontal 4-tap uni-weighted prediction, 8 pixels wide, processing
     * 8 rows per loop iteration (height is expected to be a multiple of 8). */
2824  uint32_t loop_cnt;
2825  v8i16 filt0, filt1;
2826  v16u8 out0, out1, out2, out3;
2827  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2828  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2829  v16i8 mask1;
2830  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2831  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2832  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2833  v4i32 weight_vec, rnd_vec;
2834 
    /* 4-tap filter: step back one column so taps cover [x-1 .. x+2]. */
2835  src -= 1;
2836 
2837  filter_vec = LD_SH(filter);
2838  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2839 
    /* Keep only the low 16 bits of the weight for the widening dot products. */
2840  weight = weight & 0x0000FFFF;
2841 
2842  weight_vec = __msa_fill_w(weight);
2843  rnd_vec = __msa_fill_w(rnd_val);
2844 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias introduced by the XORI-by-128 signed conversion
     * of the source pixels below -- TODO confirm against the scalar code. */
2845  weight *= 128;
2846  rnd_val -= 6;
2847 
2848  weight_vec_h = __msa_fill_h(weight);
2849  offset_vec = __msa_fill_h(offset);
2850  denom_vec = __msa_fill_h(rnd_val);
2851 
2852  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2853  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2854 
2855  mask1 = mask0 + 2;
2856 
    /* Per pass: load 8 rows, shuffle tap neighbourhoods, filter, then
     * weight/round/clip and store eight 8-byte rows. */
2857  for (loop_cnt = (height >> 3); loop_cnt--;) {
2858  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2859  src += (8 * src_stride);
2860  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2861 
2862  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2863  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2864  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2865  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2866  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2867  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2868  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2869  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2870  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2871  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2872  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2873  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2874  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2875  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2876  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2877  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2878 
2879  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2880  weight_vec, offset_vec, rnd_vec,
2881  dst0, dst1, dst2, dst3);
2882 
2883  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2884  weight_vec, offset_vec, rnd_vec,
2885  dst4, dst5, dst6, dst7);
2886 
    /* Pack the clipped 16-bit results to bytes and store as 8x 8-byte rows. */
2887  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2888  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2889  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2890  dst += (8 * dst_stride);
2891  }
2892 }
2893 
2895  int32_t src_stride,
2896  uint8_t *dst,
2897  int32_t dst_stride,
2898  const int8_t *filter,
2899  int32_t height,
2900  int32_t weight,
2901  int32_t offset,
2902  int32_t rnd_val)
2903 {
    /* Dispatch the 8-wide horizontal uni-weighted case on height:
     * dedicated fixed-height kernels for 2/4/6 rows, the multiple-of-8
     * kernel for everything else. */
2904  if (2 == height) {
2905  hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
2906  filter, weight, offset, rnd_val);
2907  } else if (4 == height) {
2908  hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
2909  filter, weight, offset, rnd_val);
2910  } else if (6 == height) {
2911  hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
2912  filter, weight, offset, rnd_val);
2913  } else {
2914  hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
2915  filter, height, weight, offset,
2916  rnd_val);
2917  }
2918 }
2919 
2921  int32_t src_stride,
2922  uint8_t *dst,
2923  int32_t dst_stride,
2924  const int8_t *filter,
2925  int32_t height,
2926  int32_t weight,
2927  int32_t offset,
2928  int32_t rnd_val)
2929 {
    /* Horizontal 4-tap uni-weighted prediction, 12 pixels wide: an 8-wide
     * part (mask0/mask1, per-row shuffle) plus a 4-wide part built from row
     * pairs via mask2/mask3.  4 rows per iteration; the loop is fixed at 4
     * passes (16 rows) and the height parameter is not read in the body --
     * presumably only called with height == 16; confirm against callers. */
2930  uint32_t loop_cnt;
2931  v16u8 out0, out1, out2;
2932  v8i16 filt0, filt1;
2933  v16i8 src0, src1, src2, src3;
2934  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2935  v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2936  };
2937  v16i8 mask1;
2938  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2939  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2940  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2941  v16i8 mask3, vec11;
2942  v4i32 weight_vec, rnd_vec;
2943 
2944  src -= 1;
2945 
2946  filter_vec = LD_SH(filter);
2947  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2948 
2949  weight = weight & 0x0000FFFF;
2950 
2951  weight_vec = __msa_fill_w(weight);
2952  rnd_vec = __msa_fill_w(rnd_val);
2953 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
2954  weight *= 128;
2955  rnd_val -= 6;
2956 
2957  weight_vec_h = __msa_fill_h(weight);
2958  offset_vec = __msa_fill_h(offset);
2959  denom_vec = __msa_fill_h(rnd_val);
2960 
2961  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2962  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2963 
2964  mask1 = mask0 + 2;
2965  mask3 = mask2 + 2;
2966 
2967  for (loop_cnt = 4; loop_cnt--;) {
2968  LD_SB4(src, src_stride, src0, src1, src2, src3);
2969  src += (4 * src_stride);
2970 
2971  XORI_B4_128_SB(src0, src1, src2, src3);
2972 
    /* dst0..dst3: the 8-wide left part; dst4/dst5: the 4-wide right part
     * assembled from two consecutive rows per shuffle (mask2/mask3 span
     * both source registers). */
2973  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2974  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2975  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2976  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2977  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
2978  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
2979  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2981  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2982  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2983  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2984  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2985 
2986  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2987  weight_vec, offset_vec, rnd_vec,
2988  dst0, dst1, dst2, dst3);
2989 
2990  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
2991  rnd_vec, dst4, dst5);
2992 
    /* 8-byte stores for the left 8 columns, 4-byte stores at dst+8 for the
     * remaining 4 columns. */
2993  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2994  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2995  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
2996  dst += (4 * dst_stride);
2997  }
2998 }
2999 
3001  int32_t src_stride,
3002  uint8_t *dst,
3003  int32_t dst_stride,
3004  const int8_t *filter,
3005  int32_t height,
3006  int32_t weight,
3007  int32_t offset,
3008  int32_t rnd_val)
3009 {
    /* Horizontal 4-tap uni-weighted prediction, 16 pixels wide, 4 rows per
     * iteration.  Each row is split into two 8-wide halves (loaded at src
     * and src + 8) filtered independently. */
3010  uint32_t loop_cnt;
3011  v16u8 out0, out1, out2, out3;
3012  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3013  v8i16 filt0, filt1;
3014  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3015  v16i8 mask1;
3016  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3017  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3018  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3019  v4i32 weight_vec, rnd_vec;
3020 
3021  src -= 1;
3022 
3023  filter_vec = LD_SH(filter);
3024  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3025 
3026  weight = weight & 0x0000FFFF;
3027 
3028  weight_vec = __msa_fill_w(weight);
3029  rnd_vec = __msa_fill_w(rnd_val);
3030 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3031  weight *= 128;
3032  rnd_val -= 6;
3033 
3034  weight_vec_h = __msa_fill_h(weight);
3035  offset_vec = __msa_fill_h(offset);
3036  denom_vec = __msa_fill_h(rnd_val);
3037 
3038  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3039  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3040 
3041  mask1 = mask0 + 2;
3042 
3043  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-numbered registers: left halves; odd-numbered: right halves. */
3044  LD_SB4(src, src_stride, src0, src2, src4, src6);
3045  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3046  src += (4 * src_stride);
3047 
3048  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3049 
3050  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3051  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3052  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3053  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3054  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3055  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3056  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3057  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3058  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3059  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3060  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3061  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3062  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3063  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3064  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3065  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3066 
3067  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3068  weight_vec, offset_vec, rnd_vec,
3069  dst0, dst1, dst2, dst3);
3070 
3071  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3072  weight_vec, offset_vec, rnd_vec,
3073  dst4, dst5, dst6, dst7);
3074 
    /* Pack left/right halves back into four full 16-byte rows and store. */
3075  PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3076  out0, out1, out2, out3);
3077 
3078  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3079  dst += (4 * dst_stride);
3080  }
3081 }
3082 
3084  int32_t src_stride,
3085  uint8_t *dst,
3086  int32_t dst_stride,
3087  const int8_t *filter,
3088  int32_t height,
3089  int32_t weight,
3090  int32_t offset,
3091  int32_t rnd_val)
3092 {
    /* Horizontal 4-tap uni-weighted prediction, 24 pixels wide: a full
     * 16-wide part plus an 8-wide tail, 2 rows per iteration.  The loop is
     * fixed at 16 passes (32 rows); the height parameter is not read in the
     * body -- presumably only called with height == 32; confirm callers. */
3093  uint32_t loop_cnt;
3094  v16u8 out0, out1, out2;
3095  v16i8 src0, src1, src2, src3;
3096  v8i16 filt0, filt1;
3097  v16i8 mask0, mask1, mask2, mask3;
3098  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3099  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3100  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3101  v4i32 weight_vec, rnd_vec;
3102 
3103  src -= 1;
3104 
3105  filter_vec = LD_SH(filter);
3106  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3107 
3108  weight = weight & 0x0000FFFF;
3109  weight_vec = __msa_fill_w(weight);
3110  rnd_vec = __msa_fill_w(rnd_val);
3111 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3112  weight *= 128;
3113  rnd_val -= 6;
3114 
3115  weight_vec_h = __msa_fill_h(weight);
3116  offset_vec = __msa_fill_h(offset);
3117  denom_vec = __msa_fill_h(rnd_val);
3118 
3119  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3120  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3121 
    /* mask2/mask3 select the tap windows straddling the 16-byte boundary
     * between the two loads of a row. */
3122  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3123  mask1 = mask0 + 2;
3124  mask2 = mask0 + 8;
3125  mask3 = mask0 + 10;
3126 
3127  for (loop_cnt = 16; loop_cnt--;) {
3128  LD_SB2(src, src_stride, src0, src2);
3129  LD_SB2(src + 16, src_stride, src1, src3);
3130  src += (2 * src_stride);
3131 
3132  XORI_B4_128_SB(src0, src1, src2, src3);
3133 
3134  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3135  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3136  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3137  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
3138  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3139  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3140  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3141  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    /* dst4/dst5: the 8-wide tail (columns 16..23) from the second loads. */
3142  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3143  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
3144  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3145  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3146 
3147  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3148  weight_vec, offset_vec, rnd_vec,
3149  dst0, dst1, dst2, dst3);
3150 
3151  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3152  rnd_vec, dst4, dst5);
3153 
3154  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3155  ST_UB2(out0, out1, dst, dst_stride);
3156  ST_D2(out2, 0, 1, dst + 16, dst_stride);
3157  dst += (2 * dst_stride);
3158  }
3159 }
3160 
3162  int32_t src_stride,
3163  uint8_t *dst,
3164  int32_t dst_stride,
3165  const int8_t *filter,
3166  int32_t height,
3167  int32_t weight,
3168  int32_t offset,
3169  int32_t rnd_val)
3170 {
    /* Horizontal 4-tap uni-weighted prediction, 32 pixels wide, 2 rows per
     * iteration.  Each row needs three loads (0, 16, 24) so mask2/mask3 can
     * stitch the tap windows across the 16-byte load boundary. */
3171  uint32_t loop_cnt;
3172  v16u8 out0, out1, out2, out3;
3173  v16i8 src0, src1, src2, src3, src4, src5;
3174  v8i16 filt0, filt1;
3175  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3176  v16i8 mask1, mask2, mask3;
3177  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3178  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3179  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3180  v4i32 weight_vec, rnd_vec;
3181 
3182  src -= 1;
3183 
3184  filter_vec = LD_SH(filter);
3185  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3186 
3187  weight = weight & 0x0000FFFF;
3188 
3189  weight_vec = __msa_fill_w(weight);
3190  rnd_vec = __msa_fill_w(rnd_val);
3191 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3192  weight *= 128;
3193  rnd_val -= 6;
3194 
3195  weight_vec_h = __msa_fill_h(weight);
3196  offset_vec = __msa_fill_h(offset);
3197  denom_vec = __msa_fill_h(rnd_val);
3198 
3199  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3200  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3201 
3202  mask1 = mask0 + 2;
3203  mask2 = mask0 + 8;
3204  mask3 = mask0 + 10;
3205 
3206  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* Row A: src0 (bytes 0..15), src1 (16..31), src2 (24..39, tap overlap). */
3207  LD_SB2(src, 16, src0, src1);
3208  src2 = LD_SB(src + 24);
3209  src += src_stride;
3210  LD_SB2(src, 16, src3, src4);
3211  src5 = LD_SB(src + 24);
3212  src += src_stride;
3213  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3214  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3215  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3216  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
3217  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
3218  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3219  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3220  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3221  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3222  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3223  VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
3224  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
3225  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
3226  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3227  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3228  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3229  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3230 
3231  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3232  weight_vec, offset_vec, rnd_vec,
3233  dst0, dst1, dst2, dst3);
3234 
3235  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3236  weight_vec, offset_vec, rnd_vec,
3237  dst4, dst5, dst6, dst7);
3238 
    /* Two 16-byte stores per output row. */
3239  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3240  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3241  ST_UB2(out0, out1, dst, 16);
3242  dst += dst_stride;
3243  ST_UB2(out2, out3, dst, 16);
3244  dst += dst_stride;
3245  }
3246 }
3247 
3249  int32_t src_stride,
3250  uint8_t *dst,
3251  int32_t dst_stride,
3252  const int8_t *filter,
3253  int32_t weight,
3254  int32_t offset,
3255  int32_t rnd_val)
3256 {
    /* Vertical 4-tap uni-weighted prediction, 4 pixels wide, exactly 2 rows.
     * Two 4-wide rows are packed into one vector (src2110/src4332) so a
     * single filter call produces both output rows.  The weight/round/clip
     * stage is inlined here instead of using the CLIP2 macro. */
3257  v16u8 out;
3258  v16i8 src0, src1, src2, src3, src4;
3259  v16i8 src10_r, src32_r, src21_r, src43_r;
3260  v16i8 src2110, src4332;
3261  v8i16 dst0;
3262  v4i32 dst0_r, dst0_l;
3263  v8i16 filt0, filt1;
3264  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3265  v4i32 weight_vec, rnd_vec;
3266 
    /* 4-tap filter: start one row above so taps cover [y-1 .. y+2]. */
3267  src -= src_stride;
3268 
3269  weight = weight & 0x0000FFFF;
3270 
3271  weight_vec = __msa_fill_w(weight);
3272  rnd_vec = __msa_fill_w(rnd_val);
3273 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3274  weight *= 128;
3275  rnd_val -= 6;
3276 
3277  weight_vec_h = __msa_fill_h(weight);
3278  offset_vec = __msa_fill_h(offset);
3279  denom_vec = __msa_fill_h(rnd_val);
3280 
3281  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3282  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3283 
3284  filter_vec = LD_SH(filter);
3285  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3286 
3287  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3288  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3289  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3290  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3291  XORI_B2_128_SB(src2110, src4332);
3292  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    /* Inline weighting: widen to 32-bit, multiply, round-shift, pack back,
     * add offset and clip to [0, 255]. */
3293  ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
3294  DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3295  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3296  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
3297  dst0 = __msa_adds_s_h(dst0, offset_vec);
3298  CLIP_SH_0_255(dst0);
3299  out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
3300  ST_W2(out, 0, 1, dst, dst_stride);
3301 }
3302 
3304  int32_t src_stride,
3305  uint8_t *dst,
3306  int32_t dst_stride,
3307  const int8_t *filter,
3308  int32_t weight,
3309  int32_t offset,
3310  int32_t rnd_val)
3311 {
    /* Vertical 4-tap uni-weighted prediction, 4 pixels wide, exactly 4 rows.
     * Row pairs are packed (src2110/src4332/src6554) so each filter call
     * yields two output rows. */
3312  v16u8 out;
3313  v16i8 src0, src1, src2, src3, src4, src5, src6;
3314  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3315  v16i8 src2110, src4332, src6554;
3316  v8i16 dst0, dst1;
3317  v8i16 filt0, filt1;
3318  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3319  v4i32 weight_vec, rnd_vec;
3320 
3321  src -= src_stride;
3322 
3323  weight = weight & 0x0000FFFF;
3324 
3325  weight_vec = __msa_fill_w(weight);
3326  rnd_vec = __msa_fill_w(rnd_val);
3327 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3328  weight *= 128;
3329  rnd_val -= 6;
3330 
3331  weight_vec_h = __msa_fill_h(weight);
3332  offset_vec = __msa_fill_h(offset);
3333  denom_vec = __msa_fill_h(rnd_val);
3334 
3335  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3336  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3337 
3338  filter_vec = LD_SH(filter);
3339  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3340 
3341  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3342  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3343  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3344  src32_r, src43_r, src54_r, src65_r);
3345  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3346  src2110, src4332, src6554);
3347  XORI_B3_128_SB(src2110, src4332, src6554);
3348  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3349  dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3350  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3351  dst0, dst1);
3352 
3353  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3354  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3355 }
3356 
3358  int32_t src_stride,
3359  uint8_t *dst,
3360  int32_t dst_stride,
3361  const int8_t *filter,
3362  int32_t height,
3363  int32_t weight,
3364  int32_t offset,
3365  int32_t rnd_val)
3366 {
    /* Vertical 4-tap uni-weighted prediction, 4 pixels wide, 8 rows per
     * iteration (height is expected to be a multiple of 8).  Consecutive
     * rows are interleaved pairwise so each filter call covers two rows. */
3367  int32_t loop_cnt;
3368  v16u8 out0, out1;
3369  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3370  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3371  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3372  v16i8 src2110, src4332, src6554, src8776;
3373  v16i8 src10998;
3374  v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
3375  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3376  v4i32 weight_vec, rnd_vec;
3377 
3378  src -= src_stride;
3379 
3380  weight = weight & 0x0000FFFF;
3381 
3382  weight_vec = __msa_fill_w(weight);
3383  rnd_vec = __msa_fill_w(rnd_val);
3384 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3385  weight *= 128;
3386  rnd_val -= 6;
3387 
3388  weight_vec_h = __msa_fill_h(weight);
3389  offset_vec = __msa_fill_h(offset);
3390  denom_vec = __msa_fill_h(rnd_val);
3391 
3392  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3393  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3394 
3395  filter_vec = LD_SH(filter);
3396  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3397 
    /* Prologue: prime the pipeline with the first three rows. */
3398  LD_SB3(src, src_stride, src0, src1, src2);
3399  src += (3 * src_stride);
3400  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3401  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3402  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3403 
3404  for (loop_cnt = (height >> 3); loop_cnt--;) {
3405  LD_SB8(src, src_stride,
3406  src3, src4, src5, src6, src7, src8, src9, src10);
3407  src += (8 * src_stride);
3408  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3409  src32_r, src43_r, src54_r, src65_r);
3410  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3411  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3412  ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3413  src109_r, src98_r, src4332, src6554, src8776, src10998);
3414  XORI_B4_128_SB(src4332, src6554, src8776, src10998);
3415  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3416  dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3417  dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3418  dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3419 
3420  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3421  weight_vec, offset_vec, rnd_vec,
3422  dst0, dst1, dst2, dst3);
3423 
3424  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3425  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3426  dst += (8 * dst_stride);
3427 
    /* Carry the last row and the last interleaved pair into the next pass. */
3428  src2 = src10;
3429  src2110 = src10998;
3430  }
3431 }
3432 
3434  int32_t src_stride,
3435  uint8_t *dst,
3436  int32_t dst_stride,
3437  const int8_t *filter,
3438  int32_t height,
3439  int32_t weight,
3440  int32_t offset,
3441  int32_t rnd_val)
3442 {
    /* Dispatch the 4-wide vertical uni-weighted case on height: fixed-size
     * kernels for 2 and 4 rows, the multiple-of-8 kernel otherwise.
     * NOTE(review): heights that are neither 2, 4, nor a multiple of 8 fall
     * through with no output -- presumably such heights never occur here. */
3443  if (2 == height) {
3444  hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
3445  filter, weight, offset, rnd_val);
3446  } else if (4 == height) {
3447  hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3448  filter, weight, offset, rnd_val);
3449  } else if (0 == (height % 8)) {
3450  hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
3451  filter, height, weight, offset,
3452  rnd_val);
3453  }
3454 }
3455 
3457  int32_t src_stride,
3458  uint8_t *dst,
3459  int32_t dst_stride,
3460  const int8_t *filter,
3461  int32_t height,
3462  int32_t weight,
3463  int32_t offset,
3464  int32_t rnd_val)
3465 {
    /* Vertical 4-tap uni-weighted prediction, 6 pixels wide, processing a
     * fixed block of 8 rows.  The height parameter is not read in the body
     * -- presumably only called with height == 8; confirm against callers.
     * Each row is stored as a 4-byte word plus a 2-byte halfword. */
3466  v16u8 out0, out1, out2, out3;
3467  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3468  v16i8 src10_r, src32_r, src21_r, src43_r;
3469  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3470  v8i16 filt0, filt1;
3471  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3472  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3473  v4i32 weight_vec, rnd_vec;
3474 
3475  src -= src_stride;
3476 
3477  weight = weight & 0x0000FFFF;
3478 
3479  weight_vec = __msa_fill_w(weight);
3480  rnd_vec = __msa_fill_w(rnd_val);
3481 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3482  weight *= 128;
3483  rnd_val -= 6;
3484 
3485  weight_vec_h = __msa_fill_h(weight);
3486  offset_vec = __msa_fill_h(offset);
3487  denom_vec = __msa_fill_h(rnd_val);
3488 
3489  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3490  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3491 
3492  filter_vec = LD_SH(filter);
3493  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3494 
3495  LD_SB3(src, src_stride, src0, src1, src2);
3496  src += (3 * src_stride);
3497  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3498  XORI_B3_128_SB(src0, src1, src2);
3499  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3500  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3501  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3502  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3503  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3504  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3505  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3506  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3507  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3508  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3509  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3510  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3511  dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3512  dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3513 
3514  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3515  weight_vec, offset_vec, rnd_vec,
3516  dst0, dst1, dst2, dst3);
3517  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3518  weight_vec, offset_vec, rnd_vec,
3519  dst4, dst5, dst6, dst7);
3520 
    /* 6-wide stores: word (cols 0..3) + halfword (cols 4..5) per row. */
3521  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3522  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3523  ST_W2(out0, 0, 2, dst, dst_stride);
3524  ST_H2(out0, 2, 6, dst + 4, dst_stride);
3525  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
3526  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3527  dst += (4 * dst_stride);
3528  ST_W2(out2, 0, 2, dst, dst_stride);
3529  ST_H2(out2, 2, 6, dst + 4, dst_stride);
3530  ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
3531  ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3532 }
3533 
3535  int32_t src_stride,
3536  uint8_t *dst,
3537  int32_t dst_stride,
3538  const int8_t *filter,
3539  int32_t weight,
3540  int32_t offset,
3541  int32_t rnd_val)
3542 {
    /* Vertical 4-tap uni-weighted prediction, 8 pixels wide, exactly 2 rows.
     * Needs 5 input rows (2 output rows + 3 rows of filter context). */
3543  v16u8 out;
3544  v16i8 src0, src1, src2, src3, src4;
3545  v16i8 src10_r, src32_r, src21_r, src43_r;
3546  v8i16 dst0, dst1;
3547  v8i16 filt0, filt1;
3548  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3549  v4i32 weight_vec, rnd_vec;
3550 
3551  src -= src_stride;
3552 
3553  weight = weight & 0x0000FFFF;
3554 
3555  weight_vec = __msa_fill_w(weight);
3556  rnd_vec = __msa_fill_w(rnd_val);
3557 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3558  weight *= 128;
3559  rnd_val -= 6;
3560 
3561  weight_vec_h = __msa_fill_h(weight);
3562  offset_vec = __msa_fill_h(offset);
3563  denom_vec = __msa_fill_h(rnd_val);
3564 
3565  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3566  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3567 
3568  filter_vec = LD_SH(filter);
3569  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3570 
3571  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3572  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3573  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3574  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3575  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3576  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3577 
3578  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3579  dst0, dst1);
3580 
3581  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3582  ST_D2(out, 0, 1, dst, dst_stride);
3583 }
3584 
3586  int32_t src_stride,
3587  uint8_t *dst,
3588  int32_t dst_stride,
3589  const int8_t *filter,
3590  int32_t weight,
3591  int32_t offset,
3592  int32_t rnd_val)
3593 {
    /* Vertical 4-tap uni-weighted prediction, 8 pixels wide, exactly 4 rows.
     * Needs 7 input rows, all loaded with one LD_SB7. */
3594  v16u8 out0, out1;
3595  v16i8 src0, src1, src2, src3, src4;
3596  v16i8 src10_r, src32_r, src21_r, src43_r;
3597  v16i8 src5, src6, src54_r, src65_r;
3598  v8i16 filt0, filt1;
3599  v8i16 dst0, dst1, dst2, dst3;
3600  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3601  v4i32 weight_vec, rnd_vec;
3602 
3603  src -= src_stride;
3604 
3605  weight = weight & 0x0000FFFF;
3606 
3607  weight_vec = __msa_fill_w(weight);
3608  rnd_vec = __msa_fill_w(rnd_val);
3609 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3610  weight *= 128;
3611  rnd_val -= 6;
3612 
3613  weight_vec_h = __msa_fill_h(weight);
3614  offset_vec = __msa_fill_h(offset);
3615  denom_vec = __msa_fill_h(rnd_val);
3616 
3617  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3618  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3619 
3620  filter_vec = LD_SH(filter);
3621  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3622 
3623  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    /* NOTE(review): this src advance is dead code -- src is not read again
     * in this function. */
3624  src += (3 * src_stride);
3625  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3626  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3627  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3628  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3629  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3630  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3631  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3632  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3633  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3634  offset_vec, rnd_vec, dst0, dst1, dst2,
3635  dst3);
3636  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3637  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3638 }
3639 
3641  int32_t src_stride,
3642  uint8_t *dst,
3643  int32_t dst_stride,
3644  const int8_t *filter,
3645  int32_t weight,
3646  int32_t offset,
3647  int32_t rnd_val)
3648 {
    /* Vertical 4-tap uni-weighted prediction, 8 pixels wide, exactly 6 rows.
     * Needs 9 input rows (3 + 6). */
3649  v16u8 out0, out1, out2;
3650  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3651  v16i8 src10_r, src32_r, src54_r, src76_r;
3652  v16i8 src21_r, src43_r, src65_r, src87_r;
3653  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3654  v8i16 filt0, filt1;
3655  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3656  v4i32 weight_vec, rnd_vec;
3657 
3658  src -= src_stride;
3659 
3660  weight = weight & 0x0000FFFF;
3661 
3662  weight_vec = __msa_fill_w(weight);
3663  rnd_vec = __msa_fill_w(rnd_val);
3664 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3665  weight *= 128;
3666  rnd_val -= 6;
3667 
3668  weight_vec_h = __msa_fill_h(weight);
3669  offset_vec = __msa_fill_h(offset);
3670  denom_vec = __msa_fill_h(rnd_val);
3671 
3672  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3673  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3674 
3675  filter_vec = LD_SH(filter);
3676  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3677 
3678  LD_SB3(src, src_stride, src0, src1, src2);
3679  src += (3 * src_stride);
3680  LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3681 
3682  XORI_B3_128_SB(src0, src1, src2);
3683  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3684  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3685  src32_r, src43_r);
3686  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3687  src76_r, src87_r);
3688  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3689  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3690  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3691  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3692  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3693  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3694  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3695  offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
3696  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
3697  dst4, dst5);
3698  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3699  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3700  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3701 }
3702 
3704  int32_t src_stride,
3705  uint8_t *dst,
3706  int32_t dst_stride,
3707  const int8_t *filter,
3708  int32_t height,
3709  int32_t weight,
3710  int32_t offset,
3711  int32_t rnd_val)
3712 {
    /* Vertical 4-tap uni-weighted prediction, 8 pixels wide, 8 rows per
     * loop iteration (height is expected to be a multiple of 8). */
3713  int32_t loop_cnt;
3714  v16u8 out0, out1, out2, out3;
3715  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3716  v16i8 src10_r, src32_r, src21_r, src43_r;
3717  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3718  v8i16 filt0, filt1;
3719  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3720  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3721  v4i32 weight_vec, rnd_vec;
3722 
3723  src -= src_stride;
3724 
3725  weight = weight & 0x0000FFFF;
3726 
3727  weight_vec = __msa_fill_w(weight);
3728  rnd_vec = __msa_fill_w(rnd_val);
3729 
    /* Fold (weight * 128) >> (rnd_val - 6) into offset_vec; presumably this
     * cancels the -128 bias from the XORI-by-128 conversion -- TODO confirm. */
3730  weight *= 128;
3731  rnd_val -= 6;
3732 
3733  weight_vec_h = __msa_fill_h(weight);
3734  offset_vec = __msa_fill_h(offset);
3735  denom_vec = __msa_fill_h(rnd_val);
3736 
3737  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3738  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3739 
3740  filter_vec = LD_SH(filter);
3741  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3742 
    /* Prologue: prime the pipeline with the first three rows. */
3743  LD_SB3(src, src_stride, src0, src1, src2);
3744  src += (3 * src_stride);
3745  XORI_B3_128_SB(src0, src1, src2);
3746  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3747 
3748  for (loop_cnt = (height >> 3); loop_cnt--;) {
3749  LD_SB8(src, src_stride,
3750  src3, src4, src5, src6, src7, src8, src9, src10);
3751  src += (8 * src_stride);
3752  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3753  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3754  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3755  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3756  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3758  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3759  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3760  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3761  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3762  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3763  dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3764  dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3765  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3766  offset_vec, rnd_vec, dst0, dst1, dst2,
3767  dst3);
3768  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3769  offset_vec, rnd_vec, dst4, dst5, dst6,
3770  dst7);
3771  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3772  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3773  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
3774  dst += (8 * dst_stride);
3775 
    /* Carry the last row and last two interleaves into the next pass. */
3776  src2 = src10;
3777  src10_r = src98_r;
3778  src21_r = src109_r;
3779  }
3780 }
3781 
/* Height dispatcher for the 8-pixel-wide vertical 4-tap uni-weighted case:
 * picks the specialized kernel for heights 2, 4 and 6, and the generic
 * multiple-of-8 kernel otherwise.
 * NOTE(review): the extraction dropped the signature line with the function
 * name; presumably hevc_vt_uniwgt_4t_8w_msa — confirm in the original. */
3783  int32_t src_stride,
3784  uint8_t *dst,
3785  int32_t dst_stride,
3786  const int8_t *filter,
3787  int32_t height,
3788  int32_t weight,
3789  int32_t offset,
3790  int32_t rnd_val)
3791 {
3792  if (2 == height) {
3793  hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
3794  filter, weight, offset, rnd_val);
3795  } else if (4 == height) {
3796  hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
3797  filter, weight, offset, rnd_val);
3798  } else if (6 == height) {
3799  hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
3800  filter, weight, offset, rnd_val);
3801  } else {
/* remaining legal heights are multiples of 8 */
3802  hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
3803  filter, height, weight, offset,
3804  rnd_val);
3805  }
3806 }
3807 
3809  int32_t src_stride,
3810  uint8_t *dst,
3811  int32_t dst_stride,
3812  const int8_t *filter,
3813  int32_t height,
3814  int32_t weight,
3815  int32_t offset,
3816  int32_t rnd_val)
3817 {
3818  int32_t loop_cnt;
3819  v16u8 out0, out1, out2, out3, out4, out5;
3820  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3821  v16i8 src10_r, src32_r, src21_r, src43_r;
3822  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3823  v16i8 src2110, src4332;
3824  v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
3825  v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
3826  v8i16 filt0, filt1;
3827  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3828  v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
3829  v4i32 weight_vec, rnd_vec;
3830 
3831  src -= (1 * src_stride);
3832 
3833  weight = weight & 0x0000FFFF;
3834 
3835  weight_vec = __msa_fill_w(weight);
3836  rnd_vec = __msa_fill_w(rnd_val);
3837 
3838  weight *= 128;
3839  rnd_val -= 6;
3840 
3841  weight_vec_h = __msa_fill_h(weight);
3842  offset_vec = __msa_fill_h(offset);
3843  denom_vec = __msa_fill_h(rnd_val);
3844 
3845  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3846  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3847 
3848  filter_vec = LD_SH(filter);
3849  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3850 
3851  LD_SB3(src, src_stride, src0, src1, src2);
3852  src += (3 * src_stride);
3853  XORI_B3_128_SB(src0, src1, src2);
3854  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3855  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3856  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3857 
3858  for (loop_cnt = 2; loop_cnt--;) {
3859  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3860  src += (8 * src_stride);
3861  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3862  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3863  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3864  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3865  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3866  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3867  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3868  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3869  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3870  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3871  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3872  dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3873  dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3874  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3875  offset_vec, rnd_vec, dst0, dst1, dst2,
3876  dst3);
3877  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3878  rnd_vec, dst4, dst5);
3879  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3880  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3881  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
3882  dst += (4 * dst_stride);
3883 
3884  ILVRL_B2_SB(src7, src6, src76_r, src76_l);
3885  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
3886  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
3887  ILVRL_B2_SB(src10, src9, src109_r, src109_l);
3888  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
3889  src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
3890  dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3891  dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3892  dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3893  dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3894  dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3895  dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3896  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
3897  offset_vec, rnd_vec, dst6, dst7, dst8,
3898  dst9);
3899  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
3900  rnd_vec, dst10, dst11);
3901  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
3902  ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
3903  ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
3904  dst += (4 * dst_stride);
3905 
3906  src2 = src10;
3907  src10_r = src98_r;
3908  src21_r = src109_r;
3909  src2110 = src10998;
3910  }
3911 }
3912 
/* Vertical 4-tap filter with uni-directional weighted prediction,
 * 16-pixel width, 4 rows per loop iteration (height must be a multiple
 * of 4). Right/left byte interleaves cover the low/high 8 columns.
 * NOTE(review): signature line dropped by the extraction; presumably
 * hevc_vt_uniwgt_4t_16w_msa — confirm in the original source. */
3914  int32_t src_stride,
3915  uint8_t *dst,
3916  int32_t dst_stride,
3917  const int8_t *filter,
3918  int32_t height,
3919  int32_t weight,
3920  int32_t offset,
3921  int32_t rnd_val)
3922 {
3923  int32_t loop_cnt;
3924  v16u8 out0, out1, out2, out3;
3925  v16i8 src0, src1, src2, src3, src4, src5;
3926  v16i8 src10_r, src32_r, src21_r, src43_r;
3927  v16i8 src10_l, src32_l, src21_l, src43_l;
3928  v16i8 src54_r, src54_l, src65_r, src65_l, src6;
3929  v8i16 filt0, filt1;
3930  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3931  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3932  v4i32 weight_vec, rnd_vec;
3933 
/* one row of context above for the 4-tap filter */
3934  src -= src_stride;
3935 
3936  weight = weight & 0x0000FFFF;
3937 
3938  weight_vec = __msa_fill_w(weight);
3939  rnd_vec = __msa_fill_w(rnd_val);
3940 
/* fold (128 * weight) >> (rnd_val - 6) into offset_vec to undo the +128
 * bias introduced by the XORI_*_128 signed conversions below */
3941  weight *= 128;
3942  rnd_val -= 6;
3943 
3944  weight_vec_h = __msa_fill_h(weight);
3945  offset_vec = __msa_fill_h(offset);
3946  denom_vec = __msa_fill_h(rnd_val);
3947 
3948  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3949  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3950 
3951  filter_vec = LD_SH(filter);
3952  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3953 
3954  LD_SB3(src, src_stride, src0, src1, src2);
3955  src += (3 * src_stride);
3956  XORI_B3_128_SB(src0, src1, src2);
3957  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3958  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3959 
/* 4 output rows per iteration */
3960  for (loop_cnt = (height >> 2); loop_cnt--;) {
3961  LD_SB4(src, src_stride, src3, src4, src5, src6);
3962  src += (4 * src_stride);
3963  XORI_B4_128_SB(src3, src4, src5, src6);
3964  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3965  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3966  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3967  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
/* dst0..3: low 8 columns, dst4..7: high 8 columns of rows 0..3 */
3968  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3969  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3970  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3971  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3972  dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3973  dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3974  dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
3975  dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
3976  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3977  offset_vec, rnd_vec, dst0, dst1, dst2,
3978  dst3);
3979  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3980  offset_vec, rnd_vec, dst4, dst5, dst6,
3981  dst7);
3982  PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
3983  out2, out3);
3984  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3985  dst += (4 * dst_stride);
3986 
/* carry the last loaded row and its interleaves into the next iteration */
3987  src2 = src6;
3988  src10_r = src54_r;
3989  src21_r = src65_r;
3990  src10_l = src54_l;
3991  src21_l = src65_l;
3992  }
3993 }
3994 
/* Vertical 4-tap filter with uni-directional weighted prediction,
 * 24-pixel width: a 16-wide column (src .. src+15, full ILVR/ILVL path,
 * ST_UB4) plus an 8-wide column (src+16, right interleaves only, ST_D4 at
 * dst + 16). 4 rows per iteration.
 * NOTE(review): signature line dropped by the extraction; presumably
 * hevc_vt_uniwgt_4t_24w_msa. The loop count is hard-coded to 8 (4 rows
 * each, i.e. height 32) regardless of 'height' — confirm callers. */
3997  int32_t src_stride,
3998  uint8_t *dst,
3999  int32_t dst_stride,
4000  const int8_t *filter,
4001  int32_t height,
4002  int32_t weight,
4003  int32_t offset,
4004  int32_t rnd_val)
4005 {
4006  uint32_t loop_cnt;
4007  v16u8 out0, out1, out2, out3, out4, out5;
4008  v16i8 src0, src1, src2, src3, src4, src5;
4009  v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
4010  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
4011  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4012  v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
4013  v8i16 filt0, filt1;
4014  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
4015  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
4016  v4i32 weight_vec, rnd_vec;
4017 
4018  src -= src_stride;
4019 
4020  weight = weight & 0x0000FFFF;
4021 
4022  weight_vec = __msa_fill_w(weight);
4023  rnd_vec = __msa_fill_w(rnd_val);
4024 
/* offset_vec absorbs (128 * weight) >> (rnd_val - 6): compensation for the
 * +128 bias removed from the bytes by the XORI_*_128 conversions below */
4025  weight *= 128;
4026  rnd_val -= 6;
4027 
4028  weight_vec_h = __msa_fill_h(weight);
4029  offset_vec = __msa_fill_h(offset);
4030  denom_vec = __msa_fill_h(rnd_val);
4031 
4032  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4033  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4034 
4035  filter_vec = LD_SH(filter);
4036  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4037 
/* prologue: 3 context rows for both the 16-wide and the 8-wide column */
4038  LD_SB3(src, src_stride, src0, src1, src2);
4039  LD_SB3(src + 16, src_stride, src7, src8, src9);
4040  src += (3 * src_stride);
4041  XORI_B3_128_SB(src0, src1, src2);
4042  XORI_B3_128_SB(src7, src8, src9);
4043  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4044  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4045  ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
4046 
4047  for (loop_cnt = 8; loop_cnt--;) {
4048  LD_SB4(src, src_stride, src3, src4, src5, src6);
4049  LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
4050  src += (4 * src_stride);
4051  XORI_B4_128_SB(src3, src4, src5, src6);
4052  XORI_B4_128_SB(src10, src11, src12, src13);
4053  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4054  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4055  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4056  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4057  ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
4058  ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
/* dst0..7: 16-wide column (low/high 8 lanes); dst8..11: 8-wide column */
4058  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4059  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4060  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
4061  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
4062  dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4063  dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4064  dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
4065  dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
4066  dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4067  dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
4068  dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
4069  dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
4070  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4071  offset_vec, rnd_vec, dst0, dst1, dst2,
4072  dst3);
4073  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4074  offset_vec, rnd_vec, dst4, dst5, dst6,
4075  dst7);
4076  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
4077  offset_vec, rnd_vec, dst8, dst9, dst10,
4078  dst11);
4079  PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
4080  out2, out3);
4081  PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
4082  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4083  ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
4084  dst += (4 * dst_stride);
4085 
/* carry state for both columns into the next iteration */
4086  src2 = src6;
4087  src9 = src13;
4088  src10_r = src54_r;
4089  src21_r = src65_r;
4090  src10_l = src54_l;
4091  src21_l = src65_l;
4092  src87_r = src1211_r;
4093  src98_r = src1312_r;
4094  }
4095 }
4096 
/* Vertical 4-tap filter with uni-directional weighted prediction,
 * 32-pixel width: two independent 16-wide columns (src and src + 16),
 * 2 rows per loop iteration (height must be even).
 * NOTE(review): signature line dropped by the extraction; presumably
 * hevc_vt_uniwgt_4t_32w_msa — confirm in the original source. */
4098  int32_t src_stride,
4099  uint8_t *dst,
4100  int32_t dst_stride,
4101  const int8_t *filter,
4102  int32_t height,
4103  int32_t weight,
4104  int32_t offset,
4105  int32_t rnd_val)
4106 {
4107  uint32_t loop_cnt;
4108  v16u8 out0, out1, out2, out3;
4109  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
4110  v16i8 src10_r, src32_r, src76_r, src98_r;
4111  v16i8 src21_r, src43_r, src65_r, src87_r;
4112  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4113  v16i8 src10_l, src32_l, src76_l, src98_l;
4114  v16i8 src21_l, src43_l, src65_l, src87_l;
4115  v8i16 filt0, filt1;
4116  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
4117  v4i32 weight_vec, rnd_vec;
4118 
4119  src -= src_stride;
4120 
4121  weight = weight & 0x0000FFFF;
4122 
4123  weight_vec = __msa_fill_w(weight);
4124  rnd_vec = __msa_fill_w(rnd_val);
4125 
/* fold (128 * weight) >> (rnd_val - 6) into offset_vec to undo the +128
 * bias introduced by the XORI_*_128 signed conversions below */
4126  weight *= 128;
4127  rnd_val -= 6;
4128 
4129  weight_vec_h = __msa_fill_h(weight);
4130  offset_vec = __msa_fill_h(offset);
4131  denom_vec = __msa_fill_h(rnd_val);
4132 
4133  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4134  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4135 
4136  filter_vec = LD_SH(filter);
4137  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4138 
/* prologue: 3 context rows for the left (src0..2) and right (src5..7)
 * 16-wide columns */
4139  LD_SB3(src, src_stride, src0, src1, src2);
4140  LD_SB3(src + 16, src_stride, src5, src6, src7);
4141  src += (3 * src_stride);
4142  XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
4143  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4144  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4145  ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
4146  ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
4147 
/* 2 output rows (of 32 pixels) per iteration */
4148  for (loop_cnt = (height >> 1); loop_cnt--;) {
4149  LD_SB2(src, src_stride, src3, src4);
4150  LD_SB2(src + 16, src_stride, src8, src9);
4151  src += (2 * src_stride);
4152  XORI_B4_128_SB(src3, src4, src8, src9);
4153  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4154  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4155  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
4156  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
/* dst0..3: left column rows 0/1 (low/high lanes); dst4..7: right column */
4157  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4158  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4159  dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4160  dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4161  dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
4162  dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4163  dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
4164  dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4165  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4166  offset_vec, rnd_vec, dst0, dst1, dst2,
4167  dst3);
4168  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4169  offset_vec, rnd_vec, dst4, dst5, dst6,
4170  dst7);
4171  PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
4172  out2, out3);
/* each output row is two adjacent 16-byte stores */
4173  ST_UB2(out0, out2, dst, 16);
4174  dst += dst_stride;
4175  ST_UB2(out1, out3, dst, 16);
4176  dst += dst_stride;
4177 
4178  src2 = src4;
4179  src7 = src9;
4180  src10_r = src32_r;
4181  src21_r = src43_r;
4182  src10_l = src32_l;
4183  src21_l = src43_l;
4184  src65_r = src87_r;
4185  src76_r = src98_r;
4186  src65_l = src87_l;
4187  src76_l = src98_l;
4188  }
4189 }
4190 
/* 2-D (horizontal + vertical) 4-tap filter with uni-directional weighted
 * prediction, 4x2 block. Horizontal pass uses VSHF byte shuffles with the
 * 4-width mask pair from ff_hevc_mask_arr; the 16-bit intermediates are
 * then filtered vertically at 32-bit precision, >> 6, weighted, rounded,
 * offset and clipped.
 * NOTE(review): signature line dropped by the extraction; this is
 * hevc_hv_uniwgt_4t_4x2_msa per the 4w dispatcher below. */
4192  int32_t src_stride,
4193  uint8_t *dst,
4194  int32_t dst_stride,
4195  const int8_t *filter_x,
4196  const int8_t *filter_y,
4197  int32_t weight,
4198  int32_t offset,
4199  int32_t rnd_val)
4200 {
4201  v16u8 out;
4202  v16i8 src0, src1, src2, src3, src4;
4203  v8i16 filt0, filt1;
4204  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4205  v16i8 mask1;
4206  v8i16 filt_h0, filt_h1, filter_vec, tmp;
4207  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4208  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
4209  v8i16 offset_vec, const_128, denom_vec;
4210  v4i32 dst0, dst1, weight_vec, rnd_vec;
4211 
/* one column left and one row above: context for the two 4-tap passes */
4212  src -= (src_stride + 1);
4213 
4214  filter_vec = LD_SH(filter_x);
4215  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4216 
4217  filter_vec = LD_SH(filter_y);
4218  UNPCK_R_SB_SH(filter_vec, filter_vec);
4219 
4220  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4221 
4222  mask1 = mask0 + 2;
4223 
4224  weight_vec = __msa_fill_w(weight);
4225  rnd_vec = __msa_fill_w(rnd_val);
4226 
/* offset_vec absorbs (128 * weight) >> (rnd_val - 6): compensation for the
 * +128 bias removed from the source bytes by XORI_B5_128_SB below */
4227  offset_vec = __msa_fill_h(offset);
4228  denom_vec = __msa_fill_h(rnd_val - 6);
4229  const_128 = __msa_fill_h((128 * weight));
4230  offset_vec += __msa_srar_h(const_128, denom_vec);
4231 
4232  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4233  XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* horizontal pass: two rows packed per vector (rows 0/2, 1/3, 2/4) */
4234  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4235  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4236  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4237  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4238  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4239  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4240  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4241  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
/* vertical pass on 16-bit intermediates, then drop the filter headroom */
4242  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4243  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4244  dst0 >>= 6;
4245  dst1 >>= 6;
/* weight, round, offset, clip, pack to bytes, store 2 rows of 4 pixels */
4246  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4247  SRAR_W2_SW(dst0, dst1, rnd_vec);
4248  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4249  tmp += offset_vec;
4250  CLIP_SH_0_255(tmp);
4251  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4252  ST_W2(out, 0, 1, dst, dst_stride);
4253 }
4254 
/* 2-D (horizontal + vertical) 4-tap filter with uni-directional weighted
 * prediction, 4x4 block. Same scheme as the 4x2 kernel: VSHF-based
 * horizontal pass with two source rows per vector, 32-bit vertical pass,
 * >> 6, weight, round, offset, clip.
 * NOTE(review): signature line dropped by the extraction; this is
 * hevc_hv_uniwgt_4t_4x4_msa per the 4w dispatcher below. */
4256  int32_t src_stride,
4257  uint8_t *dst,
4258  int32_t dst_stride,
4259  const int8_t *filter_x,
4260  const int8_t *filter_y,
4261  int32_t weight,
4262  int32_t offset,
4263  int32_t rnd_val)
4264 {
4265  v16u8 out;
4266  v16i8 src0, src1, src2, src3, src4, src5, src6;
4267  v8i16 filt0, filt1;
4268  v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
4269  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4270  v16i8 mask1;
4271  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4272  v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
4273  v8i16 offset_vec, const_128, denom_vec;
4274  v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
4275 
/* one column left and one row above: context for the two 4-tap passes */
4276  src -= (src_stride + 1);
4277 
4278  filter_vec = LD_SH(filter_x);
4279  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4280 
4281  filter_vec = LD_SH(filter_y);
4282  UNPCK_R_SB_SH(filter_vec, filter_vec);
4283 
4284  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4285 
4286  mask1 = mask0 + 2;
4287 
4288  weight_vec = __msa_fill_w(weight);
4289  rnd_vec = __msa_fill_w(rnd_val);
4290 
/* offset_vec absorbs (128 * weight) >> (rnd_val - 6): compensation for the
 * +128 bias removed from the source bytes by XORI_B7_128_SB below */
4291  offset_vec = __msa_fill_h(offset);
4292  denom_vec = __msa_fill_h(rnd_val - 6);
4293  const_128 = __msa_fill_h((128 * weight));
4294  offset_vec += __msa_srar_h(const_128, denom_vec);
4295 
4296  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4297  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* horizontal pass: rows paired 3 apart (0/3, 1/4, 2/5, 3/6) per vector */
4298  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4299  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4300  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4301  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4302  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4303  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4304  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4305  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4306  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4307  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4308  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
/* vertical pass, then weight/round/offset/clip and store 4 rows of 4 */
4309  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4310  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4311  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4312  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4313  SRA_4V(dst0, dst1, dst2, dst3, 6);
4314  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4315  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4316  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4317  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4318  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4319  CLIP_SH2_0_255(tmp0, tmp1);
4320  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4321  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4322 }
4323 
/* 2-D (horizontal + vertical) 4-tap filter with uni-directional weighted
 * prediction, 4-pixel width, heights that are a multiple of 8. Processes
 * 8 rows per iteration; rows are paired 4 apart in each horizontal VSHF
 * vector (3/7, 4/8, 5/9, 6/10).
 * NOTE(review): signature line dropped by the extraction; this is
 * hevc_hv_uniwgt_4t_4multx8mult_msa per the 4w dispatcher below. */
4325  int32_t src_stride,
4326  uint8_t *dst,
4327  int32_t dst_stride,
4328  const int8_t *filter_x,
4329  const int8_t *filter_y,
4330  int32_t height,
4331  int32_t weight,
4332  int32_t offset,
4333  int32_t rnd_val)
4334 {
4335  uint32_t loop_cnt;
4336  v16u8 out0, out1;
4337  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4338  v8i16 filt0, filt1;
4339  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4340  v16i8 mask1;
4341  v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4342  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4343  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4344  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4345  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4346  v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
4347  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
4348 
/* one column left and one row above: context for the two 4-tap passes */
4349  src -= (src_stride + 1);
4350 
4351  filter_vec = LD_SH(filter_x);
4352  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4353 
4354  filter_vec = LD_SH(filter_y);
4355  UNPCK_R_SB_SH(filter_vec, filter_vec);
4356 
4357  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4358 
4359  mask1 = mask0 + 2;
4360 
4361  weight_vec = __msa_fill_w(weight);
4362  rnd_vec = __msa_fill_w(rnd_val);
4363 
/* offset_vec absorbs (128 * weight) >> (rnd_val - 6): compensation for the
 * +128 bias removed from the source bytes by the XORI_*_128 conversions */
4364  offset_vec = __msa_fill_h(offset);
4365  denom_vec = __msa_fill_h(rnd_val - 6);
4366  const_128 = __msa_fill_h((128 * weight));
4367  offset_vec += __msa_srar_h(const_128, denom_vec);
4368 
/* prologue: horizontal-filter the first 3 rows and seed the vertical
 * pipeline (dst22 holds row 2 replicated for the first ilvr in the loop) */
4369  LD_SB3(src, src_stride, src0, src1, src2);
4370  src += (3 * src_stride);
4371  XORI_B3_128_SB(src0, src1, src2);
4372 
4373  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4374  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4375  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4376  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4377  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4378  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4379 
4380  for (loop_cnt = height >> 3; loop_cnt--;) {
4381  LD_SB8(src, src_stride,
4382  src3, src4, src5, src6, src7, src8, src9, src10);
4383  src += (8 * src_stride);
4384  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4385 
4386  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4387  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4388  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4389  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4390  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4391  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4392  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4393  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4394  dst32_r = __msa_ilvr_h(dst73, dst22);
4395  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4396  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4397  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4398  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4399  dst76_r = __msa_ilvr_h(dst22, dst106);
/* vertical pass for 8 rows, then drop the 6-bit filter headroom */
4400  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4401  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4402  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4403  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4404  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4405  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4406  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4407  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4408  SRA_4V(dst0, dst1, dst2, dst3, 6);
4409  SRA_4V(dst4, dst5, dst6, dst7, 6);
/* weight, round, offset, clip, pack, store 8 rows of 4 pixels */
4410  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4411  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4412  MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
4413  MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
4414  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4415  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4416  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4417  tmp2, tmp3);
4418  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4419  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4420  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4421  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4422  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4423  dst += (8 * dst_stride);
4424 
/* carry the vertical pipeline state into the next iteration */
4425  dst10_r = dst98_r;
4426  dst21_r = dst109_r;
4427  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4428  }
4429 }
4430 
/* Height dispatcher for the 4-pixel-wide 2-D 4-tap uni-weighted case:
 * heights 2 and 4 use fixed-size kernels, multiples of 8 use the generic
 * multx8 kernel; other heights fall through with no output.
 * NOTE(review): signature line dropped by the extraction; presumably
 * hevc_hv_uniwgt_4t_4w_msa — confirm in the original. */
4432  int32_t src_stride,
4433  uint8_t *dst,
4434  int32_t dst_stride,
4435  const int8_t *filter_x,
4436  const int8_t *filter_y,
4437  int32_t height,
4438  int32_t weight,
4439  int32_t offset,
4440  int32_t rnd_val)
4441 {
4442  if (2 == height) {
4443  hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
4444  filter_x, filter_y, weight,
4445  offset, rnd_val);
4446  } else if (4 == height) {
4447  hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
4448  filter_x,filter_y, weight,
4449  offset, rnd_val);
4450  } else if (0 == (height % 8)) {
4451  hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
4452  filter_x, filter_y, height, weight,
4453  offset, rnd_val);
4454  }
4455 }
4456 
4458  int32_t src_stride,
4459  uint8_t *dst,
4460  int32_t dst_stride,
4461  const int8_t *filter_x,
4462  const int8_t *filter_y,
4463  int32_t height,
4464  int32_t weight,
4465  int32_t offset,
4466  int32_t rnd_val)
4467 {
4468  v16u8 out0, out1, out2;
4469  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4470  v8i16 filt0, filt1;
4471  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4472  v16i8 mask1;
4473  v8i16 filt_h0, filt_h1, filter_vec;
4474  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4475  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4476  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4477  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4478  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4479  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4480  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4481  v8i16 offset_vec, const_128, denom_vec;
4482  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4483  v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
4484 
4485  src -= (src_stride + 1);
4486 
4487  filter_vec = LD_SH(filter_x);
4488  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4489 
4490  filter_vec = LD_SH(filter_y);
4491  UNPCK_R_SB_SH(filter_vec, filter_vec);
4492 
4493  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4494 
4495  mask1 = mask0 + 2;
4496 
4497  weight_vec = __msa_fill_w(weight);
4498  rnd_vec = __msa_fill_w(rnd_val);
4499 
4500  offset_vec = __msa_fill_h(offset);
4501  denom_vec = __msa_fill_h(rnd_val - 6);
4502  const_128 = __msa_fill_h((128 * weight));
4503  offset_vec += __msa_srar_h(const_128, denom_vec);
4504 
4505  LD_SB3(src, src_stride, src0, src1, src2);
4506  src += (3 * src_stride);
4507  XORI_B3_128_SB(src0, src1, src2);
4508 
4509  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4510  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4511  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4512  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4513  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4514  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4515  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4516  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4517 
4518  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
4519  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4520  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4521  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4522  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4523  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4524  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4525  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4526  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4527  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4528  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4529  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4530  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4531  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4532  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4533  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4534  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4535  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4536  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4537  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4538  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4539  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4540  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4541  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4542  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4543  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4544  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4545  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4546  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4547  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4548  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4549  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4550  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4551  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4552  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4553  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4554  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4555  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4556  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4557  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4558  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4559  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4560  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4561  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4562  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4563  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4564  MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4565  MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
4566  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4567  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4568  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
4569  SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
4570  SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
4571  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4572  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4573  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4574  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4575  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4576  ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4577  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4578  CLIP_SH2_0_255(tmp4, tmp5);
4579  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4580  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4581  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
4582 }
4583 
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* Uni-directional weighted 2D (horizontal + vertical) 4-tap HEVC
     * interpolation for one 8x2 block: horizontal 4-tap filter on 5 input
     * rows, vertical 4-tap filter across the intermediate rows, then
     * weight, round, offset and clip to [0, 255]. */
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 weight_vec, rnd_vec;

    /* Back up one row and one column so the 4-tap windows cover the block. */
    src -= (src_stride + 1);

    /* Horizontal filter taps (two 2-tap halves). */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Vertical filter taps, sign-extended to 16 bit. */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* Fold the +128 level shift (sources are XORed with 128 below) back
     * into the rounding offset: offset += (128 * weight) >> (rnd_val - 6). */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    /* 2 output rows need 2 + 3 = 5 input rows for the vertical 4-tap. */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    /* Horizontal pass: one 16-bit intermediate row per input row. */
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    /* Interleave consecutive rows for the vertical dot products. */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    /* Vertical pass. */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    /* Weight, round, offset, clip. */
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    /* Store two 8-byte rows. */
    ST_D2(out, 0, 1, dst, dst_stride);
}
4659 
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t width8mult,
                                          int32_t weight,
                                          int32_t offset,
                                          int32_t rnd_val)
{
    /* Uni-directional weighted 2D 4-tap interpolation for (width8mult * 8)
     * x 4 blocks: processes one 8x4 column per loop iteration. */
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    /* Back up one row and one column so the 4-tap windows cover the block. */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* Fold the +128 level shift (XORI below) into the rounding offset. */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        /* 4 output rows need 4 + 3 = 7 input rows. */
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;   /* advance to the next 8-pixel column */
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        /* Horizontal pass on the 3 context rows. */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        /* Horizontal pass on the 4 new rows. */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
        /* Vertical pass: one 32-bit row (right/left halves) per output row. */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        /* Weight, round, offset, clip. */
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        /* Store four 8-byte rows of this column. */
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
4754 
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* Uni-directional weighted 2D 4-tap interpolation for one 8x6 block
     * (fully unrolled: 9 input rows -> 6 output rows). */
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 offset_vec, const_128, denom_vec;

    /* Back up one row and one column so the 4-tap windows cover the block. */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* Fold the +128 level shift (XORI below) into the rounding offset. */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    /* 6 output rows need 6 + 3 = 9 input rows. */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);
    /* Horizontal pass on all 9 rows. */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
    /* Interleave consecutive intermediate rows for the vertical pass. */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    /* Vertical pass: six output rows, right/left halves. */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    /* Weight, round, offset, clip. */
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
    MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
    MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
    MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
    MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
    SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
    ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
    ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    /* Store six 8-byte rows (4 + 2). */
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
4870 
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
                                              int32_t height,
                                              int32_t weight,
                                              int32_t offset,
                                              int32_t rnd_val)
{
    /* Uni-directional weighted 2D 4-tap interpolation for blocks that are
     * (width8mult * 8) wide and a multiple of 4 high: iterates 8-pixel
     * columns in the outer loop and 4-row strips in the inner loop,
     * carrying the last 2 interleaved rows between strips. */
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 weight_vec, rnd_vec;

    /* Back up one row and one column so the 4-tap windows cover the block. */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* Fold the +128 level shift (XORI below) into the rounding offset. */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        /* Prologue: 3 context rows for the vertical 4-tap. */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        /* Main loop: 4 output rows per iteration. */
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            /* Horizontal pass on the 4 new rows. */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
            /* Vertical pass. */
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            /* Weight, round, offset, clip. */
            MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
            MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
            MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
            MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
            SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
            SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
            ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            /* Carry the last two interleaved rows into the next strip. */
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        /* Next 8-pixel column. */
        src += 8;
        dst += 8;
    }
}
4992 
4994  int32_t src_stride,
4995  uint8_t *dst,
4996  int32_t dst_stride,
4997  const int8_t *filter_x,
4998  const int8_t *filter_y,
4999  int32_t height,
5000  int32_t weight,
5001  int32_t offset,
5002  int32_t rnd_val)
5003 {
5004 
5005  if (2 == height) {
5006  hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
5007  filter_x, filter_y, weight,
5008  offset, rnd_val);
5009  } else if (4 == height) {
5010  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
5011  filter_x, filter_y, 1, weight,
5012  offset, rnd_val);
5013  } else if (6 == height) {
5014  hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
5015  filter_x, filter_y, weight,
5016  offset, rnd_val);
5017  } else if (0 == (height % 4)) {
5018  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5019  filter_x, filter_y, height, weight,
5020  offset, rnd_val, 1);
5021  }
5022 }
5023 
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* Uni-directional weighted 2D 4-tap interpolation for 12-wide blocks:
     * the left 8 columns are processed in 4-row strips (4 iterations, so
     * effectively 16 rows), then the remaining 4 columns are processed
     * 8 rows at a time using the paired 4-width shuffle masks. */
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v8i16 offset_vec, const_128, denom_vec;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;

    /* Back up one row and one column so the 4-tap windows cover the block. */
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    /* Fold the +128 level shift (XORI below) into the rounding offset. */
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val - 6);
    const_128 = __msa_fill_h((128 * weight));
    offset_vec += __msa_srar_h(const_128, denom_vec);

    /* --- Left 8-wide column --- */
    src_tmp = src;
    dst_tmp = dst;

    /* Prologue: 3 context rows. */
    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    /* 4 strips x 4 rows. */
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        /* Horizontal pass on the 4 new rows. */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
        /* Vertical pass. */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        /* Weight, round, offset, clip. */
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
        MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
        MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
        SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
        SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        /* Carry the last two interleaved rows into the next strip. */
        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    /* --- Remaining 4-wide column --- */
    src += 8;
    dst += 8;

    /* 4-width masks pair two source rows per shuffle. */
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    /* Prologue: 3 context rows, two rows per vector. */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    /* 2 strips x 8 rows. */
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* Horizontal pass, rows paired (3&7, 4&8, 5&9, 6&10). */
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
        /* Rebuild the row-pair interleaves for the vertical pass. */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);
        /* Vertical pass: eight 4-wide output rows. */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        /* Weight, round, offset, clip. */
        MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
        MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
        MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
        MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
        ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        /* Store eight 4-byte rows. */
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        /* Carry interleaves and last row into the next strip. */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
5200 
5202  int32_t src_stride,
5203  uint8_t *dst,
5204  int32_t dst_stride,
5205  const int8_t *filter_x,
5206  const int8_t *filter_y,
5207  int32_t height,
5208  int32_t weight,
5209  int32_t offset,
5210  int32_t rnd_val)
5211 {
5212  if (4 == height) {
5213  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
5214  filter_x, filter_y, 2, weight, offset,
5215  rnd_val);
5216  } else {
5217  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5218  filter_x, filter_y, height, weight,
5219  offset, rnd_val, 2);
5220  }
5221 }
5222 
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* 24-wide = three 8-wide columns (width8mult == 3). */
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 3);
}
5238 
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    /* 32-wide = four 8-wide columns (width8mult == 4). */
    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                      filter_x, filter_y, height, weight,
                                      offset, rnd_val, 4);
}
5254 
/* Emit the public uni-weighted "copy" (pel_pixels, no interpolation)
 * entry point for a given block WIDTH.  shift = denom + 14 - 8 converts
 * the weighted-prediction denominator to the final rounding shift for
 * 8-bit output (14-bit intermediate precision). */
#define UNIWGT_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \
                                                      ptrdiff_t dst_stride,   \
                                                      uint8_t *src,           \
                                                      ptrdiff_t src_stride,   \
                                                      int height,             \
                                                      int denom,              \
                                                      int weight,             \
                                                      int offset,             \
                                                      intptr_t mx,            \
                                                      intptr_t my,            \
                                                      int width)              \
{                                                                             \
    int shift = denom + 14 - 8;                                               \
    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,         \
                                    height, weight, offset, shift);           \
}

/* Instantiate every block width the HEVC decoder dispatches on. */
UNIWGT_MC_COPY(4);
UNIWGT_MC_COPY(6);
UNIWGT_MC_COPY(8);
UNIWGT_MC_COPY(12);
UNIWGT_MC_COPY(16);
UNIWGT_MC_COPY(24);
UNIWGT_MC_COPY(32);
UNIWGT_MC_COPY(48);
UNIWGT_MC_COPY(64);

#undef UNIWGT_MC_COPY
5284 
/* Emit a public uni-weighted 1D (horizontal or vertical) interpolation
 * entry point.  PEL selects the filter table (qpel 8-tap / epel 4-tap),
 * DIR/DIR1 the direction naming (h/hz, v/vt), and FILT_DIR which motion
 * component (mx or my) indexes the filter table. */
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                      \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,      \
                                                         ptrdiff_t          \
                                                         dst_stride,        \
                                                         uint8_t *src,      \
                                                         ptrdiff_t          \
                                                         src_stride,        \
                                                         int height,        \
                                                         int denom,         \
                                                         int weight,        \
                                                         int offset,        \
                                                         intptr_t mx,       \
                                                         intptr_t my,       \
                                                         int width)         \
{                                                                           \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];           \
    int shift = denom + 14 - 8;                                             \
                                                                            \
    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,      \
                                                 dst_stride, filter, height, \
                                                 weight, offset, shift);    \
}

/* Luma (qpel, 8-tap) horizontal. */
UNI_W_MC(qpel, h, 4, 8, hz, mx);
UNI_W_MC(qpel, h, 8, 8, hz, mx);
UNI_W_MC(qpel, h, 12, 8, hz, mx);
UNI_W_MC(qpel, h, 16, 8, hz, mx);
UNI_W_MC(qpel, h, 24, 8, hz, mx);
UNI_W_MC(qpel, h, 32, 8, hz, mx);
UNI_W_MC(qpel, h, 48, 8, hz, mx);
UNI_W_MC(qpel, h, 64, 8, hz, mx);

/* Luma (qpel, 8-tap) vertical. */
UNI_W_MC(qpel, v, 4, 8, vt, my);
UNI_W_MC(qpel, v, 8, 8, vt, my);
UNI_W_MC(qpel, v, 12, 8, vt, my);
UNI_W_MC(qpel, v, 16, 8, vt, my);
UNI_W_MC(qpel, v, 24, 8, vt, my);
UNI_W_MC(qpel, v, 32, 8, vt, my);
UNI_W_MC(qpel, v, 48, 8, vt, my);
UNI_W_MC(qpel, v, 64, 8, vt, my);

/* Chroma (epel, 4-tap) horizontal. */
UNI_W_MC(epel, h, 4, 4, hz, mx);
UNI_W_MC(epel, h, 6, 4, hz, mx);
UNI_W_MC(epel, h, 8, 4, hz, mx);
UNI_W_MC(epel, h, 12, 4, hz, mx);
UNI_W_MC(epel, h, 16, 4, hz, mx);
UNI_W_MC(epel, h, 24, 4, hz, mx);
UNI_W_MC(epel, h, 32, 4, hz, mx);

/* Chroma (epel, 4-tap) vertical. */
UNI_W_MC(epel, v, 4, 4, vt, my);
UNI_W_MC(epel, v, 6, 4, vt, my);
UNI_W_MC(epel, v, 8, 4, vt, my);
UNI_W_MC(epel, v, 12, 4, vt, my);
UNI_W_MC(epel, v, 16, 4, vt, my);
UNI_W_MC(epel, v, 24, 4, vt, my);
UNI_W_MC(epel, v, 32, 4, vt, my);

#undef UNI_W_MC
5343 
/* Emit a public uni-weighted 2D (horizontal + vertical) interpolation
 * entry point; mx and my index the horizontal and vertical filter
 * coefficient tables respectively. */
#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int height,            \
                                                      int denom,             \
                                                      int weight,            \
                                                      int offset,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
    int shift = denom + 14 - 8;                                              \
                                                                             \
    hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                           filter_x, filter_y, height,       \
                                           weight, offset, shift);           \
}

/* Luma (qpel, 8-tap) 2D. */
UNI_W_MC_HV(qpel, 4, 8);
UNI_W_MC_HV(qpel, 8, 8);
UNI_W_MC_HV(qpel, 12, 8);
UNI_W_MC_HV(qpel, 16, 8);
UNI_W_MC_HV(qpel, 24, 8);
UNI_W_MC_HV(qpel, 32, 8);
UNI_W_MC_HV(qpel, 48, 8);
UNI_W_MC_HV(qpel, 64, 8);

/* Chroma (epel, 4-tap) 2D. */
UNI_W_MC_HV(epel, 4, 4);
UNI_W_MC_HV(epel, 6, 4);
UNI_W_MC_HV(epel, 8, 4);
UNI_W_MC_HV(epel, 12, 4);
UNI_W_MC_HV(epel, 16, 4);
UNI_W_MC_HV(epel, 24, 4);
UNI_W_MC_HV(epel, 32, 4);

#undef UNI_W_MC_HV
#define VSHF_B4_SB(...)
static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B5_128_SB(...)
#define CLIP_SW4_0_255(in0, in1, in2, in3)
#define XORI_B8_128_SB(...)
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
#define ILVRL_B2_SH(...)
static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_D2(in, idx0, idx1, pdst, stride)
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVR_H4_SH(...)
static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNI_W_MC_HV(PEL, WIDTH, TAP)
#define XORI_B2_128_SB(...)
#define MUL2(in0, in1, in2, in3, out0, out1)
static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD_SB(...)
#define XORI_B3_128_SB(...)
static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVRL_H2_SW(...)
#define PCKEV_B3_UB(...)
#define UNPCK_R_SB_SH(in, out)
#define ILVR_B2_SB(...)
#define SPLATI_H2_SH(...)
#define src
Definition: vp8dsp.c:254
#define ILVL_H2_SH(...)
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,out0_h, out1_h)
#define LD_SB2(...)
#define ILVL_H4_SH(...)
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t width8mult)
static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define VSHF_B2_SB(...)
static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SRA_4V(in0, in1, in2, in3, shift)
static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVR_D2_SB(...)
uint8_t
#define LD4(psrc, stride, out0, out1, out2, out3)
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
#define SPLATI_W2_SH(...)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define CLIP_SH_0_255(in)
static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SPLATI_H4_SH(...)
#define DOTP_SH2_SW(...)
#define ILVL_B2_SB(...)
#define height
#define ST_D1(in, idx, pdst)
static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD_SH(...)
#define ILVRL_H2_SH(...)
#define ST_H2(in, idx0, idx1, pdst, stride)
#define ILVR_D3_SB(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
#define CLIP_SH2_0_255(in0, in1)
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,idx6, idx7, pdst, stride)
static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static int aligned(int val)
Definition: dashdec.c:178
static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B7_128_SB(...)
static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define zero
Definition: regdef.h:64
#define LW2(psrc, stride, out0, out1)
#define ILVR_B2_SH(...)
#define XORI_B4_128_SB(...)
#define SPLATI_W4_SH(...)
static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t weightmul16)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define CLIP_SW2_0_255(in0, in1)
static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define width
#define PCKEV_D2_SH(...)
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult, int32_t weight, int32_t offset, int32_t rnd_val)
#define SPLATI_W4_SW(...)
static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define PCKEV_H2_SW(...)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
int32_t
#define PCKEV_H2_SH(...)
#define LD_SB3(...)
#define SRAR_W4_SW(...)
static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_UB(...)
static void hevc_uniwgt_copy_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD_SB4(...)
static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define PCKEV_B4_UB(...)
static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
#define ST_UB2(...)
static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_UB4(...)
#define src1
Definition: h264pred.c:139
static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t width)
static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVL_B4_SB(...)
static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SRAR_W2_SW(...)
static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static const uint8_t ff_hevc_mask_arr[16 *2]
static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVR_B4_SH(...)
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_uniwgt_copy_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
#define src0
Definition: h264pred.c:138
#define ADD2(in0, in1, in2, in3, out0, out1)
#define INSERT_W2_SB(...)
static int weight(int i, int blen, int offset)
Definition: diracdec.c:1564
static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SLLI_4V(in0, in1, in2, in3, shift)
static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD_SB7(...)
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD_SB5(...)
static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVL_W2_SB(...)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define LW4(psrc, stride, out0, out1, out2, out3)
static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVRL_B2_SB(...)
#define LD_SB6(...)
#define UNIWGT_MC_COPY(WIDTH)
#define ILVR_H2_SH(...)
#define INSERT_D2_SB(...)
static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B6_128_SB(...)
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,offset_h, rnd_w, out0_h, out1_h,out2_h, out3_h)
static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ILVR_B4_SB(...)
FILE * out
Definition: movenc.c:54
static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_W2(in, idx0, idx1, pdst, stride)
#define SLLI_2V(in0, in1, shift)
#define PCKEV_H4_SH(...)
#define INSERT_W4_SB(...)
#define LD2(psrc, stride, out0, out1)
static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define PCKEV_B2_UB(...)
static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
static uint8_t tmp[11]
Definition: aes_ctr.c:26