FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hevc_mc_biw_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
24 
/* Shuffle-control tables for the MSA VSHF instructions.
 * Row 0 (bytes 0..15): sliding-window byte pairs within one 16-byte vector,
 *   used for 8-and-wider horizontal filtering.
 * Row 1 (bytes 16..31): same pattern but the second half indexes into the
 *   second source vector (indices >= 16), used for 4-wide filtering where
 *   two rows are packed into one register pair. */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
30 
/* Bi-weighted round & clip of two 16-bit vectors.
 * Interleaves each element of inN with the matching element of vecN and
 * applies the packed-halfword dot product with 'wgt' (two 16-bit weights per
 * 32-bit lane) accumulated onto 'offset'; the 32-bit sums are rounded
 * right-shifted by 'rnd', packed back to 16 bit and clipped to 0..255.
 * Results land in out0/out1 (v8i16).
 * NOTE(review): which operand receives which weight follows from the
 * halfword order of 'wgt' as packed by the callers (weight0 low,
 * weight1 high). */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
                           out0, out1)                             \
{                                                                  \
    v4i32 out0_r, out1_r, out0_l, out1_l;                          \
                                                                   \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);              \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);              \
                                                                   \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
                                                                   \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);               \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);       \
    CLIP_SH2_0_255(out0, out1);                                    \
}
48 
/* Four-vector variant of HEVC_BIW_RND_CLIP2: processes two pairs. */
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,      \
                           wgt, rnd, offset, out0, out1, out2, out3)        \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
}
55 
/* Same weighted combine as HEVC_BIW_RND_CLIP2 but the final 0..255 clip
 * uses the saturating variant (CLIP_SH2_0_255_MAX_SATU). */
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    v4i32 out0_r, out1_r, out0_l, out1_l;                            \
                                                                     \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                             \
}
71 
/* Four-vector variant of HEVC_BIW_RND_CLIP2_MAX_SATU: two pairs. */
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}
81 
82 static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
83  int32_t src_stride,
84  int16_t *src1_ptr,
85  int32_t src2_stride,
86  uint8_t *dst,
87  int32_t dst_stride,
89  int32_t weight0,
90  int32_t weight1,
91  int32_t offset0,
92  int32_t offset1,
93  int32_t rnd_val)
94 {
95  uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96  uint64_t tpd0, tpd1, tpd2, tpd3;
98  v16u8 out0, out1;
99  v16i8 zero = { 0 };
100  v16i8 src0 = { 0 }, src1 = { 0 };
101  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102  v8i16 dst0, dst1, dst2, dst3, weight_vec;
103  v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
104 
105  offset = (offset0 + offset1) << rnd_val;
106  weight0 = weight0 & 0x0000FFFF;
107  weight = weight0 | (weight1 << 16);
108 
109  offset_vec = __msa_fill_w(offset);
110  weight_vec = (v8i16) __msa_fill_w(weight);
111  rnd_vec = __msa_fill_w(rnd_val + 1);
112 
113  if (2 == height) {
114  LW2(src0_ptr, src_stride, tp0, tp1);
115  INSERT_W2_SB(tp0, tp1, src0);
116  LD2(src1_ptr, src2_stride, tpd0, tpd1);
117  INSERT_D2_SH(tpd0, tpd1, in0);
118 
119  dst0 = (v8i16) __msa_ilvr_b(zero, src0);
120  dst0 <<= 6;
121 
122  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
123  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
125  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
126  dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
127  dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
128  out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129  ST4x2_UB(out0, dst, dst_stride);
130  } else if (4 == height) {
131  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
132  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
133  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
134  INSERT_D2_SH(tpd0, tpd1, in0);
135  INSERT_D2_SH(tpd2, tpd3, in1);
136  ILVRL_B2_SH(zero, src0, dst0, dst1);
137  SLLI_2V(dst0, dst1, 6);
138  HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
139  offset_vec, dst0, dst1);
140  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
142  } else if (0 == height % 8) {
143  for (loop_cnt = (height >> 3); loop_cnt--;) {
144  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145  src0_ptr += 4 * src_stride;
146  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
147  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148  src0_ptr += 4 * src_stride;
149  INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
150  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151  src1_ptr += (4 * src2_stride);
152  INSERT_D2_SH(tpd0, tpd1, in0);
153  INSERT_D2_SH(tpd2, tpd3, in1);
154  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155  src1_ptr += (4 * src2_stride);
156  INSERT_D2_SH(tpd0, tpd1, in2);
157  INSERT_D2_SH(tpd2, tpd3, in3);
158  ILVRL_B2_SH(zero, src0, dst0, dst1);
159  ILVRL_B2_SH(zero, src1, dst2, dst3);
160  SLLI_4V(dst0, dst1, dst2, dst3, 6);
161  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
162  in3, weight_vec, rnd_vec, offset_vec,
163  dst0, dst1, dst2, dst3);
164  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
165  ST4x8_UB(out0, out1, dst, dst_stride);
166  dst += (8 * dst_stride);
167  }
168  }
169 }
170 
171 static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
172  int32_t src_stride,
173  int16_t *src1_ptr,
174  int32_t src2_stride,
175  uint8_t *dst,
176  int32_t dst_stride,
177  int32_t height,
178  int32_t weight0,
179  int32_t weight1,
180  int32_t offset0,
181  int32_t offset1,
182  int32_t rnd_val)
183 {
184  uint32_t loop_cnt;
186  uint64_t tp0, tp1, tp2, tp3;
187  v16u8 out0, out1;
188  v16i8 zero = { 0 };
189  v16i8 src0 = { 0 }, src1 = { 0 };
190  v8i16 in0, in1, in2, in3;
191  v8i16 dst0, dst1, dst2, dst3;
192  v4i32 offset_vec, weight_vec, rnd_vec;
193 
194  offset = (offset0 + offset1) << rnd_val;
195  weight0 = weight0 & 0x0000FFFF;
196  weight = weight0 | (weight1 << 16);
197 
198  weight_vec = __msa_fill_w(weight);
199  offset_vec = __msa_fill_w(offset);
200  rnd_vec = __msa_fill_w(rnd_val + 1);
201 
202  for (loop_cnt = (height >> 2); loop_cnt--;) {
203  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204  src0_ptr += (4 * src_stride);
205  INSERT_D2_SB(tp0, tp1, src0);
206  INSERT_D2_SB(tp2, tp3, src1);
207  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208  src1_ptr += (4 * src2_stride);
209  ILVRL_B2_SH(zero, src0, dst0, dst1);
210  ILVRL_B2_SH(zero, src1, dst2, dst3);
211  SLLI_4V(dst0, dst1, dst2, dst3, 6);
212  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
213  in0, in1, in2, in3,
214  weight_vec, rnd_vec, offset_vec,
215  dst0, dst1, dst2, dst3);
216  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
217  ST6x4_UB(out0, out1, dst, dst_stride);
218  dst += (4 * dst_stride);
219  }
220 }
221 
222 static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
223  int32_t src_stride,
224  int16_t *src1_ptr,
225  int32_t src2_stride,
226  uint8_t *dst,
227  int32_t dst_stride,
228  int32_t height,
229  int32_t weight0,
230  int32_t weight1,
231  int32_t offset0,
232  int32_t offset1,
233  int32_t rnd_val)
234 {
235  uint64_t tp0, tp1, tp2, tp3;
237  v16u8 out0, out1, out2;
238  v16i8 zero = { 0 };
239  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
240  v8i16 in0, in1, in2, in3, in4, in5;
241  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
242  v4i32 offset_vec, weight_vec, rnd_vec;
243 
244  offset = (offset0 + offset1) << rnd_val;
245  weight0 = weight0 & 0x0000FFFF;
246  weight = weight0 | (weight1 << 16);
247 
248  offset_vec = __msa_fill_w(offset);
249  weight_vec = __msa_fill_w(weight);
250  rnd_vec = __msa_fill_w(rnd_val + 1);
251 
252  if (2 == height) {
253  LD2(src0_ptr, src_stride, tp0, tp1);
254  INSERT_D2_SB(tp0, tp1, src0);
255  LD_SH2(src1_ptr, src2_stride, in0, in1);
256  ILVRL_B2_SH(zero, src0, dst0, dst1);
257  SLLI_2V(dst0, dst1, 6);
258 
259  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
260  weight_vec, rnd_vec, offset_vec,
261  dst0, dst1);
262 
263  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
264  ST8x2_UB(out0, dst, dst_stride);
265  } else if (6 == height) {
266  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
267  src0_ptr += 4 * src_stride;
268  INSERT_D2_SB(tp0, tp1, src0);
269  INSERT_D2_SB(tp2, tp3, src1);
270  LD2(src0_ptr, src_stride, tp0, tp1);
271  INSERT_D2_SB(tp0, tp1, src2);
272  ILVRL_B2_SH(zero, src0, dst0, dst1);
273  ILVRL_B2_SH(zero, src1, dst2, dst3);
274  ILVRL_B2_SH(zero, src2, dst4, dst5);
275  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
276  SLLI_4V(dst0, dst1, dst2, dst3, 6);
277  SLLI_2V(dst4, dst5, 6);
278  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
279  weight_vec, rnd_vec, offset_vec, dst0, dst1,
280  dst2, dst3);
281  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
282  offset_vec, dst4, dst5);
283  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
284  ST8x4_UB(out0, out1, dst, dst_stride);
285  dst += (4 * dst_stride);
286  ST8x2_UB(out2, dst, dst_stride);
287  } else if (0 == height % 4) {
288  uint32_t loop_cnt;
289 
290  for (loop_cnt = (height >> 2); loop_cnt--;) {
291  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
292  src0_ptr += (4 * src_stride);
293  INSERT_D2_SB(tp0, tp1, src0);
294  INSERT_D2_SB(tp2, tp3, src1);
295  ILVRL_B2_SH(zero, src0, dst0, dst1);
296  ILVRL_B2_SH(zero, src1, dst2, dst3);
297  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
298  src1_ptr += (4 * src2_stride);
299 
300  SLLI_4V(dst0, dst1, dst2, dst3, 6);
301  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
302  in3, weight_vec, rnd_vec, offset_vec,
303  dst0, dst1, dst2, dst3);
304  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
305  ST8x4_UB(out0, out1, dst, dst_stride);
306  dst += (4 * dst_stride);
307  }
308  }
309 }
310 
311 static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
312  int32_t src_stride,
313  int16_t *src1_ptr,
314  int32_t src2_stride,
315  uint8_t *dst,
316  int32_t dst_stride,
317  int32_t height,
318  int32_t weight0,
319  int32_t weight1,
320  int32_t offset0,
321  int32_t offset1,
322  int32_t rnd_val)
323 {
324  uint32_t loop_cnt;
326  v16i8 zero = { 0 };
327  v16u8 out0, out1, out2;
328  v16i8 src0, src1, src2, src3;
329  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
330  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
331  v4i32 offset_vec, weight_vec, rnd_vec;
332 
333  offset = (offset0 + offset1) << rnd_val;
334  weight0 = weight0 & 0x0000FFFF;
335  weight = weight0 | (weight1 << 16);
336 
337  offset_vec = __msa_fill_w(offset);
338  weight_vec = __msa_fill_w(weight);
339  rnd_vec = __msa_fill_w(rnd_val + 1);
340 
341  for (loop_cnt = (16 >> 2); loop_cnt--;) {
342  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
343  src0_ptr += (4 * src_stride);
344  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
345  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
346  src1_ptr += (4 * src2_stride);
347 
348  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
349  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
350  dst0, dst1, dst2, dst3);
351 
352  SLLI_4V(dst0, dst1, dst2, dst3, 6);
353  ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
354  ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
355 
356  dst4 <<= 6;
357  dst5 <<= 6;
358  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
359  weight_vec, rnd_vec, offset_vec, dst0, dst1,
360  dst2, dst3);
361  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
362  offset_vec, dst4, dst5);
363  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
364  ST12x4_UB(out0, out1, out2, dst, dst_stride);
365  dst += (4 * dst_stride);
366  }
367 }
368 
369 static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
370  int32_t src_stride,
371  int16_t *src1_ptr,
372  int32_t src2_stride,
373  uint8_t *dst,
374  int32_t dst_stride,
375  int32_t height,
376  int32_t weight0,
377  int32_t weight1,
378  int32_t offset0,
379  int32_t offset1,
380  int32_t rnd_val)
381 {
382  uint32_t loop_cnt;
384  v16u8 out0, out1, out2, out3;
385  v16i8 zero = { 0 };
386  v16i8 src0, src1, src2, src3;
387  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
388  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
389  v4i32 offset_vec, weight_vec, rnd_vec;
390 
391  offset = (offset0 + offset1) << rnd_val;
392  weight0 = weight0 & 0x0000FFFF;
393  weight = weight0 | (weight1 << 16);
394 
395  offset_vec = __msa_fill_w(offset);
396  weight_vec = __msa_fill_w(weight);
397  rnd_vec = __msa_fill_w(rnd_val + 1);
398 
399  for (loop_cnt = (height >> 2); loop_cnt--;) {
400  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
401  src0_ptr += (4 * src_stride);
402  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
403  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
404  src1_ptr += (4 * src2_stride);
405  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
406  tmp2, tmp3);
407  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
408  tmp6, tmp7);
409  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
410  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
411  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
412  weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
413  tmp4, tmp5);
414  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
415  weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
416  tmp6, tmp7);
417  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
418  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
419  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
420  dst += (4 * dst_stride);
421  }
422 }
423 
424 static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
425  int32_t src_stride,
426  int16_t *src1_ptr,
427  int32_t src2_stride,
428  uint8_t *dst,
429  int32_t dst_stride,
430  int32_t height,
431  int32_t weight0,
432  int32_t weight1,
433  int32_t offset0,
434  int32_t offset1,
435  int32_t rnd_val)
436 {
437  uint32_t loop_cnt;
439  v16u8 out0, out1, out2, out3, out4, out5;
440  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
441  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
442  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
443  v4i32 offset_vec, weight_vec, rnd_vec;
444 
445  offset = (offset0 + offset1) << rnd_val;
446  weight0 = weight0 & 0x0000FFFF;
447  weight = weight0 | (weight1 << 16);
448 
449  offset_vec = __msa_fill_w(offset);
450  weight_vec = __msa_fill_w(weight);
451  rnd_vec = __msa_fill_w(rnd_val + 1);
452 
453  for (loop_cnt = 8; loop_cnt--;) {
454  LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
455  LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
456  src0_ptr += (4 * src_stride);
457  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
458  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
459  LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
460  src1_ptr += (4 * src2_stride);
461 
462  ILVRL_B2_SH(zero, src0, dst0, dst1);
463  ILVRL_B2_SH(zero, src1, dst2, dst3);
464  ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
465  ILVRL_B2_SH(zero, src4, dst6, dst7);
466  ILVRL_B2_SH(zero, src5, dst8, dst9);
467  ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
468  SLLI_4V(dst0, dst1, dst2, dst3, 6);
469  SLLI_4V(dst4, dst5, dst6, dst7, 6);
470  SLLI_4V(dst8, dst9, dst10, dst11, 6);
471  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
472  weight_vec, rnd_vec, offset_vec, dst0, dst1,
473  dst2, dst3);
474  HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
475  weight_vec, rnd_vec, offset_vec, dst4, dst5,
476  dst6, dst7);
477  HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
478  in11, weight_vec, rnd_vec, offset_vec,
479  dst8, dst9, dst10, dst11);
480  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
481  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
482  ST_UB4(out0, out1, out3, out4, dst, dst_stride);
483  ST8x4_UB(out2, out5, dst + 16, dst_stride);
484  dst += (4 * dst_stride);
485  }
486 }
487 
488 static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
489  int32_t src_stride,
490  int16_t *src1_ptr,
491  int32_t src2_stride,
492  uint8_t *dst,
493  int32_t dst_stride,
494  int32_t height,
495  int32_t weight0,
496  int32_t weight1,
497  int32_t offset0,
498  int32_t offset1,
499  int32_t rnd_val)
500 {
501  uint32_t loop_cnt;
503  v16u8 out0, out1, out2, out3;
504  v16i8 zero = { 0 };
505  v16i8 src0, src1, src2, src3;
506  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
507  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
508  v4i32 offset_vec, weight_vec, rnd_vec;
509 
510  offset = (offset0 + offset1) << rnd_val;
511  weight0 = weight0 & 0x0000FFFF;
512  weight = weight0 | (weight1 << 16);
513 
514  offset_vec = __msa_fill_w(offset);
515  weight_vec = __msa_fill_w(weight);
516  rnd_vec = __msa_fill_w(rnd_val + 1);
517 
518  for (loop_cnt = (height >> 1); loop_cnt--;) {
519  LD_SB2(src0_ptr, 16, src0, src1);
520  src0_ptr += src_stride;
521  LD_SB2(src0_ptr, 16, src2, src3);
522  src0_ptr += src_stride;
523  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
524  src1_ptr += src2_stride;
525  LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
526  src1_ptr += src2_stride;
527 
528  ILVRL_B2_SH(zero, src0, tmp0, tmp4);
529  ILVRL_B2_SH(zero, src1, tmp1, tmp5);
530  ILVRL_B2_SH(zero, src2, tmp2, tmp6);
531  ILVRL_B2_SH(zero, src3, tmp3, tmp7);
532  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
533  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
534  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
535  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
536  tmp1, tmp5);
537  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
538  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
539  tmp3, tmp7);
540  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
541  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
542  ST_UB2(out0, out1, dst, 16);
543  dst += dst_stride;
544  ST_UB2(out2, out3, dst, 16);
545  dst += dst_stride;
546  }
547 }
548 
549 static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
550  int32_t src_stride,
551  int16_t *src1_ptr,
552  int32_t src2_stride,
553  uint8_t *dst,
554  int32_t dst_stride,
555  int32_t height,
556  int32_t weight0,
557  int32_t weight1,
558  int32_t offset0,
559  int32_t offset1,
560  int32_t rnd_val)
561 {
562  uint32_t loop_cnt;
564  v16u8 out0, out1, out2;
565  v16i8 src0, src1, src2;
566  v16i8 zero = { 0 };
567  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
568  v4i32 offset_vec, weight_vec, rnd_vec;
569 
570  offset = (offset0 + offset1) << rnd_val;
571  weight0 = weight0 & 0x0000FFFF;
572  weight = weight0 | (weight1 << 16);
573 
574  offset_vec = __msa_fill_w(offset);
575  weight_vec = __msa_fill_w(weight);
576  rnd_vec = __msa_fill_w(rnd_val + 1);
577 
578  for (loop_cnt = 64; loop_cnt--;) {
579  LD_SB3(src0_ptr, 16, src0, src1, src2);
580  src0_ptr += src_stride;
581  LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
582  src1_ptr += src2_stride;
583 
584  ILVRL_B2_SH(zero, src0, dst0, dst1);
585  ILVRL_B2_SH(zero, src1, dst2, dst3);
586  ILVRL_B2_SH(zero, src2, dst4, dst5);
587  SLLI_4V(dst0, dst1, dst2, dst3, 6);
588  SLLI_2V(dst4, dst5, 6);
589  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
590  weight_vec, rnd_vec, offset_vec, dst0, dst1,
591  dst2, dst3);
592  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
593  offset_vec, dst4, dst5);
594  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
595  ST_UB2(out0, out1, dst, 16);
596  ST_UB(out2, dst + 32);
597  dst += dst_stride;
598  }
599 }
600 
601 static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
602  int32_t src_stride,
603  int16_t *src1_ptr,
604  int32_t src2_stride,
605  uint8_t *dst,
606  int32_t dst_stride,
607  int32_t height,
608  int32_t weight0,
609  int32_t weight1,
610  int32_t offset0,
611  int32_t offset1,
612  int32_t rnd_val)
613 {
614  uint32_t loop_cnt;
616  v16u8 out0, out1, out2, out3;
617  v16i8 zero = { 0 };
618  v16i8 src0, src1, src2, src3;
619  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
620  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
621  v4i32 offset_vec, weight_vec, rnd_vec;
622 
623  offset = (offset0 + offset1) << rnd_val;
624  weight0 = weight0 & 0x0000FFFF;
625  weight = weight0 | (weight1 << 16);
626 
627  offset_vec = __msa_fill_w(offset);
628  weight_vec = __msa_fill_w(weight);
629  rnd_vec = __msa_fill_w(rnd_val + 1);
630 
631  for (loop_cnt = height; loop_cnt--;) {
632  LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
633  src0_ptr += src_stride;
634  LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
635  src1_ptr += src2_stride;
636 
637  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
638  tmp2, tmp3);
639  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
640  tmp6, tmp7);
641  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
642  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
643  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
644  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
645  tmp1, tmp5);
646  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
647  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
648  tmp3, tmp7);
649  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
650  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
651  ST_UB4(out0, out1, out2, out3, dst, 16);
652  dst += dst_stride;
653  }
654 }
655 
/* HEVC 8-tap horizontal luma filter + bi-weighted prediction, 4-wide blocks
 * (height % 4 == 0).
 *
 * src0_ptr: 8-bit source pixels (filtered here), stride src_stride
 * src1_ptr: 16-bit intermediate samples of the other reference,
 *           stride src2_stride
 * filter:   8 signed taps
 *
 * The source bytes are biased by XORI_B..._128 to signed range before
 * filtering; 'constant' (= 128 * weight1 << 6) added to the offset
 * compensates that bias on the weight1-scaled filtered path.
 */
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* two-row shuffle pattern (second half indexes the second vector) */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;                    /* center the 8-tap window */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* shifted copies of the window pattern for taps 2/3, 4/5, 6/7 */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* bias compensation for the XORI(128) applied to the source bytes */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        /* pack the 4-wide intermediate rows pairwise into full vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* two rows filtered per vector via the cross-register shuffle */
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
727 
/* HEVC 8-tap horizontal luma filter + bi-weighted prediction, 8-wide blocks
 * (height % 4 == 0).  Same weighting scheme as hevc_hz_biwgt_8t_4w_msa but
 * one full row is filtered per vector (single-register shuffle pattern). */
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;                    /* center the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* bias compensation for the XORI(128) applied to the source bytes */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* shifted copies of the window pattern for taps 2/3, 4/5, 6/7 */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
807 
/* HEVC 8-tap horizontal luma filter + bi-weighted prediction, 12-wide
 * blocks: an 8-wide left part (single-register shuffle, mask0..3) plus a
 * 4-wide right part at column offset 8 (two-row shuffle, mask4..7).
 * NOTE(review): the trip count is fixed at 4 iterations x 4 rows = 16 rows,
 * ignoring 'height' — presumably 12-wide bi-pred blocks are always height
 * 16 here; confirm against callers. */
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                    /* center the 8-tap window */

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* bias compensation for the XORI(128) applied to the source bytes */
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);   /* one-row pattern (left 8) */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);  /* two-row pattern (right 4) */
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* left 8 columns, 4 rows */
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);

        /* right 4 columns, same 4 rows, two rows per vector */
        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
902 
/* HEVC 8-tap horizontal luma filter + bi-weighted prediction, 16-wide
 * blocks (height % 2 == 0): two 8-wide halves per row, two rows per
 * iteration.  Weighting scheme as in hevc_hz_biwgt_8t_4w_msa. */
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* one-row sliding-window pattern (same as ff_hevc_mask_arr[0..15]) */
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;                    /* center the 8-tap window */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* bias compensation for the XORI(128) applied to the source bytes */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* shifted copies of the window pattern for taps 2/3, 4/5, 6/7 */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
986 
/* Horizontal 8-tap bi-directional weighted prediction, 24-wide blocks.
 *
 * The first 16 output pixels per row go through the paired
 * HEVC_BIW_RND_CLIP2 path; the remaining 8 (columns 16..23) are weighted,
 * rounded and clipped manually since the macro processes vectors in pairs.
 * The row loop is software-pipelined: the next row's loads are issued
 * before the current row's stores.  31 loop iterations plus the epilogue
 * below process a fixed 32 rows; the `height` argument is unused —
 * NOTE(review): presumably this kernel is only ever invoked with
 * height == 32; confirm against the callers.
 */
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;              /* back up to the first filter tap */
    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one word so one dpadd applies both at once */
    weight = weight0 | (weight1 << 16);
    /* compensate the -128*64 bias from the XORI-by-128 source conversion */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-pred averaging bit */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..3 shuffle within one vector; mask4..7 straddle src0/src1 to
     * cover the pixels crossing the 16-byte boundary */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    /* prologue: preload the first row */
    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);           /* u8 -> biased s8 */

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        /* columns 16..23: inline weight/round/clip for the odd vector */
        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        out2 = CLIP_SH_0_255(dst2_r);

        /* pipeline: load the next row before storing the current one */
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);                 /* 16 bytes */
        SD(dst_val0, dst + 16);           /* remaining 8 bytes */
        dst += dst_stride;
    }

    /* epilogue: same computation for the last preloaded row */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    out2 = CLIP_SH_0_255(dst2_r);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}
1108 
/* Horizontal 8-tap bi-directional weighted prediction, 32-wide blocks.
 *
 * One row per iteration: pixels 0..23 are filtered from the two aligned
 * 16-byte vectors (cross-boundary taps via mask4..7), while src2 is an
 * unaligned reload at +24 so pixels 24..31 can reuse the plain mask0..3
 * shuffles.  Results are combined with src1_ptr, weighted, rounded,
 * clipped and stored as 32 bytes.
 */
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                        /* back up to the first filter tap */
    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one word so one dpadd applies both at once */
    weight = weight0 | (weight1 << 16);
    /* compensate the -128*64 bias from the XORI-by-128 source conversion */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-pred averaging bit */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..3 shuffle within one vector; mask4..7 straddle two vectors */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);      /* unaligned reload for cols 24..31 */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2); /* u8 -> biased s8 */

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* weight both predictors, round, add offset, clip to [0, 255] */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}
1194 
/* Horizontal 8-tap bi-directional weighted prediction, 48-wide blocks.
 *
 * Each row is processed as a 32-pixel part (same scheme as the 32-wide
 * kernel, with an unaligned reload at +24) followed by a 16-pixel part
 * loaded as two overlapping 8-byte-step vectors at +32/+40.  The loop
 * runs a fixed 64 rows; the `height` argument is unused — NOTE(review):
 * presumably this kernel is only ever invoked with height == 64; confirm
 * against the callers.
 */
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                        /* back up to the first filter tap */
    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one word so one dpadd applies both at once */
    weight = weight0 | (weight1 << 16);
    /* compensate the -128*64 bias from the XORI-by-128 source conversion */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-pred averaging bit */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..3 shuffle within one vector; mask4..7 straddle two vectors */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);      /* unaligned reload for cols 24..31 */
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2); /* u8 -> biased s8 */
        LD_SB2(src0_ptr + 32, 8, src3, src4); /* cols 32..47 (8-byte step) */
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        /* first 32 output pixels */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        /* remaining 16 output pixels (cols 32..47) */
        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}
1298 
/* Horizontal 8-tap bi-directional weighted prediction, 64-wide blocks.
 *
 * Each row is handled as two 32-pixel halves by the inner `cnt` loop,
 * each half using the same scheme as the 32-wide kernel (two aligned
 * 16-byte loads plus an unaligned reload at +24 for the last 8 pixels).
 */
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;                        /* back up to the first filter tap */
    /* combined bi-pred offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one word so one dpadd applies both at once */
    weight = weight0 | (weight1 << 16);
    /* compensate the -128*64 bias from the XORI-by-128 source conversion */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-pred averaging bit */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..3 shuffle within one vector; mask4..7 straddle two vectors */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {           /* two 32-pixel halves per row */
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24); /* unaligned reload, cols 24..31 */
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2); /* u8 -> biased s8 */

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            /* weight both predictors, round, add offset, clip to [0, 255] */
            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;

    }
}
1397 
1398 static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1399  int32_t src_stride,
1400  int16_t *src1_ptr,
1401  int32_t src2_stride,
1402  uint8_t *dst,
1403  int32_t dst_stride,
1404  const int8_t *filter,
1405  int32_t height,
1406  int32_t weight0,
1407  int32_t weight1,
1408  int32_t offset0,
1409  int32_t offset1,
1410  int32_t rnd_val)
1411 {
1412  uint32_t loop_cnt;
1414  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1415  v16i8 src11, src12, src13, src14;
1416  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1417  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1418  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1419  v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1420  v16i8 src2110, src4332, src6554, src8776, src10998;
1421  v16i8 src12111110, src14131312;
1422  v8i16 dst10, dst32, dst54, dst76;
1423  v8i16 filt0, filt1, filt2, filt3;
1424  v8i16 filter_vec, out0, out1, out2, out3;
1425  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1426 
1427  src0_ptr -= (3 * src_stride);
1428  offset = (offset0 + offset1) << rnd_val;
1429  weight0 = weight0 & 0x0000FFFF;
1430  weight = weight0 | (weight1 << 16);
1431 
1432  const_vec = __msa_ldi_w(128);
1433  const_vec <<= 6;
1434  offset_vec = __msa_fill_w(offset);
1435  weight_vec = __msa_fill_w(weight);
1436  rnd_vec = __msa_fill_w(rnd_val + 1);
1437  weight1_vec = __msa_fill_w(weight1);
1438  offset_vec += const_vec * weight1_vec;
1439 
1440  filter_vec = LD_SH(filter);
1441  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1442 
1443  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1444  src0_ptr += (7 * src_stride);
1445 
1446  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1447  src10_r, src32_r, src54_r, src21_r);
1448  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1449  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1450  src2110, src4332, src6554);
1451  XORI_B3_128_SB(src2110, src4332, src6554);
1452 
1453  for (loop_cnt = (height >> 3); loop_cnt--;) {
1454  LD_SB8(src0_ptr, src_stride,
1455  src7, src8, src9, src10, src11, src12, src13, src14);
1456  src0_ptr += (8 * src_stride);
1457  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1458  src1_ptr += (8 * src2_stride);
1459 
1460  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1461  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1462  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1463  src76_r, src87_r, src98_r, src109_r);
1464  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1465  src1110_r, src1211_r, src1312_r, src1413_r);
1466  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1467  src1413_r, src1312_r,
1468  src8776, src10998, src12111110, src14131312);
1469  XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1470 
1471  DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1472  filt0, dst10, dst32, dst54, dst76);
1473  DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1474  filt1, dst10, dst32, dst54, dst76);
1475  DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1476  filt2, filt2, dst10, dst32, dst54, dst76);
1477  DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1478  filt3, filt3, dst10, dst32, dst54, dst76);
1479 
1480  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1481  in0, in1, in2, in3,
1482  weight_vec, rnd_vec, offset_vec,
1483  out0, out1, out2, out3);
1484 
1485  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1486  ST4x8_UB(out0, out1, dst, dst_stride);
1487  dst += (8 * dst_stride);
1488 
1489  src2110 = src10998;
1490  src4332 = src12111110;
1491  src6554 = src14131312;
1492  src6 = src14;
1493  }
1494 }
1495 
1496 static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1497  int32_t src_stride,
1498  int16_t *src1_ptr,
1499  int32_t src2_stride,
1500  uint8_t *dst,
1501  int32_t dst_stride,
1502  const int8_t *filter,
1503  int32_t height,
1504  int32_t weight0,
1505  int32_t weight1,
1506  int32_t offset0,
1507  int32_t offset1,
1508  int32_t rnd_val)
1509 {
1510  uint32_t loop_cnt;
1512  v16i8 src0, src1, src2, src3, src4, src5;
1513  v16i8 src6, src7, src8, src9, src10;
1514  v8i16 in0, in1, in2, in3;
1515  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1516  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1517  v8i16 tmp0, tmp1, tmp2, tmp3;
1518  v8i16 filt0, filt1, filt2, filt3;
1519  v8i16 filter_vec, out0, out1, out2, out3;
1520  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1521 
1522  src0_ptr -= (3 * src_stride);
1523  offset = (offset0 + offset1) << rnd_val;
1524  weight0 = weight0 & 0x0000FFFF;
1525  weight = weight0 | (weight1 << 16);
1526 
1527  const_vec = __msa_ldi_w(128);
1528  const_vec <<= 6;
1529  offset_vec = __msa_fill_w(offset);
1530  weight_vec = __msa_fill_w(weight);
1531  rnd_vec = __msa_fill_w(rnd_val + 1);
1532  weight1_vec = __msa_fill_w(weight1);
1533  offset_vec += const_vec * weight1_vec;
1534 
1535  filter_vec = LD_SH(filter);
1536  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1537 
1538  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1539  src0_ptr += (7 * src_stride);
1540  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1541 
1542  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1543  src10_r, src32_r, src54_r, src21_r);
1544  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1545 
1546  for (loop_cnt = (height >> 2); loop_cnt--;) {
1547  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1548  src0_ptr += (4 * src_stride);
1549  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1550  src1_ptr += (4 * src2_stride);
1551 
1552  XORI_B4_128_SB(src7, src8, src9, src10);
1553  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1554  src76_r, src87_r, src98_r, src109_r);
1555 
1556  DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1557  filt0, tmp0, tmp1, tmp2, tmp3);
1558  DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1559  filt1, tmp0, tmp1, tmp2, tmp3);
1560  DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1561  filt2, tmp0, tmp1, tmp2, tmp3);
1562  DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1563  filt3, tmp0, tmp1, tmp2, tmp3);
1564 
1565  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1566  in0, in1, in2, in3,
1567  weight_vec, rnd_vec, offset_vec,
1568  out0, out1, out2, out3);
1569 
1570  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1571  ST8x4_UB(out0, out1, dst, dst_stride);
1572  dst += (4 * dst_stride);
1573 
1574  src10_r = src54_r;
1575  src32_r = src76_r;
1576  src54_r = src98_r;
1577  src21_r = src65_r;
1578  src43_r = src87_r;
1579  src65_r = src109_r;
1580  src6 = src10;
1581  }
1582 }
1583 
1584 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1585  int32_t src_stride,
1586  int16_t *src1_ptr,
1587  int32_t src2_stride,
1588  uint8_t *dst,
1589  int32_t dst_stride,
1590  const int8_t *filter,
1591  int32_t height,
1592  int32_t weight0,
1593  int32_t weight1,
1594  int32_t offset0,
1595  int32_t offset1,
1596  int32_t rnd_val)
1597 {
1598  uint32_t loop_cnt;
1600  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1601  v8i16 in0, in1, in2, in3;
1602  v16i8 src10_r, src32_r, src54_r, src76_r;
1603  v16i8 src21_r, src43_r, src65_r, src87_r;
1604  v8i16 tmp0, tmp1, tmp2;
1605  v16i8 src10_l, src32_l, src54_l, src76_l;
1606  v16i8 src21_l, src43_l, src65_l, src87_l;
1607  v16i8 src2110, src4332, src6554, src8776;
1608  v8i16 filt0, filt1, filt2, filt3;
1609  v8i16 out0, out1, out2, filter_vec;
1610  v4i32 dst2_r, dst2_l;
1611  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1612 
1613  src0_ptr -= (3 * src_stride);
1614  offset = (offset0 + offset1) << rnd_val;
1615  weight0 = weight0 & 0x0000FFFF;
1616  weight = weight0 | (weight1 << 16);
1617 
1618  const_vec = __msa_ldi_w(128);
1619  const_vec <<= 6;
1620  offset_vec = __msa_fill_w(offset);
1621  weight_vec = __msa_fill_w(weight);
1622  rnd_vec = __msa_fill_w(rnd_val + 1);
1623  weight1_vec = __msa_fill_w(weight1);
1624  offset_vec += const_vec * weight1_vec;
1625 
1626  filter_vec = LD_SH(filter);
1627  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1628 
1629  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1630  src0_ptr += (7 * src_stride);
1631  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1632 
1633  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1634  src10_r, src32_r, src54_r, src21_r);
1635  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1636  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637  src10_l, src32_l, src54_l, src21_l);
1638  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1639  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1640  src2110, src4332, src6554);
1641 
1642  for (loop_cnt = 8; loop_cnt--;) {
1643  LD_SB2(src0_ptr, src_stride, src7, src8);
1644  src0_ptr += (2 * src_stride);
1645  LD_SH2(src1_ptr, src2_stride, in0, in1);
1646  LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1647  src1_ptr += (2 * src2_stride);
1648  in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1649  XORI_B2_128_SB(src7, src8);
1650 
1651  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1652  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1653  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1654 
1655  DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1656  tmp0, tmp1, tmp2);
1657  DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1658  tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1659  DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1660  tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1661  DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1662  tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1663 
1664  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1665  weight_vec, rnd_vec, offset_vec,
1666  out0, out1);
1667 
1668  ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1669  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1670  (v8i16) weight_vec);
1671  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1672  (v8i16) weight_vec);
1673  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1674  dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1675  out2 = CLIP_SH_0_255(dst2_r);
1676  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1677  ST8x2_UB(out0, dst, dst_stride);
1678  ST4x2_UB(out2, dst + 8, dst_stride);
1679  dst += (2 * dst_stride);
1680 
1681  src10_r = src32_r;
1682  src32_r = src54_r;
1683  src54_r = src76_r;
1684  src21_r = src43_r;
1685  src43_r = src65_r;
1686  src65_r = src87_r;
1687  src2110 = src4332;
1688  src4332 = src6554;
1689  src6554 = src8776;
1690  src6 = src8;
1691  }
1692 }
1693 
1695  int32_t src_stride,
1696  int16_t *src1_ptr,
1697  int32_t src2_stride,
1698  uint8_t *dst,
1699  int32_t dst_stride,
1700  const int8_t *filter,
1701  int32_t height,
1702  int32_t weight0,
1703  int32_t weight1,
1704  int32_t offset0,
1705  int32_t offset1,
1706  int32_t rnd_val,
1707  int32_t width)
1708 {
1709  uint8_t *src0_ptr_tmp;
1710  int16_t *src1_ptr_tmp;
1711  uint8_t *dst_tmp;
1712  uint32_t loop_cnt, cnt;
1714  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1715  v8i16 in0, in1, in2, in3;
1716  v16i8 src10_r, src32_r, src54_r, src76_r;
1717  v16i8 src21_r, src43_r, src65_r, src87_r;
1718  v16i8 src10_l, src32_l, src54_l, src76_l;
1719  v16i8 src21_l, src43_l, src65_l, src87_l;
1720  v8i16 tmp0, tmp1, tmp2, tmp3;
1721  v8i16 filt0, filt1, filt2, filt3;
1722  v8i16 filter_vec;
1723  v8i16 out0, out1, out2, out3;
1724  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1725 
1726  src0_ptr -= (3 * src_stride);
1727 
1728  offset = (offset0 + offset1) << rnd_val;
1729  weight0 = weight0 & 0x0000FFFF;
1730  weight = weight0 | (weight1 << 16);
1731 
1732  const_vec = __msa_ldi_w(128);
1733  const_vec <<= 6;
1734  offset_vec = __msa_fill_w(offset);
1735  weight_vec = __msa_fill_w(weight);
1736  rnd_vec = __msa_fill_w(rnd_val + 1);
1737  weight1_vec = __msa_fill_w(weight1);
1738  offset_vec += const_vec * weight1_vec;
1739 
1740  filter_vec = LD_SH(filter);
1741  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1742 
1743  for (cnt = (width >> 4); cnt--;) {
1744  src0_ptr_tmp = src0_ptr;
1745  src1_ptr_tmp = src1_ptr;
1746  dst_tmp = dst;
1747 
1748  LD_SB7(src0_ptr_tmp, src_stride,
1749  src0, src1, src2, src3, src4, src5, src6);
1750  src0_ptr_tmp += (7 * src_stride);
1751 
1752  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1753  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1754  src10_r, src32_r, src54_r, src21_r);
1755  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1756  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757  src10_l, src32_l, src54_l, src21_l);
1758  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1759 
1760  for (loop_cnt = (height >> 1); loop_cnt--;) {
1761  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1762  src0_ptr_tmp += (2 * src_stride);
1763  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1764  LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1765  src1_ptr_tmp += (2 * src2_stride);
1766 
1767  XORI_B2_128_SB(src7, src8);
1768  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1769  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1770 
1771  DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1772  filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1773  DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1774  filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1775  DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1776  filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1777  DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1778  filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1779 
1780  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1781  in0, in1, in2, in3,
1782  weight_vec, rnd_vec, offset_vec,
1783  out0, out1, out2, out3);
1784 
1785  PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1786  ST_SH2(out0, out1, dst_tmp, dst_stride);
1787  dst_tmp += (2 * dst_stride);
1788 
1789  src10_r = src32_r;
1790  src32_r = src54_r;
1791  src54_r = src76_r;
1792  src21_r = src43_r;
1793  src43_r = src65_r;
1794  src65_r = src87_r;
1795  src10_l = src32_l;
1796  src32_l = src54_l;
1797  src54_l = src76_l;
1798  src21_l = src43_l;
1799  src43_l = src65_l;
1800  src65_l = src87_l;
1801  src6 = src8;
1802  }
1803 
1804  src0_ptr += 16;
1805  src1_ptr += 16;
1806  dst += 16;
1807  }
1808 }
1809 
/* 16-wide vertical 8-tap bi-weighted prediction: thin wrapper around the
 * generic multiple-of-16 kernel with width fixed at 16. */
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 16);
}
1830 
/* 24-wide vertical 8-tap bi-weighted prediction: the left 16 columns go
 * through the generic multiple-of-16 kernel, the remaining 8 columns
 * through the 8-wide kernel. */
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    /* columns 0..15 */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 16);
    /* columns 16..23 */
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16,
                            src2_stride, dst + 16, dst_stride, filter,
                            height, weight0, weight1, offset0, offset1,
                            rnd_val);
}
1855 
/* 32-wide vertical 8-tap bi-weighted prediction: thin wrapper around the
 * generic multiple-of-16 kernel with width fixed at 32. */
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter,
                                      height, weight0, weight1, offset0,
                                      offset1, rnd_val, 32);
}
1876 
1877 static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1878  int32_t src_stride,
1879  int16_t *src1_ptr,
1880  int32_t src2_stride,
1881  uint8_t *dst,
1882  int32_t dst_stride,
1883  const int8_t *filter,
1884  int32_t height,
1885  int32_t weight0,
1886  int32_t weight1,
1887  int32_t offset0,
1888  int32_t offset1,
1889  int32_t rnd_val)
1890 {
1891  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1892  src1_ptr, src2_stride,
1893  dst, dst_stride, filter, height,
1894  weight0, weight1, offset0, offset1,
1895  rnd_val, 48);
1896 }
1897 
1898 static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1899  int32_t src_stride,
1900  int16_t *src1_ptr,
1901  int32_t src2_stride,
1902  uint8_t *dst,
1903  int32_t dst_stride,
1904  const int8_t *filter,
1905  int32_t height,
1906  int32_t weight0,
1907  int32_t weight1,
1908  int32_t offset0,
1909  int32_t offset1,
1910  int32_t rnd_val)
1911 {
1912  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1913  src1_ptr, src2_stride,
1914  dst, dst_stride, filter, height,
1915  weight0, weight1, offset0, offset1,
1916  rnd_val, 64);
1917 }
1918 
1919 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1920  int32_t src_stride,
1921  int16_t *src1_ptr,
1922  int32_t src2_stride,
1923  uint8_t *dst,
1924  int32_t dst_stride,
1925  const int8_t *filter_x,
1926  const int8_t *filter_y,
1927  int32_t height,
1928  int32_t weight0,
1929  int32_t weight1,
1930  int32_t offset0,
1931  int32_t offset1,
1932  int32_t rnd_val)
1933 {
1934  uint32_t loop_cnt;
1935  uint64_t tp0, tp1;
1937  v16u8 out;
1938  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1939  v8i16 in0 = { 0 }, in1 = { 0 };
1940  v8i16 filt0, filt1, filt2, filt3;
1941  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1942  v16i8 mask1, mask2, mask3;
1943  v8i16 filter_vec, weight_vec;
1944  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1945  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1946  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1947  v8i16 tmp0, tmp1, tmp2, tmp3;
1948  v8i16 dst10, dst32, dst54, dst76;
1949  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1950  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1951  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1952 
1953  src0_ptr -= ((3 * src_stride) + 3);
1954 
1955  filter_vec = LD_SH(filter_x);
1956  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1957 
1958  filter_vec = LD_SH(filter_y);
1959  UNPCK_R_SB_SH(filter_vec, filter_vec);
1960 
1961  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1962 
1963  mask1 = mask0 + 2;
1964  mask2 = mask0 + 4;
1965  mask3 = mask0 + 6;
1966 
1967  offset = (offset0 + offset1) << rnd_val;
1968  weight0 = weight0 & 0x0000FFFF;
1969  weight = weight0 | (weight1 << 16);
1970 
1971  const_vec = __msa_fill_w((128 * weight1));
1972  const_vec <<= 6;
1973  offset_vec = __msa_fill_w(offset);
1974  rnd_vec = __msa_fill_w(rnd_val + 1);
1975  offset_vec += const_vec;
1976  weight_vec = (v8i16) __msa_fill_w(weight);
1977 
1978  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1979  src0_ptr += (7 * src_stride);
1980 
1981  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1982 
1983  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1984  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1985  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1986  vec8, vec9, vec10, vec11);
1987  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1988  vec12, vec13, vec14, vec15);
1989 
1990  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1991  filt3);
1992  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1993  filt3);
1994  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1995  filt3);
1996  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1997  filt3);
1998 
1999  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2000  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2001  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2002 
2003  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2004 
2005  for (loop_cnt = height >> 2; loop_cnt--;) {
2006  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2007  src0_ptr += (4 * src_stride);
2008  XORI_B4_128_SB(src7, src8, src9, src10);
2009 
2010  LD2(src1_ptr, src2_stride, tp0, tp1);
2011  INSERT_D2_SH(tp0, tp1, in0);
2012  src1_ptr += (2 * src2_stride);
2013  LD2(src1_ptr, src2_stride, tp0, tp1);
2014  INSERT_D2_SH(tp0, tp1, in1);
2015  src1_ptr += (2 * src2_stride);
2016 
2017  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2018  vec0, vec1, vec2, vec3);
2019  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2020  vec4, vec5, vec6, vec7);
2021  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2022  filt3);
2023  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2024  filt3);
2025 
2026  dst76 = __msa_ilvr_h(dst97, dst66);
2027  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2028  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2029  dst98 = __msa_ilvr_h(dst66, dst108);
2030 
2031  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2032  filt_h2, filt_h3);
2033  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2034  filt_h2, filt_h3);
2035  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2036  filt_h2, filt_h3);
2037  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2038  filt_h2, filt_h3);
2039  SRA_4V(dst0, dst1, dst2, dst3, 6);
2040  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2041  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2042  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2043  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2044  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2045  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2046  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2047  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2048  CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
2049  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2050  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2051  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2052  dst += (4 * dst_stride);
2053 
2054  dst10 = dst54;
2055  dst32 = dst76;
2056  dst54 = dst98;
2057  dst21 = dst65;
2058  dst43 = dst87;
2059  dst65 = dst109;
2060  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2061  }
2062 }
2063 
2065  int32_t src_stride,
2066  int16_t *src1_ptr,
2067  int32_t src2_stride,
2068  uint8_t *dst,
2069  int32_t dst_stride,
2070  const int8_t *filter_x,
2071  const int8_t *filter_y,
2072  int32_t height,
2073  int32_t weight0,
2074  int32_t weight1,
2075  int32_t offset0,
2076  int32_t offset1,
2077  int32_t rnd_val,
2078  int32_t width8mult)
2079 {
2080  uint32_t loop_cnt, cnt;
2082  uint8_t *src0_ptr_tmp;
2083  int16_t *src1_ptr_tmp;
2084  uint8_t *dst_tmp;
2085  v16u8 out;
2086  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2087  v8i16 in0, in1;
2088  v8i16 filt0, filt1, filt2, filt3;
2089  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2090  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2091  v16i8 mask1, mask2, mask3;
2092  v8i16 filter_vec, weight_vec;
2093  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2094  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2095  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2096  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2097  v8i16 tmp0, tmp1, tmp2, tmp3;
2098  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2099  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2100  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2101  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2102  v4i32 offset_vec, rnd_vec, const_vec;
2103 
2104  src0_ptr -= ((3 * src_stride) + 3);
2105 
2106  offset = (offset0 + offset1) << rnd_val;
2107  weight0 = weight0 & 0x0000FFFF;
2108  weight = weight0 | (weight1 << 16);
2109 
2110  const_vec = __msa_fill_w((128 * weight1));
2111  const_vec <<= 6;
2112  offset_vec = __msa_fill_w(offset);
2113  rnd_vec = __msa_fill_w(rnd_val + 1);
2114  offset_vec += const_vec;
2115  weight_vec = (v8i16) __msa_fill_w(weight);
2116 
2117  filter_vec = LD_SH(filter_x);
2118  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2119 
2120  filter_vec = LD_SH(filter_y);
2121  UNPCK_R_SB_SH(filter_vec, filter_vec);
2122 
2123  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2124 
2125  mask1 = mask0 + 2;
2126  mask2 = mask0 + 4;
2127  mask3 = mask0 + 6;
2128 
2129  for (cnt = width8mult; cnt--;) {
2130  src0_ptr_tmp = src0_ptr;
2131  src1_ptr_tmp = src1_ptr;
2132  dst_tmp = dst;
2133 
2134  LD_SB7(src0_ptr_tmp, src_stride,
2135  src0, src1, src2, src3, src4, src5, src6);
2136  src0_ptr_tmp += (7 * src_stride);
2137 
2138  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2139 
2140  /* row 0 row 1 row 2 row 3 */
2141  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2142  vec0, vec1, vec2, vec3);
2143  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2144  vec4, vec5, vec6, vec7);
2145  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2146  vec8, vec9, vec10, vec11);
2147  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2148  vec12, vec13, vec14, vec15);
2149 
2150  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2151  filt3);
2152  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2153  filt3);
2154  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2155  filt3);
2156  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2157  filt2, filt3);
2158 
2159  /* row 4 row 5 row 6 */
2160  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2161  vec0, vec1, vec2, vec3);
2162  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2163  vec4, vec5, vec6, vec7);
2164  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2165  vec8, vec9, vec10, vec11);
2166 
2167  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2168  filt3);
2169  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2170  filt3);
2171  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2172  filt3);
2173 
2174  for (loop_cnt = height >> 1; loop_cnt--;) {
2175  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2176  XORI_B2_128_SB(src7, src8);
2177  src0_ptr_tmp += 2 * src_stride;
2178 
2179  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2180  src1_ptr_tmp += (2 * src2_stride);
2181 
2182  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2183  dst32_r, dst54_r, dst21_r);
2184  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2185  dst32_l, dst54_l, dst21_l);
2186  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2187  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2188 
2189  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2190  vec0, vec1, vec2, vec3);
2191  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2192  filt2, filt3);
2193 
2194  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2195  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2196  filt_h0, filt_h1, filt_h2, filt_h3);
2197  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2198  filt_h0, filt_h1, filt_h2, filt_h3);
2199 
2200  dst0_r >>= 6;
2201  dst0_l >>= 6;
2202 
2203  /* row 8 */
2204  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2205  vec0, vec1, vec2, vec3);
2206  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2207  filt2, filt3);
2208 
2209  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2210  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2211  filt_h0, filt_h1, filt_h2, filt_h3);
2212  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2213  filt_h0, filt_h1, filt_h2, filt_h3);
2214 
2215  dst1_r >>= 6;
2216  dst1_l >>= 6;
2217 
2218  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2219  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2220  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2221  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2222  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2223  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2224  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2225  SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2226  CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
2227  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2228  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2229  ST8x2_UB(out, dst_tmp, dst_stride);
2230  dst_tmp += (2 * dst_stride);
2231 
2232  dst0 = dst2;
2233  dst1 = dst3;
2234  dst2 = dst4;
2235  dst3 = dst5;
2236  dst4 = dst6;
2237  dst5 = dst7;
2238  dst6 = dst8;
2239  }
2240 
2241  src0_ptr += 8;
2242  src1_ptr += 8;
2243  dst += 8;
2244  }
2245 }
2246 
2247 static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
2248  int32_t src_stride,
2249  int16_t *src1_ptr,
2250  int32_t src2_stride,
2251  uint8_t *dst,
2252  int32_t dst_stride,
2253  const int8_t *filter_x,
2254  const int8_t *filter_y,
2255  int32_t height,
2256  int32_t weight0,
2257  int32_t weight1,
2258  int32_t offset0,
2259  int32_t offset1,
2260  int32_t rnd_val)
2261 {
2262  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2263  src1_ptr, src2_stride,
2264  dst, dst_stride, filter_x, filter_y,
2265  height, weight0, weight1, offset0,
2266  offset1, rnd_val, 1);
2267 }
2268 
2269 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2270  int32_t src_stride,
2271  int16_t *src1_ptr,
2272  int32_t src2_stride,
2273  uint8_t *dst,
2274  int32_t dst_stride,
2275  const int8_t *filter_x,
2276  const int8_t *filter_y,
2277  int32_t height,
2278  int32_t weight0,
2279  int32_t weight1,
2280  int32_t offset0,
2281  int32_t offset1,
2282  int32_t rnd_val)
2283 {
2284  uint32_t loop_cnt;
2285  uint8_t *src0_ptr_tmp, *dst_tmp;
2286  int16_t *src1_ptr_tmp;
2288  uint64_t tp0, tp1;
2289  v16u8 out;
2290  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2291  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2292  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2293  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2294  v8i16 in0 = { 0 }, in1 = { 0 };
2295  v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2296  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2297  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2298  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2299  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2300  v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2301  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2302  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2303 
2304  src0_ptr -= ((3 * src_stride) + 3);
2305 
2306  offset = (offset0 + offset1) << rnd_val;
2307  weight0 = weight0 & 0x0000FFFF;
2308  weight = weight0 | (weight1 << 16);
2309 
2310  const_vec = __msa_fill_w((128 * weight1));
2311  const_vec <<= 6;
2312  offset_vec = __msa_fill_w(offset);
2313  rnd_vec = __msa_fill_w(rnd_val + 1);
2314  offset_vec += const_vec;
2315  weight_vec = (v8i16) __msa_fill_w(weight);
2316 
2317  filter_vec = LD_SH(filter_x);
2318  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2319 
2320  filter_vec = LD_SH(filter_y);
2321  UNPCK_R_SB_SH(filter_vec, filter_vec);
2322 
2323  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2324 
2325  mask0 = LD_SB(ff_hevc_mask_arr);
2326  mask1 = mask0 + 2;
2327  mask2 = mask0 + 4;
2328  mask3 = mask0 + 6;
2329 
2330  src0_ptr_tmp = src0_ptr;
2331  src1_ptr_tmp = src1_ptr;
2332  dst_tmp = dst;
2333 
2334  LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2335  src0_ptr_tmp += (7 * src_stride);
2336  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2337 
2338  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2339  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2340  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2341  vec11);
2342  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2343  vec15);
2344  dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2345  filt3);
2346  dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2347  filt3);
2348  dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2349  filt3);
2350  dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2351  filt2, filt3);
2352  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2353  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2354  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2355  vec11);
2356  dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2357  filt3);
2358  dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2359  filt3);
2360  dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2361  filt3);
2362 
2363  for (loop_cnt = 8; loop_cnt--;) {
2364  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2365  src0_ptr_tmp += (2 * src_stride);
2366  XORI_B2_128_SB(src7, src8);
2367 
2368  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2369  src1_ptr_tmp += (2 * src2_stride);
2370 
2371  ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2372  dst10_r, dst32_r, dst54_r, dst21_r);
2373  ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2374  dst10_l, dst32_l, dst54_l, dst21_l);
2375  ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2376  ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2377 
2378  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2379  vec3);
2380  dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2381  filt3);
2382 
2383  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2384  dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2385  filt_h1, filt_h2, filt_h3);
2386  dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2387  filt_h1, filt_h2, filt_h3);
2388  dst0 >>= 6;
2389  dst1 >>= 6;
2390 
2391  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2392  vec3);
2393  dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2394  filt3);
2395 
2396  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2397  dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2398  filt_h1, filt_h2, filt_h3);
2399  dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2400  filt_h1, filt_h2, filt_h3);
2401  dst2 >>= 6;
2402  dst3 >>= 6;
2403 
2404  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2405  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2406  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2407  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2408  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2409  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2410  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2411  SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2412  CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
2413  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2414  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2415  ST8x2_UB(out, dst_tmp, dst_stride);
2416  dst_tmp += (2 * dst_stride);
2417 
2418  dsth0 = dsth2;
2419  dsth1 = dsth3;
2420  dsth2 = dsth4;
2421  dsth3 = dsth5;
2422  dsth4 = dsth6;
2423  dsth5 = dsth7;
2424  dsth6 = dsth8;
2425  }
2426 
2427  src0_ptr += 8;
2428  src1_ptr += 8;
2429  dst += 8;
2430 
2431  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2432  mask5 = mask4 + 2;
2433  mask6 = mask4 + 4;
2434  mask7 = mask4 + 6;
2435 
2436  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2437  src0_ptr += (7 * src_stride);
2438  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2439 
2440  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2441  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2442  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2443  vec11);
2444  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2445  vec15);
2446  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2447  filt3);
2448  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2449  filt3);
2450  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2451  filt3);
2452  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2453  filt3);
2454  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2455  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2456  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2457 
2458  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2459 
2460  for (loop_cnt = 4; loop_cnt--;) {
2461  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2462  src0_ptr += (4 * src_stride);
2463  XORI_B4_128_SB(src7, src8, src9, src10);
2464 
2465  LD2(src1_ptr, src2_stride, tp0, tp1);
2466  INSERT_D2_SH(tp0, tp1, in0);
2467  src1_ptr += (2 * src2_stride);
2468  LD2(src1_ptr, src2_stride, tp0, tp1);
2469  INSERT_D2_SH(tp0, tp1, in1);
2470  src1_ptr += (2 * src2_stride);
2471 
2472  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2473  vec3);
2474  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2475  vec7);
2476  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2477  filt3);
2478  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2479  filt3);
2480 
2481  dst76 = __msa_ilvr_h(dst97, dst66);
2482  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2483  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2484  dst98 = __msa_ilvr_h(dst66, dst108);
2485 
2486  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2487  filt_h2, filt_h3);
2488  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2489  filt_h2, filt_h3);
2490  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2491  filt_h2, filt_h3);
2492  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2493  filt_h2, filt_h3);
2494  SRA_4V(dst0, dst1, dst2, dst3, 6);
2495  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2496  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2497  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2498  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2499  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2500  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2501  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2502  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2503  CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
2504  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2505  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2506  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2507  dst += (4 * dst_stride);
2508 
2509  dst10 = dst54;
2510  dst32 = dst76;
2511  dst54 = dst98;
2512  dst21 = dst65;
2513  dst43 = dst87;
2514  dst65 = dst109;
2515  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2516  }
2517 }
2518 
2519 static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
2520  int32_t src_stride,
2521  int16_t *src1_ptr,
2522  int32_t src2_stride,
2523  uint8_t *dst,
2524  int32_t dst_stride,
2525  const int8_t *filter_x,
2526  const int8_t *filter_y,
2527  int32_t height,
2528  int32_t weight0,
2529  int32_t weight1,
2530  int32_t offset0,
2531  int32_t offset1,
2532  int32_t rnd_val)
2533 {
2534  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2535  src1_ptr, src2_stride,
2536  dst, dst_stride, filter_x, filter_y,
2537  height, weight0, weight1, offset0,
2538  offset1, rnd_val, 2);
2539 }
2540 
2541 static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
2542  int32_t src_stride,
2543  int16_t *src1_ptr,
2544  int32_t src2_stride,
2545  uint8_t *dst,
2546  int32_t dst_stride,
2547  const int8_t *filter_x,
2548  const int8_t *filter_y,
2549  int32_t height,
2550  int32_t weight0,
2551  int32_t weight1,
2552  int32_t offset0,
2553  int32_t offset1,
2554  int32_t rnd_val)
2555 {
2556  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2557  src1_ptr, src2_stride,
2558  dst, dst_stride, filter_x, filter_y,
2559  height, weight0, weight1, offset0,
2560  offset1, rnd_val, 3);
2561 }
2562 
2563 static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
2564  int32_t src_stride,
2565  int16_t *src1_ptr,
2566  int32_t src2_stride,
2567  uint8_t *dst,
2568  int32_t dst_stride,
2569  const int8_t *filter_x,
2570  const int8_t *filter_y,
2571  int32_t height,
2572  int32_t weight0,
2573  int32_t weight1,
2574  int32_t offset0,
2575  int32_t offset1,
2576  int32_t rnd_val)
2577 {
2578  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2579  src1_ptr, src2_stride,
2580  dst, dst_stride, filter_x, filter_y,
2581  height, weight0, weight1, offset0,
2582  offset1, rnd_val, 4);
2583 }
2584 
2585 static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
2586  int32_t src_stride,
2587  int16_t *src1_ptr,
2588  int32_t src2_stride,
2589  uint8_t *dst,
2590  int32_t dst_stride,
2591  const int8_t *filter_x,
2592  const int8_t *filter_y,
2593  int32_t height,
2594  int32_t weight0,
2595  int32_t weight1,
2596  int32_t offset0,
2597  int32_t offset1,
2598  int32_t rnd_val)
2599 {
2600  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2601  src1_ptr, src2_stride,
2602  dst, dst_stride, filter_x, filter_y,
2603  height, weight0, weight1, offset0,
2604  offset1, rnd_val, 6);
2605 }
2606 
2607 static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
2608  int32_t src_stride,
2609  int16_t *src1_ptr,
2610  int32_t src2_stride,
2611  uint8_t *dst,
2612  int32_t dst_stride,
2613  const int8_t *filter_x,
2614  const int8_t *filter_y,
2615  int32_t height,
2616  int32_t weight0,
2617  int32_t weight1,
2618  int32_t offset0,
2619  int32_t offset1,
2620  int32_t rnd_val)
2621 {
2622  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2623  src1_ptr, src2_stride,
2624  dst, dst_stride, filter_x, filter_y,
2625  height, weight0, weight1, offset0,
2626  offset1, rnd_val, 8);
2627 }
2628 
/* Horizontal 4-tap bi-weighted prediction for a single 4x2 block.
 * src0_ptr: 8-bit reference; src1_ptr: 16-bit samples from the other
 * prediction direction. The two rows are filtered, weighted with
 * weight0/weight1, offset, rounded by (rnd_val + 1), clipped to 0..255
 * and stored as 4 bytes x 2 rows. */
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    /* 4-wide shuffle pattern: packs two rows into one vector */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1, vec0, vec1;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 out0, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one column left of the current sample */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* combined rounding offset; weights packed low|high so one dpadd
     * applies weight0 to src1 data and weight1 to filtered src0 data */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    /* constant compensates the -128 bias from XORI_B2_128_SB below
     * (128 << 6 scaled by weight1), folded into the offset */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    /* merge both 4-sample src1 rows into one 8x16-bit vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src0, src1);

    /* horizontal 4-tap filter, both rows at once */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    /* bi-weight: interleave filtered/src1 pairs, dot-product with weights */
    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    out0 = CLIP_SH_0_255(dst0_r);
    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
    ST4x2_UB(out0, dst, dst_stride);
}
2688 
/* Horizontal 4-tap bi-weighted prediction for a single 4x4 block.
 * Same weighting scheme as hevc_hz_biwgt_4t_4x2_msa; four rows are
 * filtered two-at-a-time and stored as 4 bytes x 4 rows. */
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    /* 4-wide shuffle pattern: packs two rows into one vector */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 dst0, dst1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one column left of the current sample */
    src0_ptr -= 1;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* combined rounding offset; weights packed low|high for dpadd;
     * constant compensates the -128 XOR bias on the 8-bit path */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    /* merge src1 rows pairwise: in0 = rows 0|1, in1 = rows 2|3 */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);

    /* horizontal 4-tap filter, two rows per vector */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weight, add offset, round by (rnd_val + 1) and clip to 0..255 */
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
}
2748 
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    /* Horizontal 4-tap bi-weighted prediction, 4 columns wide, heights
     * that are a multiple of 8: processes 8 rows per loop iteration
     * using the same weighting scheme as hevc_hz_biwgt_4t_4x2_msa. */
    uint32_t loop_cnt;
    int32_t weight, offset, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    /* 4-wide shuffle pattern: packs two rows into one vector */
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one column left of the current sample */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; weights packed low|high for dpadd;
     * constant compensates the -128 XOR bias on the 8-bit path */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* merge src1 rows pairwise into four 8x16-bit vectors */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* horizontal 4-tap filter, two rows per vector */
        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round by (rnd_val + 1) and clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
2823 
2824 static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2825  int32_t src_stride,
2826  int16_t *src1_ptr,
2827  int32_t src2_stride,
2828  uint8_t *dst,
2829  int32_t dst_stride,
2830  const int8_t *filter,
2831  int32_t height,
2832  int32_t weight0,
2833  int32_t weight1,
2834  int32_t offset0,
2835  int32_t offset1,
2836  int32_t rnd_val)
2837 {
2838  if (2 == height) {
2839  hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2840  dst, dst_stride, filter,
2841  weight0, weight1, offset0, offset1, rnd_val);
2842  } else if (4 == height) {
2843  hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2844  dst, dst_stride, filter,
2845  weight0, weight1, offset0, offset1, rnd_val);
2846  } else if (0 == (height % 8)) {
2847  hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2848  src1_ptr, src2_stride,
2849  dst, dst_stride, filter, height,
2850  weight0, weight1, offset0, offset1,
2851  rnd_val);
2852  }
2853 }
2854 
static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    /* 6-wide horizontal 4-tap bi-weighted MC: filter the 8-bit
     * src0_ptr pixels, blend with the 16-bit src1_ptr samples using
     * the packed (weight0, weight1) pair plus rounding offset, shift
     * by (rnd_val + 1), clip to [0, 255] and store 6 pixels per row.
     * NOTE(review): loop_cnt is fixed at 2 with 4 rows per iteration,
     * so exactly 8 rows are written and 'height' is unused — verify
     * only 6x8 blocks reach this path. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B4_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* shuffle mask for taps 2/3 */

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* unsigned -> signed */

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);   /* 6 pixels x 4 rows */
        dst += (4 * dst_stride);
    }
}
2925 
static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 8x2 horizontal 4-tap bi-weighted MC: filter two 8-wide rows of
     * 8-bit src0_ptr pixels, blend with the 16-bit src1_ptr samples
     * using the packed (weight0, weight1) pair plus rounding offset,
     * shift by (rnd_val + 1), clip to [0, 255] and store. */
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B2_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* shuffle mask for taps 2/3 */

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src0, src1);           /* unsigned -> signed pixels */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST8x2_UB(dst0, dst, dst_stride);
}
2981 
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 8x6 horizontal 4-tap bi-weighted MC: filter six 8-wide rows of
     * 8-bit src0_ptr pixels, blend with the 16-bit src1_ptr samples
     * using the packed (weight0, weight1) pair plus rounding offset,
     * shift by (rnd_val + 1), clip to [0, 255] and store 4 + 2 rows. */
    int32_t weight, offset, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B6_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* shuffle mask for taps 2/3 */

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);  /* -> signed */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                       in0, in1, in2, in3,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1, dst2, dst3);
    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                       weight_vec, rnd_vec, offset_vec,
                       dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST8x4_UB(dst0, dst1, dst, dst_stride);    /* first 4 rows */
    dst += (4 * dst_stride);
    ST8x2_UB(dst3, dst, dst_stride);          /* last 2 rows */
}
3056 
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    /* 8-wide, height a multiple of 4: horizontal 4-tap bi-weighted MC.
     * Filters 8-bit src0_ptr pixels, blends with 16-bit src1_ptr
     * samples via the packed (weight0, weight1) pair plus rounding
     * offset, shifts by (rnd_val + 1), clips to [0, 255] and stores
     * 4 rows per iteration. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B4_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* shuffle mask for taps 2/3 */

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* unsigned -> signed */

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3126 
3127 static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3128  int32_t src_stride,
3129  int16_t *src1_ptr,
3130  int32_t src2_stride,
3131  uint8_t *dst,
3132  int32_t dst_stride,
3133  const int8_t *filter,
3134  int32_t height,
3135  int32_t weight0,
3136  int32_t weight1,
3137  int32_t offset0,
3138  int32_t offset1,
3139  int32_t rnd_val)
3140 {
3141  if (2 == height) {
3142  hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3143  dst, dst_stride, filter,
3144  weight0, weight1, offset0, offset1, rnd_val);
3145  } else if (6 == height) {
3146  hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3147  dst, dst_stride, filter,
3148  weight0, weight1, offset0, offset1, rnd_val);
3149  } else if (0 == (height % 4)) {
3150  hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3151  src1_ptr, src2_stride,
3152  dst, dst_stride, filter, height,
3153  weight0, weight1, offset0, offset1,
3154  rnd_val);
3155  }
3156 }
3157 
static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 12-wide horizontal 4-tap bi-weighted MC, processed as an 8-wide
     * part (mask0/mask1) plus a 4-wide part straddling two rows
     * (mask2/mask3).  Results are weighted against the 16-bit
     * src1_ptr samples, rounded by (rnd_val + 1), clipped and stored.
     * NOTE(review): loop_cnt is fixed at 4 with 4 rows per iteration,
     * so exactly 16 rows are written and 'height' is unused — verify
     * only 12x16 blocks reach this path. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* selects pixels 8..12 of one row paired with 8..12 of the next */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 mask1, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B4_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* taps 2/3, 8-wide part */
    mask3 = mask2 + 2;                    /* taps 2/3, 4-wide part */

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* pack the 4-wide tails of two rows into one vector */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* unsigned -> signed */

        /* 8-wide part, one row per vector */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* 4-wide part, two rows per vector */
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);
        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3242 
static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 16-wide horizontal 4-tap bi-weighted MC: each row is handled as
     * two 8-wide halves (even/odd register pairs, second half loaded
     * from +8).  Filtered pixels are weighted against the 16-bit
     * src1_ptr samples, rounded by (rnd_val + 1), clipped and stored
     * 4 rows per loop iteration. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B8_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* shuffle mask for taps 2/3 */

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even regs: left 8 pixels of 4 rows; odd regs: right 8 */
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* rows 0-1: weight, add offset, round, clip, store */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

        /* rows 2-3: results reuse dst0..dst3 registers */
        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
                           in4, in5, in6, in7,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
3331 
static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 24-wide horizontal 4-tap bi-weighted MC: a 16-wide part (with
     * mask2/mask3 bridging the 16-byte vector boundary) plus an
     * 8-wide tail, two rows per iteration.
     * NOTE(review): loop_cnt is fixed at 16 with 2 rows per
     * iteration, so exactly 32 rows are written and 'height' is
     * unused — verify only 24x32 blocks reach this path. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B4_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* taps 2/3, within vector */
    mask2 = mask0 + 8;                    /* taps 0/1, across vectors */
    mask3 = mask0 + 10;                   /* taps 2/3, across vectors */

    for (loop_cnt = 16; loop_cnt--;) {
        /* src0/src2: bytes 0..15 of 2 rows; src1/src3: bytes 16.. */
        LD_SB2(src0_ptr, src_stride, src0, src2);
        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* unsigned -> signed */

        /* 16-wide part */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);

        /* 8 width */
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(dst0, (dst + 16), dst_stride);
        dst += (2 * dst_stride);
    }
}
3418 
static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 32-wide horizontal 4-tap bi-weighted MC, one row per iteration:
     * four 8-wide filter groups (mask2/mask3 bridge the boundary
     * between the two 16-byte loads; src2, loaded from +24, covers
     * the final group).  Filtered pixels are weighted against the
     * 16-bit src1_ptr samples, rounded by (rnd_val + 1), clipped to
     * [0, 255] and stored. */
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* 4-tap window starts one pixel left of the output position */
    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied by XORI_B3_128_SB below (assumes filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    mask1 = mask0 + 2;                    /* taps 2/3, within vector */
    mask2 = mask0 + 8;                    /* taps 0/1, across vectors */
    mask3 = mask0 + 10;                   /* taps 2/3, across vectors */

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);      /* pixels 24..39 of the row */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2); /* unsigned -> signed */

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);      /* 32 bytes of one row */
        dst += dst_stride;
    }
}
3491 
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 4x2 vertical 4-tap bi-weighted MC: filter vertically across 5
     * input rows, blend with the 16-bit src1_ptr samples via the
     * packed (weight0, weight1) pair plus rounding offset, shift by
     * (rnd_val + 1), clip to [0, 255] and store 2 rows of 4 pixels.
     * The weighting/rounding is done inline here instead of via the
     * HEVC_BIW_RND_CLIP macros. */
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst10;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v4i32 dst10_r, dst10_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, out;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* vertical 4-tap window needs one row above the first output row */
    src0_ptr -= src_stride;

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied to the pixels by the xori-128 below (assumes
     * filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* build the (row1|row0, row2|row1) pair for taps 0/1 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    src1_ptr += (2 * src2_stride);

    /* two 4-wide src1_ptr rows packed into one vector */
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

    /* interleave filtered samples with src1_ptr samples, then apply
     * the packed weights + offset via dot-product-add */
    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);       /* round by rnd_val + 1 */
    dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
    out = CLIP_SH_0_255(dst10_r);
    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
    ST4x2_UB(out, dst, dst_stride);
}
3556 
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* 4x4 vertical 4-tap bi-weighted MC: filter vertically across 7
     * input rows, blend with the 16-bit src1_ptr samples via the
     * packed (weight0, weight1) pair plus rounding offset, shift by
     * (rnd_val + 1), clip to [0, 255] and store 4 rows of 4 pixels. */
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* vertical 4-tap window needs one row above the first output row */
    src0_ptr -= src_stride;

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied to the pixels by the xori-128 below (assumes
     * filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* build the (row1|row0, row2|row1) pair for taps 0/1 */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    src0_ptr += (4 * src_stride);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    /* pack two 4-wide src1_ptr rows into each vector */
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    XORI_B2_128_SB(src4332, src6554);     /* unsigned -> signed */

    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);

    /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst10, dst32);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);              /* dead store: dst not read again */
}
3623 
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    /* 4-wide, height a multiple of 8: vertical 4-tap bi-weighted MC.
     * Keeps a sliding window of interleaved row pairs across loop
     * iterations (src2110 carries the last two rows forward), blends
     * the filtered pixels with the 16-bit src1_ptr samples via the
     * packed (weight0, weight1) pair plus rounding offset, rounds by
     * (rnd_val + 1), clips and stores 8 rows per iteration. */
    uint32_t loop_cnt;
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* vertical 4-tap window needs one row above the first output row */
    src0_ptr -= src_stride;

    /* combined rounding offset; 'constant' pre-compensates the -128
     * bias applied to the pixels by the xori-128 below (assumes
     * filter gain 64) */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);   /* pack both weights for dpadd */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-prediction averaging */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: interleave the first three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* pack two 4-wide src1_ptr rows into each vector */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);  /* -> signed */

        dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
        dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
        dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);

        /* last two rows of this iteration, kept for the next one
         * (src2 is reused to carry the newest row forward) */
        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

        dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
        /* weight, add offset, round by (rnd_val + 1), clip to 0..255 */
        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
        ST4x8_UB(dst10, dst32, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
3709 
/* Vertical 4-tap bi-weighted prediction, 4 pixels wide.
 * Dispatches on block height to the specialized kernels.
 * Heights other than 2, 4 or a multiple of 8 are not expected by the
 * HEVC block-size grid and fall through without writing anything. */
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        if ((height % 8) == 0) {
            hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1, offset0,
                                             offset1, rnd_val);
        }
        break;
    }
}
3740 
/* Vertical 4-tap bi-directional weighted prediction, 6 pixels wide.
 *
 * src0_ptr/src_stride  : 8-bit reference pixels of the second prediction
 * src1_ptr/src2_stride : 16-bit intermediate samples of the first
 *                        prediction (HEVC bi-pred intermediate layout)
 * dst/dst_stride       : 8-bit output block
 * filter               : 4-tap vertical filter coefficients
 * height               : rows to produce; processed 4 rows per iteration
 * weight0/1, offset0/1, rnd_val : HEVC explicit weighted-prediction params
 */
static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* start one row above the block: the 4-tap filter needs top context */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* pack both weights into one word for __msa_dpadd_s_w on
     * interleaved (computed, intermediate) sample pairs */
    weight = weight0 | (weight1 << 16);
    /* fold the XORI-by-128 sign-conversion bias into the offset;
     * NOTE(review): assumes a 4-tap coefficient sum of 64 (hence << 6) —
     * confirm against the HEVC chroma filter tables */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime the vertical filter with the first three rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* rows 0-1 of this iteration */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);

        /* rows 2-3; src1/src2 are reused as sliding-window registers */
        LD_SB2(src0_ptr, src_stride, src1, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src1, src2);
        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        /* weight, add offset, round, and clip to 8 bits */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        ST6x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
3814 
/* Vertical 4-tap bi-directional weighted prediction, 8x2 block.
 * Fixed-size variant (no height parameter): produces exactly two rows.
 * Parameter semantics match hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, tmp0, tmp1;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with three context rows, then filter two output rows */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    /* weight, add offset, round, clip to 8 bits */
    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1);

    tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST8x2_UB(tmp0, dst, dst_stride);
}
3871 
/* Vertical 4-tap bi-directional weighted prediction, 8x6 block.
 * Fixed-size variant: produces six rows in one straight-line pass.
 * Parameter semantics match hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* three context rows, then the six rows covering the whole block */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* one 4-tap vertical filter per output row */
    tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
    tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
    tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
    tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
    tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
    tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
    /* weight, add offset, round, clip — rows 0-3 then rows 4-5 */
    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                       in0, in1, in2, in3,
                       weight_vec, rnd_vec, offset_vec,
                       tmp0, tmp1, tmp2, tmp3);
    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                       weight_vec, rnd_vec, offset_vec,
                       tmp4, tmp5);

    PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(tmp3, dst, dst_stride);
}
3943 
3945  int32_t src_stride,
3946  int16_t *src1_ptr,
3947  int32_t src2_stride,
3948  uint8_t *dst,
3949  int32_t dst_stride,
3950  const int8_t *filter,
3951  int32_t height,
3952  int32_t weight0,
3953  int32_t weight1,
3954  int32_t offset0,
3955  int32_t offset1,
3956  int32_t rnd_val)
3957 {
3958  uint32_t loop_cnt;
3959  int32_t offset, weight, constant;
3960  v16i8 src0, src1, src2, src3, src4;
3961  v8i16 in0, in1, in2, in3;
3962  v16i8 src10_r, src32_r, src21_r, src43_r;
3963  v8i16 tmp0, tmp1, tmp2, tmp3;
3964  v8i16 filt0, filt1;
3965  v8i16 filter_vec;
3966  v4i32 weight_vec, offset_vec, rnd_vec;
3967 
3968  src0_ptr -= src_stride;
3969 
3970  offset = (offset0 + offset1) << rnd_val;
3971  weight0 = weight0 & 0x0000FFFF;
3972  weight = weight0 | (weight1 << 16);
3973  constant = 128 * weight1;
3974  constant <<= 6;
3975  offset += constant;
3976 
3977  offset_vec = __msa_fill_w(offset);
3978  weight_vec = __msa_fill_w(weight);
3979  rnd_vec = __msa_fill_w(rnd_val + 1);
3980 
3981  filter_vec = LD_SH(filter);
3982  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3983 
3984  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3985  src0_ptr += (3 * src_stride);
3986  XORI_B3_128_SB(src0, src1, src2);
3987  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3988 
3989  for (loop_cnt = (height >> 2); loop_cnt--;) {
3990  LD_SB2(src0_ptr, src_stride, src3, src4);
3991  src0_ptr += (2 * src_stride);
3992  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3993  src1_ptr += (4 * src2_stride);
3994  XORI_B2_128_SB(src3, src4);
3995  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3996 
3997  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3998  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3999 
4000  LD_SB2(src0_ptr, src_stride, src1, src2);
4001  src0_ptr += (2 * src_stride);
4002  XORI_B2_128_SB(src1, src2);
4003  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4004 
4005  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4006  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4007  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4008  in0, in1, in2, in3,
4009  weight_vec, rnd_vec, offset_vec,
4010  tmp0, tmp1, tmp2, tmp3);
4011 
4012  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4013  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
4014  dst += (4 * dst_stride);
4015  }
4016 }
4017 
/* Vertical 4-tap bi-weighted prediction, 8 pixels wide.
 * Dispatches on block height to the specialized kernels; any height
 * other than 2 or 6 goes to the multiple-of-4 loop variant. */
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
        break;
    }
}
4048 
/* Vertical 4-tap bi-directional weighted prediction, 12 pixels wide.
 * The left 8 columns use the right-interleaved (_r) vectors; the extra
 * 4 columns use the left-interleaved (_l) halves packed two rows per
 * vector (src2110/src4332). Parameter semantics match
 * hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= (1 * src_stride);

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with three context rows; keep both halves interleaved */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        /* first-prediction samples: 8 left columns + 4 right columns */
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

        /* next two rows; src2 doubles as a sliding-window register */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
        /* weight, add offset, round, clip — left 8 columns then right 4 */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp4, tmp5);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
        ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
4139 
/* Vertical 4-tap bi-directional weighted prediction, 16 pixels wide.
 * Each full-width row is split into a right-interleaved (_r, low 8
 * columns) and left-interleaved (_l, high 8 columns) stream.
 * Parameter semantics match hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prime with three context rows, both halves interleaved */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first pair of output rows */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* weight, add offset, round, clip to 8 bits */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);

        /* second pair; src2 doubles as a sliding-window register */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);

        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
4233 
/* Vertical 4-tap bi-directional weighted prediction, 24 pixels wide.
 * Handled as one 16-wide stream (src0..src5, _r/_l halves) plus one
 * 8-wide stream for columns 16..23 (src6..src11, _r only).
 * Parameter semantics match hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    /* 4 output rows per iteration: two row-pairs, each covering both
     * the 16-wide and the 8-wide stream */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST8x2_UB(tmp2, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* second row-pair; src2/src8 double as sliding-window registers */
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);

        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST8x2_UB(tmp2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
4370 
/* Vertical 4-tap bi-directional weighted prediction, 32 pixels wide.
 * Two independent 16-wide streams: columns 0..15 written through dst,
 * columns 16..31 through dst_tmp; both advance two rows per iteration.
 * Parameter semantics match hevc_vt_biwgt_4t_6w_msa. */
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    /* one row of top context for the 4-tap filter */
    src0_ptr -= src_stride;

    /* combined offset, pre-scaled by the rounding shift */
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    /* both weights packed into one word for the dot-product-add */
    weight = weight0 | (weight1 << 16);
    /* compensate the XORI-by-128 sign-conversion bias (filter gain 64) */
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    /* two output rows per iteration */
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);

        /* slide the window of the left 16-wide stream */
        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
        /* next 16width */
        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                           in4, in5, in6, in7,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3, tmp6, tmp7);

        /* next 16width */
        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        /* slide the window of the right 16-wide stream */
        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
4489 
4490 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4491  int32_t src_stride,
4492  int16_t *src1_ptr,
4493  int32_t src2_stride,
4494  uint8_t *dst,
4495  int32_t dst_stride,
4496  const int8_t *filter_x,
4497  const int8_t *filter_y,
4498  int32_t weight0,
4499  int32_t weight1,
4500  int32_t offset0,
4501  int32_t offset1,
4502  int32_t rnd_val)
4503 {
4504  uint64_t tp0, tp1;
4506  v8i16 in0 = { 0 };
4507  v16u8 out;
4508  v16i8 src0, src1, src2, src3, src4;
4509  v8i16 filt0, filt1;
4510  v8i16 filt_h0, filt_h1;
4511  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4512  v16i8 mask1;
4513  v8i16 filter_vec, tmp, weight_vec;
4514  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4515  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4516  v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4517 
4518  src0_ptr -= (src_stride + 1);
4519 
4520  filter_vec = LD_SH(filter_x);
4521  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4522 
4523  filter_vec = LD_SH(filter_y);
4524  UNPCK_R_SB_SH(filter_vec, filter_vec);
4525 
4526  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4527 
4528  mask1 = mask0 + 2;
4529 
4530  offset = (offset0 + offset1) << rnd_val;
4531  weight0 = weight0 & 0x0000FFFF;
4532  weight = weight0 | (weight1 << 16);
4533 
4534  const_vec = __msa_fill_w((128 * weight1));
4535  const_vec <<= 6;
4536  offset_vec = __msa_fill_w(offset);
4537  weight_vec = (v8i16) __msa_fill_w(weight);
4538  rnd_vec = __msa_fill_w(rnd_val + 1);
4539  offset_vec += const_vec;
4540 
4541  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4542  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4543 
4544  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4545  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4546  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4547 
4548  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4549  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4550  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4551 
4552  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4553  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4554 
4555  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4556  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4557  dst0 >>= 6;
4558  dst1 >>= 6;
4559  dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4560 
4561  LD2(src1_ptr, src2_stride, tp0, tp1);
4562  INSERT_D2_SH(tp0, tp1, in0);
4563 
4564  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4565  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4566  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4567  SRAR_W2_SW(dst0, dst1, rnd_vec);
4568  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569  tmp = CLIP_SH_0_255_MAX_SATU(tmp);
4570  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4571  ST4x2_UB(out, dst, dst_stride);
4572 }
4573 
4574 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4575  int32_t src_stride,
4576  int16_t *src1_ptr,
4577  int32_t src2_stride,
4578  uint8_t *dst,
4579  int32_t dst_stride,
4580  const int8_t *filter_x,
4581  const int8_t *filter_y,
4582  int32_t weight0,
4583  int32_t weight1,
4584  int32_t offset0,
4585  int32_t offset1,
4586  int32_t rnd_val)
4587 {
4588  uint64_t tp0, tp1;
4590  v16u8 out;
4591  v8i16 in0 = { 0 }, in1 = { 0 };
4592  v16i8 src0, src1, src2, src3, src4, src5, src6;
4593  v8i16 filt0, filt1;
4594  v8i16 filt_h0, filt_h1;
4595  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4596  v16i8 mask1;
4597  v8i16 filter_vec, weight_vec;
4598  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4599  v8i16 tmp0, tmp1, tmp2, tmp3;
4600  v8i16 dst30, dst41, dst52, dst63;
4601  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4602  v4i32 offset_vec, rnd_vec, const_vec;
4603  v4i32 dst0, dst1, dst2, dst3;
4604 
4605  src0_ptr -= (src_stride + 1);
4606 
4607  filter_vec = LD_SH(filter_x);
4608  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4609 
4610  filter_vec = LD_SH(filter_y);
4611  UNPCK_R_SB_SH(filter_vec, filter_vec);
4612 
4613  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4614 
4615  mask1 = mask0 + 2;
4616 
4617  offset = (offset0 + offset1) << rnd_val;
4618  weight0 = weight0 & 0x0000FFFF;
4619  weight = weight0 | (weight1 << 16);
4620 
4621  const_vec = __msa_fill_w((128 * weight1));
4622  const_vec <<= 6;
4623  offset_vec = __msa_fill_w(offset);
4624  weight_vec = (v8i16) __msa_fill_w(weight);
4625  rnd_vec = __msa_fill_w(rnd_val + 1);
4626  offset_vec += const_vec;
4627 
4628  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4629  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4630 
4631  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4632  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4633  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4634  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4635 
4636  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4637  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4638  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4639  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4640 
4641  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4642  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4643  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4644  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4645  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4646  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4647  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4648  SRA_4V(dst0, dst1, dst2, dst3, 6);
4649  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4650 
4651  LD2(src1_ptr, src2_stride, tp0, tp1);
4652  INSERT_D2_SH(tp0, tp1, in0);
4653  src1_ptr += (2 * src2_stride);
4654  LD2(src1_ptr, src2_stride, tp0, tp1);
4655  INSERT_D2_SH(tp0, tp1, in1);
4656 
4657  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4658  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4659 
4660  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4661  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4662  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4663  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4664  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4665  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4666  CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
4667  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4668  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
4669 }
4670 
                                             /* NOTE(review): the opening
                                              * signature line (function name +
                                              * first parameter uint8_t
                                              * *src0_ptr) is missing from this
                                              * extraction; the caller below
                                              * invokes this block as
                                              * hevc_hv_biwgt_4t_4multx8mult_msa
                                              * — confirm against upstream. */
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    /* HEVC 4-tap horizontal + vertical (HxV) interpolation with
     * bi-directional weighted prediction, 4-pixel-wide column, height a
     * multiple of 8.  src0_ptr holds 8-bit reference pixels to be filtered
     * here; src1_ptr holds the 16-bit intermediate samples of the other
     * prediction.  weight0/offset0 pair with the src1_ptr samples,
     * weight1/offset1 with the locally filtered samples (see const_vec
     * bias compensation below). */
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out0, out1;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    /* second half of the mask table: 4-wide shuffle pattern spanning two
     * source registers */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;
    /* NOTE(review): the extraction dropped one declaration line here —
     * 'offset' and 'weight' are used below but not visibly declared;
     * upstream presumably has 'int32_t offset, weight;' — confirm. */

    /* back up one row and one column so the 4-tap window covers the
     * sample before the current position in each direction */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined bi-pred offset, pre-scaled so the final rounding shift
     * (rnd_val + 1) removes it correctly */
    offset = (offset0 + offset1) << rnd_val;
    /* pack both weights into one word: halfword 0 = weight0 (applied to
     * src1_ptr samples), halfword 1 = weight1 (applied to filtered
     * samples) — consumed pairwise by __msa_dpadd_s_w */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* XORI_B*_128 below biases pixels by -128; after H+V filtering and
     * >>6 the filtered samples are short of the true value by 128*64, so
     * add (128 * weight1) << 6 back via the offset vector */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);  /* +1: bi-pred averages two refs */
    offset_vec += const_vec;

    /* prologue: first 3 rows give the two vertical-filter history taps */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    /* main loop: 8 output rows per iteration */
    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* horizontal 4-tap filter, two rows packed per register */
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        /* interleave consecutive filtered rows for the vertical pass */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* load 8 rows of 16-bit samples from the second reference */
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        /* vertical 4-tap filter */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
                    dst2, dst3);
        /* interleave (in, filtered) pairs; dpadd then computes
         * in*weight0 + filtered*weight1 + offset per lane */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        /* round, clamp to 8 bit and store 8 rows of 4 bytes */
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        /* carry vertical-filter history into the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
4814 
4815 static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4816  int32_t src_stride,
4817  int16_t *src1_ptr,
4818  int32_t src2_stride,
4819  uint8_t *dst,
4820  int32_t dst_stride,
4821  const int8_t *filter_x,
4822  const int8_t *filter_y,
4823  int32_t height,
4824  int32_t weight0,
4825  int32_t weight1,
4826  int32_t offset0,
4827  int32_t offset1,
4828  int32_t rnd_val)
4829 {
4830  if (2 == height) {
4831  hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4832  dst, dst_stride, filter_x, filter_y,
4833  weight0, weight1, offset0, offset1, rnd_val);
4834  } else if (4 == height) {
4835  hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4836  dst, dst_stride, filter_x, filter_y,
4837  weight0, weight1, offset0, offset1, rnd_val);
4838  } else if (0 == (height % 8)) {
4839  hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4840  src1_ptr, src2_stride,
4841  dst, dst_stride, filter_x, filter_y,
4842  height, weight0, weight1,
4843  offset0, offset1, rnd_val);
4844  }
4845 }
4846 
/* HEVC 4-tap HxV interpolation with bi-directional weighted prediction for
 * a 6-pixel-wide block with height 8 (height parameter is unused; the body
 * processes exactly 8 rows).  The left 4 columns are produced from the
 * right-interleaved halves and stored with ST4x8; columns 4..5 are produced
 * from the left-interleaved halves and stored with two ST2x4 stores.
 * src0_ptr: 8-bit reference pixels filtered here; src1_ptr: 16-bit
 * intermediate samples of the other prediction.  weight0/offset0 apply to
 * the src1_ptr samples, weight1/offset1 to the locally filtered samples. */
static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    /* NOTE(review): the extraction dropped one declaration line here —
     * 'offset' and 'weight' are used below but not visibly declared;
     * upstream presumably has 'int32_t offset, weight;' — confirm. */
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);  /* 8-wide shuffle pattern */
    v16i8 mask1;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* back up one row and one column for the 4-tap window */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined bi-pred offset, pre-scaled for the rnd_val+1 final shift */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) with weight1 (high halfword) for dpadd */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 pixel bias (XORI_B*_128) on the weight1 path:
     * the filtered samples come out 128*64 low after H+V filtering and >>6 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* 3 prologue rows + 8 processing rows = 11 input rows for 8 outputs */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
           src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    /* horizontal 4-tap filter on the 8 remaining rows */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    /* interleave consecutive rows for the vertical pass; the _l halves are
     * packed in pairs since only 2 of their 8 lanes (cols 4..5) are needed */
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    /* vertical 4-tap filter: 8 right-half rows, 4 packed left-half rows */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);

    /* left 4 columns of the second reference (64-bit loads) */
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in2);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in3);

    /* weighted combine: interleave (in, filtered) then dpadd gives
     * in*weight0 + filtered*weight1 + offset per 32-bit lane */
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                tmp2, tmp3);
    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST4x8_UB(out0, out1, dst, dst_stride);

    /* columns 4..5: combine the left-half results with 32-bit loads of
     * the second reference at column offset 4 */
    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);

    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);

    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);

    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* 2-byte stores at dst+4 complete rows 0..3 then rows 4..7 */
    ST2x4_UB(out2, 0, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST2x4_UB(out2, 4, dst + 4, dst_stride);
}
5030 
/* HEVC 4-tap HxV interpolation with bi-directional weighted prediction,
 * 8-pixel-wide block, fixed height of 2 (no height parameter).
 * src0_ptr: 8-bit reference pixels filtered here; src1_ptr: 16-bit
 * intermediate samples of the other prediction.  weight0/offset0 apply to
 * the src1_ptr samples, weight1/offset1 to the locally filtered samples. */
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    /* NOTE(review): the extraction dropped one declaration line here —
     * 'offset' and 'weight' are used below but not visibly declared;
     * upstream presumably has 'int32_t offset, weight;' — confirm. */
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);  /* 8-wide shuffle pattern */
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 in0, in1;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* back up one row and one column for the 4-tap window */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined bi-pred offset, pre-scaled for the rnd_val+1 final shift */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) with weight1 (high halfword) for dpadd */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 pixel bias (XORI_B5_128) on the weight1 path */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* 5 input rows are needed for 2 output rows of a 4-tap vertical pass */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);

    /* horizontal 4-tap filter per row */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    /* vertical 4-tap filter on interleaved row pairs */
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);

    /* weighted combine: in*weight0 + filtered*weight1 + offset per lane */
    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);

    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST8x2_UB(out, dst, dst_stride);
}
5126 
                                          /* NOTE(review): the opening
                                           * signature line (function name +
                                           * first parameter uint8_t *src0_ptr)
                                           * is missing from this extraction;
                                           * by the width8mult trailing
                                           * parameter this appears to be
                                           * hevc_hv_biwgt_4t_8multx4_msa —
                                           * confirm against upstream. */
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t weight0,
                                          int32_t weight1,
                                          int32_t offset0,
                                          int32_t offset1,
                                          int32_t rnd_val,
                                          int32_t width8mult)
{
    /* HEVC 4-tap HxV bi-weighted interpolation, fixed height of 4,
     * width = 8 * width8mult; iterates horizontally in 8-pixel strips. */
    uint32_t cnt;
    /* NOTE(review): the extraction dropped one declaration line here —
     * 'offset' and 'weight' are used below but not visibly declared;
     * upstream presumably has 'int32_t offset, weight;' — confirm. */
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* back up one row and one column for the 4-tap window */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);  /* 8-wide shuffle pattern */
    mask1 = mask0 + 2;

    /* combined bi-pred offset, pre-scaled for the rnd_val+1 final shift */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) with weight1 (high halfword) for dpadd */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 pixel bias (XORI_B7_128) on the weight1 path */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    /* one 8-pixel-wide, 4-row-high strip per iteration */
    for (cnt = width8mult; cnt--;) {
        /* 7 input rows are needed for 4 outputs of a 4-tap vertical pass */
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;

        /* horizontal 4-tap filter, rows 0..2 (vertical history) */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        /* horizontal 4-tap filter, rows 3..6 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        /* vertical 4-tap filter */
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);

        /* weighted combine: interleave (in, filtered) then dpadd gives
         * in*weight0 + filtered*weight1 + offset per 32-bit lane */
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += 8;  /* advance to the next 8-pixel strip */
    }
}
5250 
/* HEVC 4-tap HxV interpolation with bi-directional weighted prediction,
 * 8-pixel-wide block, fixed height of 6 (no height parameter).
 * src0_ptr: 8-bit reference pixels filtered here; src1_ptr: 16-bit
 * intermediate samples of the other prediction.  weight0/offset0 apply to
 * the src1_ptr samples, weight1/offset1 to the locally filtered samples
 * (see const_vec bias compensation below). */
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);  /* 8-wide shuffle pattern */
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    /* back up one row and one column for the 4-tap window */
    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* combined bi-pred offset, pre-scaled for the rnd_val+1 final shift */
    offset = (offset0 + offset1) << rnd_val;
    /* pack weight0 (low halfword) with weight1 (high halfword) for dpadd */
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    /* compensate the -128 pixel bias (XORI_B*_128) on the weight1 path:
     * the filtered samples come out 128*64 low after H+V filtering and >>6 */
    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    /* 9 input rows are needed for 6 outputs of a 4-tap vertical pass */
    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);

    /* horizontal 4-tap filter per row */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    /* interleave consecutive filtered rows for the vertical pass */
    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);

    /* vertical 4-tap filter, 6 output rows in right/left halves */
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                dst0, dst1, dst2, dst3);

    /* weighted combine of rows 0..3: interleave (in, filtered) then dpadd
     * gives in*weight0 + filtered*weight1 + offset per 32-bit lane */
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);
    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);

    /* weighted combine of the remaining rows 4..5 */
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* store rows 0..3, then rows 4..5 */
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(out2, dst, dst_stride);
}
5401 
5403  int32_t src_stride,
5404  int16_t *src1_ptr,
5405  int32_t src2_stride,
5406  uint8_t *dst,
5407  int32_t dst_stride,
5408  const int8_t *filter_x,
5409  const int8_t *filter_y,
5410  int32_t height,
5411  int32_t weight0,
5412  int32_t weight1,
5413  int32_t offset0,
5414  int32_t offset1,
5415  int32_t rnd_val,
5416  int32_t width)
5417 {
5418  uint32_t loop_cnt;
5419  uint32_t cnt;
5421  uint8_t *src0_ptr_tmp;
5422  int16_t *src1_ptr_tmp;
5423  uint8_t *dst_tmp;
5424  v16u8 out0, out1;
5425  v16i8 src0, src1, src2, src3, src4, src5, src6;
5426  v8i16 in0, in1, in2, in3;
5427  v8i16 filt0, filt1;
5428  v8i16 filt_h0, filt_h1;
5429  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5430  v16i8 mask1;
5431  v8i16 filter_vec;
5432  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5433  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5434  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5435  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5436  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5437  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5438  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5439  v4i32 offset_vec, rnd_vec, const_vec;
5440 
5441  src0_ptr -= (src_stride + 1);
5442 
5443  filter_vec = LD_SH(filter_x);
5444  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5445 
5446  filter_vec = LD_SH(filter_y);
5447  UNPCK_R_SB_SH(filter_vec, filter_vec);
5448 
5449  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5450 
5451  mask1 = mask0 + 2;
5452 
5453  offset = (offset0 + offset1) << rnd_val;
5454  weight0 = weight0 & 0x0000FFFF;
5455  weight = weight0 | (weight1 << 16);
5456 
5457  const_vec = __msa_fill_w((128 * weight1));
5458  const_vec <<= 6;
5459  offset_vec = __msa_fill_w(offset);
5460  weight_vec = (v8i16) __msa_fill_w(weight);
5461  rnd_vec = __msa_fill_w(rnd_val + 1);
5462  offset_vec += const_vec;
5463 
5464  for (cnt = width >> 3; cnt--;) {
5465  src0_ptr_tmp = src0_ptr;
5466  src1_ptr_tmp = src1_ptr;
5467  dst_tmp = dst;
5468 
5469  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5470  src0_ptr_tmp += (3 * src_stride);
5471  XORI_B3_128_SB(src0, src1, src2);
5472 
5473  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5474  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5475  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5476  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5477  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5478  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5479 
5480  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5481  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5482 
5483  for (loop_cnt = height >> 2; loop_cnt--;) {
5484  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5485  src0_ptr_tmp += (4 * src_stride);
5486  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5487  src1_ptr_tmp += (4 * src2_stride);
5488  XORI_B4_128_SB(src3, src4, src5, src6);
5489 
5490  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5491  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5492  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5493  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5494 
5495  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5496  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5497  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5498  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5499 
5500  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5501  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5502  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5503  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5504 
5505  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5506  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5507  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5508  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5509  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5510  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5511  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5512  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5513 
5514  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5515  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5516  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5517  dst3_r, dst0, dst1, dst2, dst3);
5518  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5519  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5520  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5521  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5522  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5523  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5524  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5525  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5526  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5527  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5528  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5529  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5530  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5531  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5532  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5533  tmp0, tmp1, tmp2, tmp3);
5534  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5535  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5536  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5537  dst_tmp += (4 * dst_stride);
5538 
5539  dst10_r = dst54_r;
5540  dst10_l = dst54_l;
5541  dst21_r = dst65_r;
5542  dst21_l = dst65_l;
5543  dsth2 = dsth6;
5544  }
5545 
5546  src0_ptr += 8;
5547  dst += 8;
5548  src1_ptr += 8;
5549  }
5550 }
5551 
/* Dispatch 8-pixel-wide bi-weighted 4-tap HV interpolation to the
 * height-specialized kernels (2, 4, 6, or any multiple of 4 rows). */
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
        break;
    case 6:
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        break;
    default:
        /* generic multi-row kernel handles every remaining height that is
           a multiple of 4; other heights are not expected here */
        if ((height % 4) == 0) {
            hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride,
                                             filter_x, filter_y,
                                             height, weight0, weight1,
                                             offset0, offset1, rnd_val, 8);
        }
        break;
    }
}
5588 
5589 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5590  int32_t src_stride,
5591  int16_t *src1_ptr,
5592  int32_t src2_stride,
5593  uint8_t *dst,
5594  int32_t dst_stride,
5595  const int8_t *filter_x,
5596  const int8_t *filter_y,
5597  int32_t height,
5598  int32_t weight0,
5599  int32_t weight1,
5600  int32_t offset0,
5601  int32_t offset1,
5602  int32_t rnd_val)
5603 {
5604  uint32_t loop_cnt;
5605  uint64_t tp0, tp1;
5607  uint8_t *src0_ptr_tmp, *dst_tmp;
5608  int16_t *src1_ptr_tmp;
5609  v16u8 out0, out1;
5610  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5611  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5612  v16i8 mask0, mask1, mask2, mask3;
5613  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5614  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5615  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5616  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5617  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5618  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5619  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5620  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5621  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5622  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5623  v4i32 offset_vec, rnd_vec, const_vec;
5624 
5625  src0_ptr -= (src_stride + 1);
5626 
5627  filter_vec = LD_SH(filter_x);
5628  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5629 
5630  filter_vec = LD_SH(filter_y);
5631  UNPCK_R_SB_SH(filter_vec, filter_vec);
5632 
5633  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5634 
5635  mask0 = LD_SB(ff_hevc_mask_arr);
5636  mask1 = mask0 + 2;
5637 
5638  offset = (offset0 + offset1) << rnd_val;
5639  weight0 = weight0 & 0x0000FFFF;
5640  weight = weight0 | (weight1 << 16);
5641 
5642  const_vec = __msa_fill_w((128 * weight1));
5643  const_vec <<= 6;
5644  offset_vec = __msa_fill_w(offset);
5645  rnd_vec = __msa_fill_w(rnd_val + 1);
5646  offset_vec += const_vec;
5647  weight_vec = (v8i16) __msa_fill_w(weight);
5648 
5649  src0_ptr_tmp = src0_ptr;
5650  dst_tmp = dst;
5651  src1_ptr_tmp = src1_ptr;
5652 
5653  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5654  src0_ptr_tmp += (3 * src_stride);
5655 
5656  XORI_B3_128_SB(src0, src1, src2);
5657 
5658  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5659  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5660  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5661 
5662  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5663  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5664  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5665 
5666  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5667  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5668 
5669  for (loop_cnt = 4; loop_cnt--;) {
5670  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5671  src0_ptr_tmp += (4 * src_stride);
5672  XORI_B4_128_SB(src3, src4, src5, src6);
5673 
5674  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5675  src1_ptr_tmp += (4 * src2_stride);
5676 
5677  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5678  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5679  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5680  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5681 
5682  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5683  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5684  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5685  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5686 
5687  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5688  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5689  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5690  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5691 
5692  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5693  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5694  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5695  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5696  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5697  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5698  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5699  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5700 
5701  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5702  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5703  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5704  dst3_r, dst0, dst1, dst2, dst3);
5705  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5706  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5707  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5708  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5709  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5710  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5711  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5712  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5713  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5714  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5715  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5716  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5717  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5718  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5719  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5720  tmp0, tmp1, tmp2, tmp3);
5721  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5722  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5723  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5724  dst_tmp += (4 * dst_stride);
5725 
5726  dst10_r = dst54_r;
5727  dst10_l = dst54_l;
5728  dst21_r = dst65_r;
5729  dst21_l = dst65_l;
5730  dsth2 = dsth6;
5731  }
5732 
5733  src0_ptr += 8;
5734  dst += 8;
5735  src1_ptr += 8;
5736 
5737  mask2 = LD_SB(ff_hevc_mask_arr + 16);
5738  mask3 = mask2 + 2;
5739 
5740  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5741  src0_ptr += (3 * src_stride);
5742  XORI_B3_128_SB(src0, src1, src2);
5743  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5744  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5745 
5746  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5747  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5748 
5749  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5750  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5751 
5752  for (loop_cnt = 2; loop_cnt--;) {
5753  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5754  src10);
5755  src0_ptr += (8 * src_stride);
5756  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5757  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5758  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5759  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5760  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5761 
5762  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5763  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5764  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5765  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5766 
5767  dst32_r = __msa_ilvr_h(dst73, dst22);
5768  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5769  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5770  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5771  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5772  dst76_r = __msa_ilvr_h(dst22, dst106);
5773 
5774  LD2(src1_ptr, src2_stride, tp0, tp1);
5775  src1_ptr += 2 * src2_stride;
5776  INSERT_D2_SH(tp0, tp1, in0);
5777  LD2(src1_ptr, src2_stride, tp0, tp1);
5778  src1_ptr += 2 * src2_stride;
5779  INSERT_D2_SH(tp0, tp1, in1);
5780 
5781  LD2(src1_ptr, src2_stride, tp0, tp1);
5782  src1_ptr += 2 * src2_stride;
5783  INSERT_D2_SH(tp0, tp1, in2);
5784  LD2(src1_ptr, src2_stride, tp0, tp1);
5785  src1_ptr += 2 * src2_stride;
5786  INSERT_D2_SH(tp0, tp1, in3);
5787 
5788  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5789  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5790  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5791  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5792  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5793  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5794  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5795  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5796 
5797  SRA_4V(dst0, dst1, dst2, dst3, 6);
5798  SRA_4V(dst4, dst5, dst6, dst7, 6);
5799  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5800  dst0, dst1, dst2, dst3);
5801  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5802  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5803  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5804  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5805  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5806  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5807  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5808  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5809  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5810  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5811  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5812  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5813  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5814  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5815  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5816  tmp0, tmp1, tmp2, tmp3);
5817  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5818  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5819  ST4x8_UB(out0, out1, dst, dst_stride);
5820  dst += (8 * dst_stride);
5821 
5822  dst10_r = dst98_r;
5823  dst21_r = dst109_r;
5824  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5825  }
5826 }
5827 
/* 16-pixel-wide bi-weighted 4-tap HV interpolation: height 4 uses the
 * single-pass kernel, everything else the generic multi-row kernel. */
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    if (height == 4) {
        /* two 8-pixel columns, four rows each */
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 2);
        return;
    }

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 16);
}
5855 
/* 24-pixel-wide bi-weighted 4-tap HV interpolation: thin wrapper that
 * forwards to the generic multiple-of-8 kernel with width = 24. */
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 24);
}
5877 
/* 32-pixel-wide bi-weighted 4-tap HV interpolation: thin wrapper that
 * forwards to the generic multiple-of-8 kernel with width = 32. */
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 32);
}
5899 
/* Generates the public ff_hevc_put_hevc_bi_w_pel_pixels<W>_8_msa entry
 * points: bi-predictive weighted copy (no interpolation) for one block
 * width. Translates the libavcodec weighted-prediction arguments (denom,
 * weights, offsets) into the shift/log2Wd form the internal
 * hevc_biwgt_copy_*w_msa kernels expect. `mx`, `my` and `width` are part
 * of the common mc-function signature and unused here.
 * (Comments must stay outside the #define: a `//` inside a
 * backslash-continued macro would swallow the continuation.) */
#define BI_W_MC_COPY(WIDTH) \
void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
                                                     ptrdiff_t dst_stride, \
                                                     uint8_t *src, \
                                                     ptrdiff_t src_stride, \
                                                     int16_t *src_16bit, \
                                                     int height, \
                                                     int denom, \
                                                     int weight0, \
                                                     int weight1, \
                                                     int offset0, \
                                                     int offset1, \
                                                     intptr_t mx, \
                                                     intptr_t my, \
                                                     int width) \
{ \
    int shift = 14 + 1 - 8; \
    int log2Wd = denom + shift - 1; \
    \
    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
                                   dst, dst_stride, height, \
                                   weight0, weight1, offset0, \
                                   offset1, log2Wd); \
}
5924 
/* Instantiate the weighted bi-pred copy entry points for every HEVC
 * prediction-block width handled by this file. */
BI_W_MC_COPY(4);
BI_W_MC_COPY(6);
BI_W_MC_COPY(8);
BI_W_MC_COPY(12);
BI_W_MC_COPY(16);
BI_W_MC_COPY(24);
BI_W_MC_COPY(32);
BI_W_MC_COPY(48);
BI_W_MC_COPY(64);

#undef BI_W_MC_COPY
5936 
/* Generates the public ff_hevc_put_hevc_bi_w_<pel>_<dir><W>_8_msa entry
 * points for weighted bi-prediction with interpolation.
 *   PEL      : qpel (8-tap luma) or epel (4-tap chroma)
 *   DIR/DIR1 : public suffix (h/v) and internal kernel infix (hz/vt)
 *   WIDTH    : block width; TAP: number of filter taps
 *   FILT_DIR : which fractional offset (mx or my) selects the filter;
 *              `- 1` converts the 1-based fraction to a 0-based table index
 * Maps the libavcodec denom to the internal log2Wd rounding parameter.
 * (Comments must stay outside the #define: a `//` inside a
 * backslash-continued macro would swallow the continuation.) */
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
                                                        ptrdiff_t \
                                                        dst_stride, \
                                                        uint8_t *src, \
                                                        ptrdiff_t \
                                                        src_stride, \
                                                        int16_t *src_16bit, \
                                                        int height, \
                                                        int denom, \
                                                        int weight0, \
                                                        int weight1, \
                                                        int offset0, \
                                                        int offset1, \
                                                        intptr_t mx, \
                                                        intptr_t my, \
                                                        int width) \
{ \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
    int log2Wd = denom + 14 - 8; \
    \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
                                                MAX_PB_SIZE, dst, dst_stride, \
                                                filter, height, weight0, \
                                                weight1, offset0, offset1, \
                                                log2Wd); \
}
5964 
/* 8-tap luma (qpel), horizontal interpolation. */
BI_W_MC(qpel, h, 4, 8, hz, mx);
BI_W_MC(qpel, h, 8, 8, hz, mx);
BI_W_MC(qpel, h, 12, 8, hz, mx);
BI_W_MC(qpel, h, 16, 8, hz, mx);
BI_W_MC(qpel, h, 24, 8, hz, mx);
BI_W_MC(qpel, h, 32, 8, hz, mx);
BI_W_MC(qpel, h, 48, 8, hz, mx);
BI_W_MC(qpel, h, 64, 8, hz, mx);

/* 8-tap luma (qpel), vertical interpolation. */
BI_W_MC(qpel, v, 4, 8, vt, my);
BI_W_MC(qpel, v, 8, 8, vt, my);
BI_W_MC(qpel, v, 12, 8, vt, my);
BI_W_MC(qpel, v, 16, 8, vt, my);
BI_W_MC(qpel, v, 24, 8, vt, my);
BI_W_MC(qpel, v, 32, 8, vt, my);
BI_W_MC(qpel, v, 48, 8, vt, my);
BI_W_MC(qpel, v, 64, 8, vt, my);

/* 4-tap chroma (epel), horizontal interpolation. */
BI_W_MC(epel, h, 4, 4, hz, mx);
BI_W_MC(epel, h, 8, 4, hz, mx);
BI_W_MC(epel, h, 6, 4, hz, mx);
BI_W_MC(epel, h, 12, 4, hz, mx);
BI_W_MC(epel, h, 16, 4, hz, mx);
BI_W_MC(epel, h, 24, 4, hz, mx);
5989 BI_W_MC(epel,