FFmpeg
hevc_mc_biw_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
24 
/*
 * Byte-shuffle index table used by the horizontal 8-tap filters in this file.
 * It holds two rows of 16 indices: the first row is loaded from offset 0,
 * the second from offset 16.  The table is aligned to 64 bytes.
 */
static const uint8_t ff_hevc_mask_arr[32] __attribute__((aligned(64))) = {
    /* 8 width cases (row at index 0) */
    0, 1, 1, 2, 2, 3, 3, 4,
    4, 5, 5, 6, 6, 7, 7, 8,
    /* row at index 16: loaded by the 4-wide path and by the last four
     * columns of the 12-wide path; indices >= 16 presumably select bytes
     * from the second shuffle source vector -- confirm against VSHF_B */
    0, 1, 1, 2, 2, 3, 3, 4,
    16, 17, 17, 18, 18, 19, 19, 20
};
30 
/*
 * Weighted combination of two pairs of halfword vectors.
 *
 * For k = 0, 1: (ink, veck) are interleaved halfword-wise (right and left
 * halves separately), each interleaved vector goes through a signed
 * dot-product-accumulate with 'wgt' on top of 'offset', the 32-bit sums are
 * shifted right with rounding by 'rnd' (SRAR), packed back to halfwords and
 * clipped with CLIP_SH2_0_255.  Results are written to out0 / out1.
 */
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,      \
                           out0, out1)                                  \
{                                                                       \
    v4i32 sum0_r_m, sum0_l_m, sum1_r_m, sum1_l_m;                       \
                                                                        \
    /* pair every sample of inK with the matching sample of vecK */     \
    ILVR_H2_SW(in0, vec0, in1, vec1, sum0_r_m, sum1_r_m);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, sum0_l_m, sum1_l_m);               \
                                                                        \
    /* offset + weighted sum of each pair, in 32-bit lanes */           \
    sum0_r_m = __msa_dpadd_s_w(offset, (v8i16) sum0_r_m, (v8i16) wgt);  \
    sum0_l_m = __msa_dpadd_s_w(offset, (v8i16) sum0_l_m, (v8i16) wgt);  \
    sum1_r_m = __msa_dpadd_s_w(offset, (v8i16) sum1_r_m, (v8i16) wgt);  \
    sum1_l_m = __msa_dpadd_s_w(offset, (v8i16) sum1_l_m, (v8i16) wgt);  \
                                                                        \
    SRAR_W4_SW(sum0_r_m, sum1_r_m, sum0_l_m, sum1_l_m, rnd);            \
    PCKEV_H2_SH(sum0_l_m, sum0_r_m, sum1_l_m, sum1_r_m, out0, out1);    \
    CLIP_SH2_0_255(out0, out1);                                         \
}
48 
/*
 * Four-vector form of HEVC_BIW_RND_CLIP2: processes (in0, in1) with
 * (vec0, vec1) first, then (in2, in3) with (vec2, vec3), using the same
 * weight, rounding and offset vectors for both halves.
 */
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,  \
                           wgt, rnd, offset, out0, out1, out2, out3)    \
{                                                                       \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1,                            \
                       wgt, rnd, offset,                                \
                       out0, out1);                                     \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3,                            \
                       wgt, rnd, offset,                                \
                       out2, out3);                                     \
}
55 
/*
 * Same operation as HEVC_BIW_RND_CLIP2 (interleave, signed dot-product
 * accumulate onto 'offset', rounding shift by 'rnd', pack, clip to 0..255).
 * The two macros perform the identical statement sequence, so this one
 * simply forwards its arguments.
 */
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,       \
                       out0, out1);                                  \
}
71 
/*
 * Four-vector form of HEVC_BIW_RND_CLIP2_MAX_SATU: handles (in0, in1) with
 * (vec0, vec1) and then (in2, in3) with (vec2, vec3), sharing the weight,
 * rounding and offset vectors.
 */
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3,                 \
                                    vec0, vec1, vec2, vec3,             \
                                    wgt, rnd, offset,                   \
                                    out0, out1, out2, out3)             \
{                                                                       \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1,                   \
                                wgt, rnd, offset, out0, out1);          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3,                   \
                                wgt, rnd, offset, out2, out3);          \
}
81 
82 static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
83  int32_t src_stride,
84  int16_t *src1_ptr,
85  int32_t src2_stride,
86  uint8_t *dst,
87  int32_t dst_stride,
89  int32_t weight0,
90  int32_t weight1,
91  int32_t offset0,
92  int32_t offset1,
93  int32_t rnd_val)
94 {
95  uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96  uint64_t tpd0, tpd1, tpd2, tpd3;
98  v16u8 out0, out1;
99  v16i8 zero = { 0 };
100  v16i8 src0 = { 0 }, src1 = { 0 };
101  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102  v8i16 dst0, dst1, dst2, dst3, weight_vec;
103  v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
104 
105  offset = (offset0 + offset1) << rnd_val;
106  weight0 = weight0 & 0x0000FFFF;
107  weight = weight0 | (weight1 << 16);
108 
109  offset_vec = __msa_fill_w(offset);
110  weight_vec = (v8i16) __msa_fill_w(weight);
111  rnd_vec = __msa_fill_w(rnd_val + 1);
112 
113  if (2 == height) {
114  LW2(src0_ptr, src_stride, tp0, tp1);
115  INSERT_W2_SB(tp0, tp1, src0);
116  LD2(src1_ptr, src2_stride, tpd0, tpd1);
117  INSERT_D2_SH(tpd0, tpd1, in0);
118 
119  dst0 = (v8i16) __msa_ilvr_b(zero, src0);
120  dst0 <<= 6;
121 
122  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
123  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
125  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
126  dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
127  CLIP_SH_0_255(dst0);
128  out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129  ST_W2(out0, 0, 1, dst, dst_stride);
130  } else if (4 == height) {
131  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
132  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
133  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
134  INSERT_D2_SH(tpd0, tpd1, in0);
135  INSERT_D2_SH(tpd2, tpd3, in1);
136  ILVRL_B2_SH(zero, src0, dst0, dst1);
137  SLLI_2V(dst0, dst1, 6);
138  HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
139  offset_vec, dst0, dst1);
140  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
142  } else if (0 == height % 8) {
143  for (loop_cnt = (height >> 3); loop_cnt--;) {
144  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145  src0_ptr += 4 * src_stride;
146  INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
147  LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148  src0_ptr += 4 * src_stride;
149  INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
150  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151  src1_ptr += (4 * src2_stride);
152  INSERT_D2_SH(tpd0, tpd1, in0);
153  INSERT_D2_SH(tpd2, tpd3, in1);
154  LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155  src1_ptr += (4 * src2_stride);
156  INSERT_D2_SH(tpd0, tpd1, in2);
157  INSERT_D2_SH(tpd2, tpd3, in3);
158  ILVRL_B2_SH(zero, src0, dst0, dst1);
159  ILVRL_B2_SH(zero, src1, dst2, dst3);
160  SLLI_4V(dst0, dst1, dst2, dst3, 6);
161  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
162  in3, weight_vec, rnd_vec, offset_vec,
163  dst0, dst1, dst2, dst3);
164  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
165  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
166  dst += (8 * dst_stride);
167  }
168  }
169 }
170 
171 static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
172  int32_t src_stride,
173  int16_t *src1_ptr,
174  int32_t src2_stride,
175  uint8_t *dst,
176  int32_t dst_stride,
177  int32_t height,
178  int32_t weight0,
179  int32_t weight1,
180  int32_t offset0,
181  int32_t offset1,
182  int32_t rnd_val)
183 {
184  uint32_t loop_cnt;
186  uint64_t tp0, tp1, tp2, tp3;
187  v16u8 out0, out1;
188  v16i8 zero = { 0 };
189  v16i8 src0 = { 0 }, src1 = { 0 };
190  v8i16 in0, in1, in2, in3;
191  v8i16 dst0, dst1, dst2, dst3;
192  v4i32 offset_vec, weight_vec, rnd_vec;
193 
194  offset = (offset0 + offset1) << rnd_val;
195  weight0 = weight0 & 0x0000FFFF;
196  weight = weight0 | (weight1 << 16);
197 
198  weight_vec = __msa_fill_w(weight);
199  offset_vec = __msa_fill_w(offset);
200  rnd_vec = __msa_fill_w(rnd_val + 1);
201 
202  for (loop_cnt = (height >> 2); loop_cnt--;) {
203  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204  src0_ptr += (4 * src_stride);
205  INSERT_D2_SB(tp0, tp1, src0);
206  INSERT_D2_SB(tp2, tp3, src1);
207  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208  src1_ptr += (4 * src2_stride);
209  ILVRL_B2_SH(zero, src0, dst0, dst1);
210  ILVRL_B2_SH(zero, src1, dst2, dst3);
211  SLLI_4V(dst0, dst1, dst2, dst3, 6);
212  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
213  in0, in1, in2, in3,
214  weight_vec, rnd_vec, offset_vec,
215  dst0, dst1, dst2, dst3);
216  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
217  ST_W2(out0, 0, 2, dst, dst_stride);
218  ST_H2(out0, 2, 6, dst + 4, dst_stride);
219  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
220  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
221  dst += (4 * dst_stride);
222  }
223 }
224 
225 static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
226  int32_t src_stride,
227  int16_t *src1_ptr,
228  int32_t src2_stride,
229  uint8_t *dst,
230  int32_t dst_stride,
231  int32_t height,
232  int32_t weight0,
233  int32_t weight1,
234  int32_t offset0,
235  int32_t offset1,
236  int32_t rnd_val)
237 {
238  uint64_t tp0, tp1, tp2, tp3;
240  v16u8 out0, out1, out2;
241  v16i8 zero = { 0 };
242  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
243  v8i16 in0, in1, in2, in3, in4, in5;
244  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
245  v4i32 offset_vec, weight_vec, rnd_vec;
246 
247  offset = (offset0 + offset1) << rnd_val;
248  weight0 = weight0 & 0x0000FFFF;
249  weight = weight0 | (weight1 << 16);
250 
251  offset_vec = __msa_fill_w(offset);
252  weight_vec = __msa_fill_w(weight);
253  rnd_vec = __msa_fill_w(rnd_val + 1);
254 
255  if (2 == height) {
256  LD2(src0_ptr, src_stride, tp0, tp1);
257  INSERT_D2_SB(tp0, tp1, src0);
258  LD_SH2(src1_ptr, src2_stride, in0, in1);
259  ILVRL_B2_SH(zero, src0, dst0, dst1);
260  SLLI_2V(dst0, dst1, 6);
261 
262  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
263  weight_vec, rnd_vec, offset_vec,
264  dst0, dst1);
265 
266  out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
267  ST_D2(out0, 0, 1, dst, dst_stride);
268  } else if (6 == height) {
269  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
270  src0_ptr += 4 * src_stride;
271  INSERT_D2_SB(tp0, tp1, src0);
272  INSERT_D2_SB(tp2, tp3, src1);
273  LD2(src0_ptr, src_stride, tp0, tp1);
274  INSERT_D2_SB(tp0, tp1, src2);
275  ILVRL_B2_SH(zero, src0, dst0, dst1);
276  ILVRL_B2_SH(zero, src1, dst2, dst3);
277  ILVRL_B2_SH(zero, src2, dst4, dst5);
278  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
279  SLLI_4V(dst0, dst1, dst2, dst3, 6);
280  SLLI_2V(dst4, dst5, 6);
281  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
282  weight_vec, rnd_vec, offset_vec, dst0, dst1,
283  dst2, dst3);
284  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
285  offset_vec, dst4, dst5);
286  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
287  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
288  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
289  } else if (0 == height % 4) {
290  uint32_t loop_cnt;
291 
292  for (loop_cnt = (height >> 2); loop_cnt--;) {
293  LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
294  src0_ptr += (4 * src_stride);
295  INSERT_D2_SB(tp0, tp1, src0);
296  INSERT_D2_SB(tp2, tp3, src1);
297  ILVRL_B2_SH(zero, src0, dst0, dst1);
298  ILVRL_B2_SH(zero, src1, dst2, dst3);
299  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
300  src1_ptr += (4 * src2_stride);
301 
302  SLLI_4V(dst0, dst1, dst2, dst3, 6);
303  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
304  in3, weight_vec, rnd_vec, offset_vec,
305  dst0, dst1, dst2, dst3);
306  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
307  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
308  dst += (4 * dst_stride);
309  }
310  }
311 }
312 
313 static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
314  int32_t src_stride,
315  int16_t *src1_ptr,
316  int32_t src2_stride,
317  uint8_t *dst,
318  int32_t dst_stride,
319  int32_t height,
320  int32_t weight0,
321  int32_t weight1,
322  int32_t offset0,
323  int32_t offset1,
324  int32_t rnd_val)
325 {
326  uint32_t loop_cnt;
328  v16i8 zero = { 0 };
329  v16u8 out0, out1, out2;
330  v16i8 src0, src1, src2, src3;
331  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
332  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
333  v4i32 offset_vec, weight_vec, rnd_vec;
334 
335  offset = (offset0 + offset1) << rnd_val;
336  weight0 = weight0 & 0x0000FFFF;
337  weight = weight0 | (weight1 << 16);
338 
339  offset_vec = __msa_fill_w(offset);
340  weight_vec = __msa_fill_w(weight);
341  rnd_vec = __msa_fill_w(rnd_val + 1);
342 
343  for (loop_cnt = (16 >> 2); loop_cnt--;) {
344  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
345  src0_ptr += (4 * src_stride);
346  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
347  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
348  src1_ptr += (4 * src2_stride);
349 
350  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
351  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
352  dst0, dst1, dst2, dst3);
353 
354  SLLI_4V(dst0, dst1, dst2, dst3, 6);
355  ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
356  ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
357 
358  dst4 <<= 6;
359  dst5 <<= 6;
360  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
361  weight_vec, rnd_vec, offset_vec, dst0, dst1,
362  dst2, dst3);
363  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
364  offset_vec, dst4, dst5);
365  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
366  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
367  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
368  dst += (4 * dst_stride);
369  }
370 }
371 
372 static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
373  int32_t src_stride,
374  int16_t *src1_ptr,
375  int32_t src2_stride,
376  uint8_t *dst,
377  int32_t dst_stride,
378  int32_t height,
379  int32_t weight0,
380  int32_t weight1,
381  int32_t offset0,
382  int32_t offset1,
383  int32_t rnd_val)
384 {
385  uint32_t loop_cnt;
387  v16u8 out0, out1, out2, out3;
388  v16i8 zero = { 0 };
389  v16i8 src0, src1, src2, src3;
390  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
391  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
392  v4i32 offset_vec, weight_vec, rnd_vec;
393 
394  offset = (offset0 + offset1) << rnd_val;
395  weight0 = weight0 & 0x0000FFFF;
396  weight = weight0 | (weight1 << 16);
397 
398  offset_vec = __msa_fill_w(offset);
399  weight_vec = __msa_fill_w(weight);
400  rnd_vec = __msa_fill_w(rnd_val + 1);
401 
402  for (loop_cnt = (height >> 2); loop_cnt--;) {
403  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
404  src0_ptr += (4 * src_stride);
405  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
406  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
407  src1_ptr += (4 * src2_stride);
408  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
409  tmp2, tmp3);
410  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
411  tmp6, tmp7);
412  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
413  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
414  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
415  weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
416  tmp4, tmp5);
417  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
418  weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
419  tmp6, tmp7);
420  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
421  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
422  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
423  dst += (4 * dst_stride);
424  }
425 }
426 
427 static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
428  int32_t src_stride,
429  int16_t *src1_ptr,
430  int32_t src2_stride,
431  uint8_t *dst,
432  int32_t dst_stride,
433  int32_t height,
434  int32_t weight0,
435  int32_t weight1,
436  int32_t offset0,
437  int32_t offset1,
438  int32_t rnd_val)
439 {
440  uint32_t loop_cnt;
442  v16u8 out0, out1, out2, out3, out4, out5;
443  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
444  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
445  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
446  v4i32 offset_vec, weight_vec, rnd_vec;
447 
448  offset = (offset0 + offset1) << rnd_val;
449  weight0 = weight0 & 0x0000FFFF;
450  weight = weight0 | (weight1 << 16);
451 
452  offset_vec = __msa_fill_w(offset);
453  weight_vec = __msa_fill_w(weight);
454  rnd_vec = __msa_fill_w(rnd_val + 1);
455 
456  for (loop_cnt = 8; loop_cnt--;) {
457  LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
458  LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
459  src0_ptr += (4 * src_stride);
460  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
461  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
462  LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
463  src1_ptr += (4 * src2_stride);
464 
465  ILVRL_B2_SH(zero, src0, dst0, dst1);
466  ILVRL_B2_SH(zero, src1, dst2, dst3);
467  ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
468  ILVRL_B2_SH(zero, src4, dst6, dst7);
469  ILVRL_B2_SH(zero, src5, dst8, dst9);
470  ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
471  SLLI_4V(dst0, dst1, dst2, dst3, 6);
472  SLLI_4V(dst4, dst5, dst6, dst7, 6);
473  SLLI_4V(dst8, dst9, dst10, dst11, 6);
474  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
475  weight_vec, rnd_vec, offset_vec, dst0, dst1,
476  dst2, dst3);
477  HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
478  weight_vec, rnd_vec, offset_vec, dst4, dst5,
479  dst6, dst7);
480  HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
481  in11, weight_vec, rnd_vec, offset_vec,
482  dst8, dst9, dst10, dst11);
483  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
484  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
485  ST_UB4(out0, out1, out3, out4, dst, dst_stride);
486  ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
487  dst += (4 * dst_stride);
488  }
489 }
490 
491 static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
492  int32_t src_stride,
493  int16_t *src1_ptr,
494  int32_t src2_stride,
495  uint8_t *dst,
496  int32_t dst_stride,
497  int32_t height,
498  int32_t weight0,
499  int32_t weight1,
500  int32_t offset0,
501  int32_t offset1,
502  int32_t rnd_val)
503 {
504  uint32_t loop_cnt;
506  v16u8 out0, out1, out2, out3;
507  v16i8 zero = { 0 };
508  v16i8 src0, src1, src2, src3;
509  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
510  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
511  v4i32 offset_vec, weight_vec, rnd_vec;
512 
513  offset = (offset0 + offset1) << rnd_val;
514  weight0 = weight0 & 0x0000FFFF;
515  weight = weight0 | (weight1 << 16);
516 
517  offset_vec = __msa_fill_w(offset);
518  weight_vec = __msa_fill_w(weight);
519  rnd_vec = __msa_fill_w(rnd_val + 1);
520 
521  for (loop_cnt = (height >> 1); loop_cnt--;) {
522  LD_SB2(src0_ptr, 16, src0, src1);
523  src0_ptr += src_stride;
524  LD_SB2(src0_ptr, 16, src2, src3);
525  src0_ptr += src_stride;
526  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
527  src1_ptr += src2_stride;
528  LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
529  src1_ptr += src2_stride;
530 
531  ILVRL_B2_SH(zero, src0, tmp0, tmp4);
532  ILVRL_B2_SH(zero, src1, tmp1, tmp5);
533  ILVRL_B2_SH(zero, src2, tmp2, tmp6);
534  ILVRL_B2_SH(zero, src3, tmp3, tmp7);
535  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
536  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
537  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
538  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
539  tmp1, tmp5);
540  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
541  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
542  tmp3, tmp7);
543  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
544  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
545  ST_UB2(out0, out1, dst, 16);
546  dst += dst_stride;
547  ST_UB2(out2, out3, dst, 16);
548  dst += dst_stride;
549  }
550 }
551 
552 static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
553  int32_t src_stride,
554  int16_t *src1_ptr,
555  int32_t src2_stride,
556  uint8_t *dst,
557  int32_t dst_stride,
558  int32_t height,
559  int32_t weight0,
560  int32_t weight1,
561  int32_t offset0,
562  int32_t offset1,
563  int32_t rnd_val)
564 {
565  uint32_t loop_cnt;
567  v16u8 out0, out1, out2;
568  v16i8 src0, src1, src2;
569  v16i8 zero = { 0 };
570  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
571  v4i32 offset_vec, weight_vec, rnd_vec;
572 
573  offset = (offset0 + offset1) << rnd_val;
574  weight0 = weight0 & 0x0000FFFF;
575  weight = weight0 | (weight1 << 16);
576 
577  offset_vec = __msa_fill_w(offset);
578  weight_vec = __msa_fill_w(weight);
579  rnd_vec = __msa_fill_w(rnd_val + 1);
580 
581  for (loop_cnt = 64; loop_cnt--;) {
582  LD_SB3(src0_ptr, 16, src0, src1, src2);
583  src0_ptr += src_stride;
584  LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
585  src1_ptr += src2_stride;
586 
587  ILVRL_B2_SH(zero, src0, dst0, dst1);
588  ILVRL_B2_SH(zero, src1, dst2, dst3);
589  ILVRL_B2_SH(zero, src2, dst4, dst5);
590  SLLI_4V(dst0, dst1, dst2, dst3, 6);
591  SLLI_2V(dst4, dst5, 6);
592  HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
593  weight_vec, rnd_vec, offset_vec, dst0, dst1,
594  dst2, dst3);
595  HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
596  offset_vec, dst4, dst5);
597  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
598  ST_UB2(out0, out1, dst, 16);
599  ST_UB(out2, dst + 32);
600  dst += dst_stride;
601  }
602 }
603 
604 static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
605  int32_t src_stride,
606  int16_t *src1_ptr,
607  int32_t src2_stride,
608  uint8_t *dst,
609  int32_t dst_stride,
610  int32_t height,
611  int32_t weight0,
612  int32_t weight1,
613  int32_t offset0,
614  int32_t offset1,
615  int32_t rnd_val)
616 {
617  uint32_t loop_cnt;
619  v16u8 out0, out1, out2, out3;
620  v16i8 zero = { 0 };
621  v16i8 src0, src1, src2, src3;
622  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
623  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
624  v4i32 offset_vec, weight_vec, rnd_vec;
625 
626  offset = (offset0 + offset1) << rnd_val;
627  weight0 = weight0 & 0x0000FFFF;
628  weight = weight0 | (weight1 << 16);
629 
630  offset_vec = __msa_fill_w(offset);
631  weight_vec = __msa_fill_w(weight);
632  rnd_vec = __msa_fill_w(rnd_val + 1);
633 
634  for (loop_cnt = height; loop_cnt--;) {
635  LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
636  src0_ptr += src_stride;
637  LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
638  src1_ptr += src2_stride;
639 
640  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
641  tmp2, tmp3);
642  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
643  tmp6, tmp7);
644  SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
645  SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
646  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
647  weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
648  tmp1, tmp5);
649  HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
650  weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
651  tmp3, tmp7);
652  PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
653  PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
654  ST_UB4(out0, out1, out2, out3, dst, 16);
655  dst += dst_stride;
656  }
657 }
658 
659 static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
660  int32_t src_stride,
661  int16_t *src1_ptr,
662  int32_t src2_stride,
663  uint8_t *dst,
664  int32_t dst_stride,
665  const int8_t *filter,
666  int32_t height,
667  int32_t weight0,
668  int32_t weight1,
669  int32_t offset0,
670  int32_t offset1,
671  int32_t rnd_val)
672 {
673  uint32_t loop_cnt;
674  int32_t offset, weight, constant;
675  v8i16 filt0, filt1, filt2, filt3;
676  v16i8 src0, src1, src2, src3;
677  v16i8 mask1, mask2, mask3;
678  v16i8 vec0, vec1, vec2, vec3;
679  v8i16 dst0, dst1;
680  v8i16 in0, in1, in2, in3;
681  v8i16 filter_vec, out0, out1;
682  v4i32 weight_vec, offset_vec, rnd_vec;
683  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
684 
685  src0_ptr -= 3;
686  filter_vec = LD_SH(filter);
687  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
688 
689  mask1 = mask0 + 2;
690  mask2 = mask0 + 4;
691  mask3 = mask0 + 6;
692 
693  offset = (offset0 + offset1) << rnd_val;
694  weight0 = weight0 & 0x0000FFFF;
695  weight = weight0 | (weight1 << 16);
696  constant = 128 * weight1;
697  constant <<= 6;
698  offset += constant;
699 
700  offset_vec = __msa_fill_w(offset);
701  weight_vec = __msa_fill_w(weight);
702  rnd_vec = __msa_fill_w(rnd_val + 1);
703 
704  for (loop_cnt = (height >> 2); loop_cnt--;) {
705  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
706  src0_ptr += (4 * src_stride);
707  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
708  src1_ptr += (4 * src2_stride);
709  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
710  XORI_B4_128_SB(src0, src1, src2, src3);
711 
712  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
713  vec0, vec1, vec2, vec3);
714  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
715  filt3);
716  VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
717  vec0, vec1, vec2, vec3);
718  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
719  filt3);
720 
721  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
722  weight_vec, rnd_vec, offset_vec,
723  out0, out1);
724 
725  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
726  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
727  dst += (4 * dst_stride);
728  }
729 }
730 
731 static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
732  int32_t src_stride,
733  int16_t *src1_ptr,
734  int32_t src2_stride,
735  uint8_t *dst,
736  int32_t dst_stride,
737  const int8_t *filter,
738  int32_t height,
739  int32_t weight0,
740  int32_t weight1,
741  int32_t offset0,
742  int32_t offset1,
743  int32_t rnd_val)
744 {
745  uint32_t loop_cnt;
746  int32_t offset, weight, constant;
747  v8i16 filt0, filt1, filt2, filt3;
748  v16i8 src0, src1, src2, src3;
749  v16i8 mask1, mask2, mask3;
750  v16i8 vec0, vec1, vec2, vec3;
751  v8i16 dst0, dst1, dst2, dst3;
752  v8i16 in0, in1, in2, in3;
753  v8i16 filter_vec, out0, out1, out2, out3;
754  v4i32 weight_vec, offset_vec, rnd_vec;
755  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
756 
757  src0_ptr -= 3;
758  offset = (offset0 + offset1) << rnd_val;
759  weight0 = weight0 & 0x0000FFFF;
760  weight = weight0 | (weight1 << 16);
761  constant = 128 * weight1;
762  constant <<= 6;
763  offset += constant;
764 
765  offset_vec = __msa_fill_w(offset);
766  weight_vec = __msa_fill_w(weight);
767  rnd_vec = __msa_fill_w(rnd_val + 1);
768 
769  filter_vec = LD_SH(filter);
770  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
771 
772  mask1 = mask0 + 2;
773  mask2 = mask0 + 4;
774  mask3 = mask0 + 6;
775 
776  for (loop_cnt = (height >> 2); loop_cnt--;) {
777  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
778  src0_ptr += (4 * src_stride);
779  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
780  src1_ptr += (4 * src2_stride);
781  XORI_B4_128_SB(src0, src1, src2, src3);
782 
783  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
784  vec0, vec1, vec2, vec3);
785  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
786  filt3);
787  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
788  vec0, vec1, vec2, vec3);
789  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
790  filt3);
791  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
792  vec0, vec1, vec2, vec3);
793  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
794  filt3);
795  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
796  vec0, vec1, vec2, vec3);
797  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
798  filt3);
799 
800  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
801  in0, in1, in2, in3,
802  weight_vec, rnd_vec, offset_vec,
803  out0, out1, out2, out3);
804 
805  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
806  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
807  dst += (4 * dst_stride);
808  }
809 }
810 
811 static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
812  int32_t src_stride,
813  int16_t *src1_ptr,
814  int32_t src2_stride,
815  uint8_t *dst,
816  int32_t dst_stride,
817  const int8_t *filter,
818  int32_t height,
819  int32_t weight0,
820  int32_t weight1,
821  int32_t offset0,
822  int32_t offset1,
823  int32_t rnd_val)
824 {
825  uint32_t loop_cnt;
826  int32_t offset, weight, constant;
827  v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
828  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829  v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
830  v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
831  v4i32 weight_vec, offset_vec, rnd_vec;
832 
833  src0_ptr -= 3;
834 
835  weight0 = weight0 & 0x0000FFFF;
836  weight = weight0 | (weight1 << 16);
837  constant = 128 * weight1;
838  constant <<= 6;
839  offset = (offset0 + offset1) << rnd_val;
840  offset += constant;
841 
842  offset_vec = __msa_fill_w(offset);
843  weight_vec = __msa_fill_w(weight);
844  rnd_vec = __msa_fill_w(rnd_val + 1);
845 
846  filter_vec = LD_SH(filter);
847  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
848 
849  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
850  mask1 = mask0 + 2;
851  mask2 = mask0 + 4;
852  mask3 = mask0 + 6;
853  mask4 = LD_SB(&ff_hevc_mask_arr[16]);
854  mask5 = mask4 + 2;
855  mask6 = mask4 + 4;
856  mask7 = mask4 + 6;
857 
858  for (loop_cnt = 4; loop_cnt--;) {
859  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
860  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
861  XORI_B4_128_SB(src0, src1, src2, src3);
862  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
863  vec3);
864  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
865  filt3);
866  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
867  vec3);
868  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
869  filt3);
870  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
871  vec3);
872  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
873  filt3);
874  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
875  vec3);
876  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
877  filt3);
878  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
879  weight_vec, rnd_vec, offset_vec, out0, out1, out2,
880  out3);
881  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
882  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
883 
884  LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
885  src0_ptr += (4 * src_stride);
886  LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
887  src1_ptr += (4 * src2_stride);
888  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
889  XORI_B4_128_SB(src0, src1, src2, src3);
890  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
891  vec3);
892  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
893  filt3);
894  VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
895  vec3);
896  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
897  filt3);
898  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
899  offset_vec, out0, out1);
900  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
901  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
902  dst += (4 * dst_stride);
903  }
904 }
905 
906 static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
907  int32_t src_stride,
908  int16_t *src1_ptr,
909  int32_t src2_stride,
910  uint8_t *dst,
911  int32_t dst_stride,
912  const int8_t *filter,
913  int32_t height,
914  int32_t weight0,
915  int32_t weight1,
916  int32_t offset0,
917  int32_t offset1,
918  int32_t rnd_val)
919 {
920  uint32_t loop_cnt;
921  int32_t offset, weight, constant;
922  v16i8 src0, src1, src2, src3;
923  v8i16 in0, in1, in2, in3;
924  v8i16 filt0, filt1, filt2, filt3;
925  v16i8 mask1, mask2, mask3;
926  v8i16 filter_vec, out0, out1, out2, out3;
927  v16i8 vec0, vec1, vec2, vec3;
928  v8i16 dst0, dst1, dst2, dst3;
929  v4i32 weight_vec, offset_vec, rnd_vec;
930  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
931 
932  src0_ptr -= 3;
933  offset = (offset0 + offset1) << rnd_val;
934  weight0 = weight0 & 0x0000FFFF;
935  weight = weight0 | (weight1 << 16);
936  constant = 128 * weight1;
937  constant <<= 6;
938  offset += constant;
939 
940  offset_vec = __msa_fill_w(offset);
941  weight_vec = __msa_fill_w(weight);
942  rnd_vec = __msa_fill_w(rnd_val + 1);
943 
944  filter_vec = LD_SH(filter);
945  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
946 
947  mask1 = mask0 + 2;
948  mask2 = mask0 + 4;
949  mask3 = mask0 + 6;
950 
951  for (loop_cnt = (height >> 1); loop_cnt--;) {
952  LD_SB2(src0_ptr, 8, src0, src1);
953  src0_ptr += src_stride;
954  LD_SB2(src0_ptr, 8, src2, src3);
955  src0_ptr += src_stride;
956  LD_SH2(src1_ptr, 8, in0, in1);
957  src1_ptr += src2_stride;
958  LD_SH2(src1_ptr, 8, in2, in3);
959  src1_ptr += src2_stride;
960  XORI_B4_128_SB(src0, src1, src2, src3);
961 
962  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
963  vec0, vec1, vec2, vec3);
964  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
965  filt3);
966  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
967  vec0, vec1, vec2, vec3);
968  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
969  filt3);
970  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
971  vec0, vec1, vec2, vec3);
972  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
973  filt3);
974  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
975  vec0, vec1, vec2, vec3);
976  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
977  filt3);
978 
979  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
980  in0, in1, in2, in3,
981  weight_vec, rnd_vec, offset_vec,
982  out0, out1, out2, out3);
983 
984  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
985  ST_SH2(out0, out1, dst, dst_stride);
986  dst += (2 * dst_stride);
987  }
988 }
989 
990 static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
991  int32_t src_stride,
992  int16_t *src1_ptr,
993  int32_t src2_stride,
994  uint8_t *dst,
995  int32_t dst_stride,
996  const int8_t *filter,
997  int32_t height,
998  int32_t weight0,
999  int32_t weight1,
1000  int32_t offset0,
1001  int32_t offset1,
1002  int32_t rnd_val)
1003 {
1004  uint32_t loop_cnt;
1005  uint64_t dst_val0;
1006  int32_t offset, weight, constant;
1007  v16i8 src0, src1;
1008  v8i16 in0, in1, in2;
1009  v8i16 filt0, filt1, filt2, filt3;
1010  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1011  v16i8 vec0, vec1, vec2, vec3;
1012  v8i16 dst0, dst1, dst2;
1013  v4i32 dst2_r, dst2_l;
1014  v8i16 filter_vec, out0, out1, out2;
1015  v4i32 weight_vec, offset_vec, rnd_vec;
1016  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1017 
1018  src0_ptr = src0_ptr - 3;
1019  offset = (offset0 + offset1) << rnd_val;
1020  weight0 = weight0 & 0x0000FFFF;
1021  weight = weight0 | (weight1 << 16);
1022  constant = 128 * weight1;
1023  constant <<= 6;
1024  offset += constant;
1025 
1026  offset_vec = __msa_fill_w(offset);
1027  weight_vec = __msa_fill_w(weight);
1028  rnd_vec = __msa_fill_w(rnd_val + 1);
1029 
1030  filter_vec = LD_SH(filter);
1031  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1032 
1033  mask1 = mask0 + 2;
1034  mask2 = mask0 + 4;
1035  mask3 = mask0 + 6;
1036  mask4 = mask0 + 8;
1037  mask5 = mask0 + 10;
1038  mask6 = mask0 + 12;
1039  mask7 = mask0 + 14;
1040 
1041  LD_SB2(src0_ptr, 16, src0, src1);
1042  src0_ptr += src_stride;
1043  LD_SH2(src1_ptr, 8, in0, in1);
1044  in2 = LD_SH(src1_ptr + 16);
1045  src1_ptr += src2_stride;
1047 
1048  for (loop_cnt = 31; loop_cnt--;) {
1049  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1050  vec0, vec1, vec2, vec3);
1051  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1052  filt3);
1053  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1054  vec0, vec1, vec2, vec3);
1055  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1056  filt3);
1057  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1058  vec0, vec1, vec2, vec3);
1059  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1060  filt3);
1061 
1062  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
1063  weight_vec, rnd_vec, offset_vec,
1064  out0, out1);
1065 
1066  ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1067  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1068  (v8i16) weight_vec);
1069  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1070  (v8i16) weight_vec);
1071  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1072  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1073  CLIP_SH_0_255(out2);
1074 
1075  LD_SB2(src0_ptr, 16, src0, src1);
1076  src0_ptr += src_stride;
1077  LD_SH2(src1_ptr, 8, in0, in1);
1078  in2 = LD_SH(src1_ptr + 16);
1079  src1_ptr += src2_stride;
1081  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1082  dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1083  ST_SH(out0, dst);
1084  SD(dst_val0, dst + 16);
1085  dst += dst_stride;
1086  }
1087 
1088  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1089  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1090  filt3);
1091  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1092  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1093  filt3);
1094  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1095  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1096  filt3);
1097  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
1098  out0, out1);
1099  ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1100  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1101  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1102  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1103  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1104  CLIP_SH_0_255(out2);
1105  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1106  dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1107  ST_SH(out0, dst);
1108  SD(dst_val0, dst + 16);
1109  dst += dst_stride;
1110 }
1111 
1112 static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1113  int32_t src_stride,
1114  int16_t *src1_ptr,
1115  int32_t src2_stride,
1116  uint8_t *dst,
1117  int32_t dst_stride,
1118  const int8_t *filter,
1119  int32_t height,
1120  int32_t weight0,
1121  int32_t weight1,
1122  int32_t offset0,
1123  int32_t offset1,
1124  int32_t rnd_val)
1125 {
1126  uint32_t loop_cnt;
1127  int32_t offset, weight, constant;
1128  v16i8 src0, src1, src2;
1129  v8i16 in0, in1, in2, in3;
1130  v8i16 filt0, filt1, filt2, filt3;
1131  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1132  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1133  v16i8 vec0, vec1, vec2, vec3;
1134  v8i16 dst0, dst1, dst2, dst3;
1135  v8i16 filter_vec, out0, out1, out2, out3;
1136  v4i32 weight_vec, offset_vec, rnd_vec;
1137 
1138  src0_ptr -= 3;
1139  offset = (offset0 + offset1) << rnd_val;
1140  weight0 = weight0 & 0x0000FFFF;
1141  weight = weight0 | (weight1 << 16);
1142  constant = 128 * weight1;
1143  constant <<= 6;
1144  offset += constant;
1145 
1146  offset_vec = __msa_fill_w(offset);
1147  weight_vec = __msa_fill_w(weight);
1148  rnd_vec = __msa_fill_w(rnd_val + 1);
1149 
1150  filter_vec = LD_SH(filter);
1151  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1152 
1153  mask1 = mask0 + 2;
1154  mask2 = mask0 + 4;
1155  mask3 = mask0 + 6;
1156  mask4 = mask0 + 8;
1157  mask5 = mask0 + 10;
1158  mask6 = mask0 + 12;
1159  mask7 = mask0 + 14;
1160 
1161  for (loop_cnt = height; loop_cnt--;) {
1162  LD_SB2(src0_ptr, 16, src0, src1);
1163  src2 = LD_SB(src0_ptr + 24);
1164  src0_ptr += src_stride;
1165  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1166  src1_ptr += src2_stride;
1167 
1168  XORI_B3_128_SB(src0, src1, src2);
1169 
1170  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1171  vec0, vec1, vec2, vec3);
1172  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1173  filt3);
1174  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1175  vec0, vec1, vec2, vec3);
1176  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1177  filt3);
1178  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1179  vec0, vec1, vec2, vec3);
1180  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1181  filt3);
1182  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1183  vec0, vec1, vec2, vec3);
1184  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1185  filt3);
1186 
1187  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1188  in0, in1, in2, in3,
1189  weight_vec, rnd_vec, offset_vec,
1190  out0, out1, out2, out3);
1191 
1192  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1193  ST_SH2(out0, out1, dst, 16);
1194  dst += dst_stride;
1195  }
1196 }
1197 
1198 static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1199  int32_t src_stride,
1200  int16_t *src1_ptr,
1201  int32_t src2_stride,
1202  uint8_t *dst,
1203  int32_t dst_stride,
1204  const int8_t *filter,
1205  int32_t height,
1206  int32_t weight0,
1207  int32_t weight1,
1208  int32_t offset0,
1209  int32_t offset1,
1210  int32_t rnd_val)
1211 {
1212  uint32_t loop_cnt;
1213  int32_t offset, weight, constant;
1214  v16i8 src0, src1, src2, src3, src4;
1215  v8i16 in0, in1, in2, in3;
1216  v8i16 filt0, filt1, filt2, filt3;
1217  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1218  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1219  v16i8 vec0, vec1, vec2, vec3;
1220  v8i16 dst0, dst1, dst2, dst3;
1221  v8i16 filter_vec, out0, out1, out2, out3;
1222  v4i32 weight_vec, offset_vec, rnd_vec;
1223 
1224  src0_ptr -= 3;
1225  offset = (offset0 + offset1) << rnd_val;
1226  weight0 = weight0 & 0x0000FFFF;
1227  weight = weight0 | (weight1 << 16);
1228  constant = 128 * weight1;
1229  constant <<= 6;
1230  offset += constant;
1231 
1232  offset_vec = __msa_fill_w(offset);
1233  weight_vec = __msa_fill_w(weight);
1234  rnd_vec = __msa_fill_w(rnd_val + 1);
1235 
1236  filter_vec = LD_SH(filter);
1237  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1238 
1239  mask1 = mask0 + 2;
1240  mask2 = mask0 + 4;
1241  mask3 = mask0 + 6;
1242  mask4 = mask0 + 8;
1243  mask5 = mask0 + 10;
1244  mask6 = mask0 + 12;
1245  mask7 = mask0 + 14;
1246 
1247  for (loop_cnt = 64; loop_cnt--;) {
1248  LD_SB2(src0_ptr, 16, src0, src1);
1249  src2 = LD_SB(src0_ptr + 24);
1250  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1251  XORI_B3_128_SB(src0, src1, src2);
1252  LD_SB2(src0_ptr + 32, 8, src3, src4);
1253  src0_ptr += src_stride;
1254  XORI_B2_128_SB(src3, src4);
1255 
1256  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1257  vec0, vec1, vec2, vec3);
1258  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1259  filt3);
1260  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1261  vec0, vec1, vec2, vec3);
1262  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1263  filt3);
1264  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1265  vec0, vec1, vec2, vec3);
1266  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1267  filt3);
1268  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1269  vec0, vec1, vec2, vec3);
1270  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1271  filt3);
1272 
1273  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
1274  weight_vec, rnd_vec, offset_vec,
1275  out0, out1, out2, out3);
1276 
1277  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1278  ST_SH2(out0, out1, dst, 16);
1279 
1280  LD_SH2(src1_ptr + 32, 8, in2, in3);
1281  src1_ptr += src2_stride;
1282 
1283  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1284  vec0, vec1, vec2, vec3);
1285  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1286  filt3);
1287  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1288  vec0, vec1, vec2, vec3);
1289  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1290  filt3);
1291 
1292  HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
1293  weight_vec, rnd_vec, offset_vec,
1294  out0, out1);
1295 
1296  out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1297  ST_SH(out0, dst + 32);
1298  dst += dst_stride;
1299  }
1300 }
1301 
1302 static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1303  int32_t src_stride,
1304  int16_t *src1_ptr,
1305  int32_t src2_stride,
1306  uint8_t *dst,
1307  int32_t dst_stride,
1308  const int8_t *filter,
1309  int32_t height,
1310  int32_t weight0,
1311  int32_t weight1,
1312  int32_t offset0,
1313  int32_t offset1,
1314  int32_t rnd_val)
1315 {
1316  uint8_t *src0_ptr_tmp;
1317  uint8_t *dst_tmp;
1318  int16_t *src1_ptr_tmp;
1319  uint32_t loop_cnt, cnt;
1320  int32_t offset, weight, constant;
1321  v16i8 src0, src1, src2;
1322  v8i16 in0, in1, in2, in3;
1323  v8i16 filt0, filt1, filt2, filt3;
1324  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1325  v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1326  v16i8 vec0, vec1, vec2, vec3;
1327  v8i16 dst0, dst1, dst2, dst3;
1328  v8i16 filter_vec, out0, out1, out2, out3;
1329  v4i32 weight_vec, offset_vec, rnd_vec;
1330 
1331  src0_ptr -= 3;
1332  offset = (offset0 + offset1) << rnd_val;
1333  weight0 = weight0 & 0x0000FFFF;
1334  weight = weight0 | (weight1 << 16);
1335  constant = 128 * weight1;
1336  constant <<= 6;
1337  offset += constant;
1338 
1339  offset_vec = __msa_fill_w(offset);
1340  weight_vec = __msa_fill_w(weight);
1341  rnd_vec = __msa_fill_w(rnd_val + 1);
1342 
1343  filter_vec = LD_SH(filter);
1344  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1345 
1346  mask1 = mask0 + 2;
1347  mask2 = mask0 + 4;
1348  mask3 = mask0 + 6;
1349  mask4 = mask0 + 8;
1350  mask5 = mask0 + 10;
1351  mask6 = mask0 + 12;
1352  mask7 = mask0 + 14;
1353 
1354  for (loop_cnt = height; loop_cnt--;) {
1355  src0_ptr_tmp = src0_ptr;
1356  dst_tmp = dst;
1357  src1_ptr_tmp = src1_ptr;
1358 
1359  for (cnt = 2; cnt--;) {
1360  LD_SB2(src0_ptr_tmp, 16, src0, src1);
1361  src2 = LD_SB(src0_ptr_tmp + 24);
1362  src0_ptr_tmp += 32;
1363  LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1364  src1_ptr_tmp += 32;
1365  XORI_B3_128_SB(src0, src1, src2);
1366 
1367  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1368  vec0, vec1, vec2, vec3);
1369  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1370  filt2, filt3);
1371  VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1372  vec0, vec1, vec2, vec3);
1373  dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1374  filt2, filt3);
1375  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1376  vec0, vec1, vec2, vec3);
1377  dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1378  filt2, filt3);
1379  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1380  vec0, vec1, vec2, vec3);
1381  dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1382  filt2, filt3);
1383 
1384  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1385  in0, in1, in2, in3,
1386  weight_vec, rnd_vec, offset_vec,
1387  out0, out1, out2, out3);
1388 
1389  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1390  ST_SH2(out0, out1, dst_tmp, 16);
1391  dst_tmp += 32;
1392  }
1393 
1394  src0_ptr += src_stride;
1395  src1_ptr += src2_stride;
1396  dst += dst_stride;
1397 
1398  }
1399 }
1400 
1401 static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1402  int32_t src_stride,
1403  int16_t *src1_ptr,
1404  int32_t src2_stride,
1405  uint8_t *dst,
1406  int32_t dst_stride,
1407  const int8_t *filter,
1408  int32_t height,
1409  int32_t weight0,
1410  int32_t weight1,
1411  int32_t offset0,
1412  int32_t offset1,
1413  int32_t rnd_val)
1414 {
1415  uint32_t loop_cnt;
1417  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1418  v16i8 src11, src12, src13, src14;
1419  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1420  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1421  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1422  v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1423  v16i8 src2110, src4332, src6554, src8776, src10998;
1424  v16i8 src12111110, src14131312;
1425  v8i16 dst10, dst32, dst54, dst76;
1426  v8i16 filt0, filt1, filt2, filt3;
1427  v8i16 filter_vec, out0, out1, out2, out3;
1428  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1429 
1430  src0_ptr -= (3 * src_stride);
1431  offset = (offset0 + offset1) << rnd_val;
1432  weight0 = weight0 & 0x0000FFFF;
1433  weight = weight0 | (weight1 << 16);
1434 
1435  const_vec = __msa_ldi_w(128);
1436  const_vec <<= 6;
1437  offset_vec = __msa_fill_w(offset);
1438  weight_vec = __msa_fill_w(weight);
1439  rnd_vec = __msa_fill_w(rnd_val + 1);
1440  weight1_vec = __msa_fill_w(weight1);
1441  offset_vec += const_vec * weight1_vec;
1442 
1443  filter_vec = LD_SH(filter);
1444  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1445 
1446  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1447  src0_ptr += (7 * src_stride);
1448 
1449  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1450  src10_r, src32_r, src54_r, src21_r);
1451  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1452  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1453  src2110, src4332, src6554);
1454  XORI_B3_128_SB(src2110, src4332, src6554);
1455 
1456  for (loop_cnt = (height >> 3); loop_cnt--;) {
1457  LD_SB8(src0_ptr, src_stride,
1458  src7, src8, src9, src10, src11, src12, src13, src14);
1459  src0_ptr += (8 * src_stride);
1460  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1461  src1_ptr += (8 * src2_stride);
1462 
1463  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1464  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1465  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1466  src76_r, src87_r, src98_r, src109_r);
1467  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1468  src1110_r, src1211_r, src1312_r, src1413_r);
1469  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1470  src1413_r, src1312_r,
1471  src8776, src10998, src12111110, src14131312);
1472  XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1473 
1474  DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1475  filt0, dst10, dst32, dst54, dst76);
1476  DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1477  filt1, dst10, dst32, dst54, dst76);
1478  DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1479  filt2, filt2, dst10, dst32, dst54, dst76);
1480  DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1481  filt3, filt3, dst10, dst32, dst54, dst76);
1482 
1483  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1484  in0, in1, in2, in3,
1485  weight_vec, rnd_vec, offset_vec,
1486  out0, out1, out2, out3);
1487 
1488  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1489  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1490  dst += (8 * dst_stride);
1491 
1492  src2110 = src10998;
1493  src4332 = src12111110;
1494  src6554 = src14131312;
1495  src6 = src14;
1496  }
1497 }
1498 
1499 static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1500  int32_t src_stride,
1501  int16_t *src1_ptr,
1502  int32_t src2_stride,
1503  uint8_t *dst,
1504  int32_t dst_stride,
1505  const int8_t *filter,
1506  int32_t height,
1507  int32_t weight0,
1508  int32_t weight1,
1509  int32_t offset0,
1510  int32_t offset1,
1511  int32_t rnd_val)
1512 {
1513  uint32_t loop_cnt;
1515  v16i8 src0, src1, src2, src3, src4, src5;
1516  v16i8 src6, src7, src8, src9, src10;
1517  v8i16 in0, in1, in2, in3;
1518  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1519  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1520  v8i16 tmp0, tmp1, tmp2, tmp3;
1521  v8i16 filt0, filt1, filt2, filt3;
1522  v8i16 filter_vec, out0, out1, out2, out3;
1523  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1524 
1525  src0_ptr -= (3 * src_stride);
1526  offset = (offset0 + offset1) << rnd_val;
1527  weight0 = weight0 & 0x0000FFFF;
1528  weight = weight0 | (weight1 << 16);
1529 
1530  const_vec = __msa_ldi_w(128);
1531  const_vec <<= 6;
1532  offset_vec = __msa_fill_w(offset);
1533  weight_vec = __msa_fill_w(weight);
1534  rnd_vec = __msa_fill_w(rnd_val + 1);
1535  weight1_vec = __msa_fill_w(weight1);
1536  offset_vec += const_vec * weight1_vec;
1537 
1538  filter_vec = LD_SH(filter);
1539  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1540 
1541  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1542  src0_ptr += (7 * src_stride);
1543  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1544 
1545  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1546  src10_r, src32_r, src54_r, src21_r);
1547  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1548 
1549  for (loop_cnt = (height >> 2); loop_cnt--;) {
1550  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1551  src0_ptr += (4 * src_stride);
1552  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1553  src1_ptr += (4 * src2_stride);
1554 
1555  XORI_B4_128_SB(src7, src8, src9, src10);
1556  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1557  src76_r, src87_r, src98_r, src109_r);
1558 
1559  DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1560  filt0, tmp0, tmp1, tmp2, tmp3);
1561  DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1562  filt1, tmp0, tmp1, tmp2, tmp3);
1563  DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1564  filt2, tmp0, tmp1, tmp2, tmp3);
1565  DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1566  filt3, tmp0, tmp1, tmp2, tmp3);
1567 
1568  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1569  in0, in1, in2, in3,
1570  weight_vec, rnd_vec, offset_vec,
1571  out0, out1, out2, out3);
1572 
1573  PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1574  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1575  dst += (4 * dst_stride);
1576 
1577  src10_r = src54_r;
1578  src32_r = src76_r;
1579  src54_r = src98_r;
1580  src21_r = src65_r;
1581  src43_r = src87_r;
1582  src65_r = src109_r;
1583  src6 = src10;
1584  }
1585 }
1586 
1587 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1588  int32_t src_stride,
1589  int16_t *src1_ptr,
1590  int32_t src2_stride,
1591  uint8_t *dst,
1592  int32_t dst_stride,
1593  const int8_t *filter,
1594  int32_t height,
1595  int32_t weight0,
1596  int32_t weight1,
1597  int32_t offset0,
1598  int32_t offset1,
1599  int32_t rnd_val)
1600 {
1601  uint32_t loop_cnt;
1603  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1604  v8i16 in0, in1, in2, in3;
1605  v16i8 src10_r, src32_r, src54_r, src76_r;
1606  v16i8 src21_r, src43_r, src65_r, src87_r;
1607  v8i16 tmp0, tmp1, tmp2;
1608  v16i8 src10_l, src32_l, src54_l, src76_l;
1609  v16i8 src21_l, src43_l, src65_l, src87_l;
1610  v16i8 src2110, src4332, src6554, src8776;
1611  v8i16 filt0, filt1, filt2, filt3;
1612  v8i16 out0, out1, out2, filter_vec;
1613  v4i32 dst2_r, dst2_l;
1614  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1615 
1616  src0_ptr -= (3 * src_stride);
1617  offset = (offset0 + offset1) << rnd_val;
1618  weight0 = weight0 & 0x0000FFFF;
1619  weight = weight0 | (weight1 << 16);
1620 
1621  const_vec = __msa_ldi_w(128);
1622  const_vec <<= 6;
1623  offset_vec = __msa_fill_w(offset);
1624  weight_vec = __msa_fill_w(weight);
1625  rnd_vec = __msa_fill_w(rnd_val + 1);
1626  weight1_vec = __msa_fill_w(weight1);
1627  offset_vec += const_vec * weight1_vec;
1628 
1629  filter_vec = LD_SH(filter);
1630  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1631 
1632  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1633  src0_ptr += (7 * src_stride);
1634  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1635 
1636  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637  src10_r, src32_r, src54_r, src21_r);
1638  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1639  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1640  src10_l, src32_l, src54_l, src21_l);
1641  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643  src2110, src4332, src6554);
1644 
1645  for (loop_cnt = 8; loop_cnt--;) {
1646  LD_SB2(src0_ptr, src_stride, src7, src8);
1647  src0_ptr += (2 * src_stride);
1648  LD_SH2(src1_ptr, src2_stride, in0, in1);
1649  LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650  src1_ptr += (2 * src2_stride);
1651  in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1652  XORI_B2_128_SB(src7, src8);
1653 
1654  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1657 
1658  DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1659  tmp0, tmp1, tmp2);
1660  DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661  tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662  DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663  tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664  DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665  tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1666 
1667  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1668  weight_vec, rnd_vec, offset_vec,
1669  out0, out1);
1670 
1671  ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1672  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673  (v8i16) weight_vec);
1674  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675  (v8i16) weight_vec);
1676  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1677  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1678  CLIP_SH_0_255(out2);
1679  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1680  ST_D2(out0, 0, 1, dst, dst_stride);
1681  ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682  dst += (2 * dst_stride);
1683 
1684  src10_r = src32_r;
1685  src32_r = src54_r;
1686  src54_r = src76_r;
1687  src21_r = src43_r;
1688  src43_r = src65_r;
1689  src65_r = src87_r;
1690  src2110 = src4332;
1691  src4332 = src6554;
1692  src6554 = src8776;
1693  src6 = src8;
1694  }
1695 }
1696 
1697 static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1698  int32_t src_stride,
1699  int16_t *src1_ptr,
1700  int32_t src2_stride,
1701  uint8_t *dst,
1702  int32_t dst_stride,
1703  const int8_t *filter,
1704  int32_t height,
1705  int32_t weight0,
1706  int32_t weight1,
1707  int32_t offset0,
1708  int32_t offset1,
1709  int32_t rnd_val,
1710  int32_t width)
1711 {
1712  uint8_t *src0_ptr_tmp;
1713  int16_t *src1_ptr_tmp;
1714  uint8_t *dst_tmp;
1715  uint32_t loop_cnt, cnt;
1717  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1718  v8i16 in0, in1, in2, in3;
1719  v16i8 src10_r, src32_r, src54_r, src76_r;
1720  v16i8 src21_r, src43_r, src65_r, src87_r;
1721  v16i8 src10_l, src32_l, src54_l, src76_l;
1722  v16i8 src21_l, src43_l, src65_l, src87_l;
1723  v8i16 tmp0, tmp1, tmp2, tmp3;
1724  v8i16 filt0, filt1, filt2, filt3;
1725  v8i16 filter_vec;
1726  v8i16 out0, out1, out2, out3;
1727  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1728 
1729  src0_ptr -= (3 * src_stride);
1730 
1731  offset = (offset0 + offset1) << rnd_val;
1732  weight0 = weight0 & 0x0000FFFF;
1733  weight = weight0 | (weight1 << 16);
1734 
1735  const_vec = __msa_ldi_w(128);
1736  const_vec <<= 6;
1737  offset_vec = __msa_fill_w(offset);
1738  weight_vec = __msa_fill_w(weight);
1739  rnd_vec = __msa_fill_w(rnd_val + 1);
1740  weight1_vec = __msa_fill_w(weight1);
1741  offset_vec += const_vec * weight1_vec;
1742 
1743  filter_vec = LD_SH(filter);
1744  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1745 
1746  for (cnt = (width >> 4); cnt--;) {
1747  src0_ptr_tmp = src0_ptr;
1748  src1_ptr_tmp = src1_ptr;
1749  dst_tmp = dst;
1750 
1751  LD_SB7(src0_ptr_tmp, src_stride,
1752  src0, src1, src2, src3, src4, src5, src6);
1753  src0_ptr_tmp += (7 * src_stride);
1754 
1755  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1756  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757  src10_r, src32_r, src54_r, src21_r);
1758  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1759  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1760  src10_l, src32_l, src54_l, src21_l);
1761  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1762 
1763  for (loop_cnt = (height >> 1); loop_cnt--;) {
1764  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765  src0_ptr_tmp += (2 * src_stride);
1766  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767  LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768  src1_ptr_tmp += (2 * src2_stride);
1769 
1770  XORI_B2_128_SB(src7, src8);
1771  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1773 
1774  DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775  filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776  DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777  filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778  DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779  filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780  DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781  filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1782 
1783  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1784  in0, in1, in2, in3,
1785  weight_vec, rnd_vec, offset_vec,
1786  out0, out1, out2, out3);
1787 
1788  PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1789  ST_SH2(out0, out1, dst_tmp, dst_stride);
1790  dst_tmp += (2 * dst_stride);
1791 
1792  src10_r = src32_r;
1793  src32_r = src54_r;
1794  src54_r = src76_r;
1795  src21_r = src43_r;
1796  src43_r = src65_r;
1797  src65_r = src87_r;
1798  src10_l = src32_l;
1799  src32_l = src54_l;
1800  src54_l = src76_l;
1801  src21_l = src43_l;
1802  src43_l = src65_l;
1803  src65_l = src87_l;
1804  src6 = src8;
1805  }
1806 
1807  src0_ptr += 16;
1808  src1_ptr += 16;
1809  dst += 16;
1810  }
1811 }
1812 
/* Vertical 8-tap bi-weighted filter, 16 columns: one strip of the generic
 * 16-multiple routine. */
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t width = 16;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, width);
}
1833 
/* Vertical 8-tap bi-weighted filter, 24 columns: a 16-column strip from the
 * generic routine followed by the 8-column routine at column 16. */
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strip_width = 16;

    /* Columns 0..15. */
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, strip_width);

    /* Columns 16..23. */
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
                            src1_ptr + 16, src2_stride,
                            dst + 16, dst_stride,
                            filter, height,
                            weight0, weight1,
                            offset0, offset1,
                            rnd_val);
}
1858 
/* Vertical 8-tap bi-weighted filter, 32 columns: forwards to the generic
 * 16-multiple routine with width 32. */
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t width = 32;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, width);
}
1879 
/*
 * 48-column entry point of the vt_biwgt_8t family.
 *
 * Every argument is handed on untouched to
 * hevc_vt_biwgt_8t_16multx2mult_msa(); the only value contributed here is
 * the trailing constant 48 (presumably the block width in pixels - the
 * helper's parameter list is not visible from this function).
 */
static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blk_width = 48;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, blk_width);
}
1900 
/*
 * 64-column entry point of the vt_biwgt_8t family.
 *
 * Every argument is handed on untouched to
 * hevc_vt_biwgt_8t_16multx2mult_msa(); the only value contributed here is
 * the trailing constant 64 (presumably the block width in pixels - the
 * helper's parameter list is not visible from this function).
 */
static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t blk_width = 64;

    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter, height,
                                      weight0, weight1,
                                      offset0, offset1,
                                      rnd_val, blk_width);
}
1921 
1922 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1923  int32_t src_stride,
1924  int16_t *src1_ptr,
1925  int32_t src2_stride,
1926  uint8_t *dst,
1927  int32_t dst_stride,
1928  const int8_t *filter_x,
1929  const int8_t *filter_y,
1930  int32_t height,
1931  int32_t weight0,
1932  int32_t weight1,
1933  int32_t offset0,
1934  int32_t offset1,
1935  int32_t rnd_val)
1936 {
1937  uint32_t loop_cnt;
1938  uint64_t tp0, tp1;
1940  v16u8 out;
1941  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942  v8i16 in0 = { 0 }, in1 = { 0 };
1943  v8i16 filt0, filt1, filt2, filt3;
1944  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945  v16i8 mask1, mask2, mask3;
1946  v8i16 filter_vec, weight_vec;
1947  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950  v8i16 tmp0, tmp1, tmp2, tmp3;
1951  v8i16 dst10, dst32, dst54, dst76;
1952  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1954  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1955 
1956  src0_ptr -= ((3 * src_stride) + 3);
1957 
1958  filter_vec = LD_SH(filter_x);
1959  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1960 
1961  filter_vec = LD_SH(filter_y);
1962  UNPCK_R_SB_SH(filter_vec, filter_vec);
1963 
1964  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1965 
1966  mask1 = mask0 + 2;
1967  mask2 = mask0 + 4;
1968  mask3 = mask0 + 6;
1969 
1970  offset = (offset0 + offset1) << rnd_val;
1971  weight0 = weight0 & 0x0000FFFF;
1972  weight = weight0 | (weight1 << 16);
1973 
1974  const_vec = __msa_fill_w((128 * weight1));
1975  const_vec <<= 6;
1976  offset_vec = __msa_fill_w(offset);
1977  rnd_vec = __msa_fill_w(rnd_val + 1);
1978  offset_vec += const_vec;
1979  weight_vec = (v8i16) __msa_fill_w(weight);
1980 
1981  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1982  src0_ptr += (7 * src_stride);
1983 
1984  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1985 
1986  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1988  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1989  vec8, vec9, vec10, vec11);
1990  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991  vec12, vec13, vec14, vec15);
1992 
1993  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1994  filt3);
1995  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1996  filt3);
1997  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1998  filt3);
1999  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2000  filt3);
2001 
2002  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2003  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2004  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2005 
2006  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2007 
2008  for (loop_cnt = height >> 2; loop_cnt--;) {
2009  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010  src0_ptr += (4 * src_stride);
2011  XORI_B4_128_SB(src7, src8, src9, src10);
2012 
2013  LD2(src1_ptr, src2_stride, tp0, tp1);
2014  INSERT_D2_SH(tp0, tp1, in0);
2015  src1_ptr += (2 * src2_stride);
2016  LD2(src1_ptr, src2_stride, tp0, tp1);
2017  INSERT_D2_SH(tp0, tp1, in1);
2018  src1_ptr += (2 * src2_stride);
2019 
2020  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021  vec0, vec1, vec2, vec3);
2022  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023  vec4, vec5, vec6, vec7);
2024  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2025  filt3);
2026  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2027  filt3);
2028 
2029  dst76 = __msa_ilvr_h(dst97, dst66);
2030  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2031  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032  dst98 = __msa_ilvr_h(dst66, dst108);
2033 
2034  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2035  filt_h2, filt_h3);
2036  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2037  filt_h2, filt_h3);
2038  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2039  filt_h2, filt_h3);
2040  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2041  filt_h2, filt_h3);
2042  SRA_4V(dst0, dst1, dst2, dst3, 6);
2043  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2044  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2045  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2046  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2050  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2051  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2052  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2053  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2055  dst += (4 * dst_stride);
2056 
2057  dst10 = dst54;
2058  dst32 = dst76;
2059  dst54 = dst98;
2060  dst21 = dst65;
2061  dst43 = dst87;
2062  dst65 = dst109;
2063  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2064  }
2065 }
2066 
2067 static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2068  int32_t src_stride,
2069  int16_t *src1_ptr,
2070  int32_t src2_stride,
2071  uint8_t *dst,
2072  int32_t dst_stride,
2073  const int8_t *filter_x,
2074  const int8_t *filter_y,
2075  int32_t height,
2076  int32_t weight0,
2077  int32_t weight1,
2078  int32_t offset0,
2079  int32_t offset1,
2080  int32_t rnd_val,
2081  int32_t width8mult)
2082 {
2083  uint32_t loop_cnt, cnt;
2085  uint8_t *src0_ptr_tmp;
2086  int16_t *src1_ptr_tmp;
2087  uint8_t *dst_tmp;
2088  v16u8 out;
2089  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2090  v8i16 in0, in1;
2091  v8i16 filt0, filt1, filt2, filt3;
2092  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2093  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2094  v16i8 mask1, mask2, mask3;
2095  v8i16 filter_vec, weight_vec;
2096  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100  v8i16 tmp0, tmp1, tmp2, tmp3;
2101  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105  v4i32 offset_vec, rnd_vec, const_vec;
2106 
2107  src0_ptr -= ((3 * src_stride) + 3);
2108 
2109  offset = (offset0 + offset1) << rnd_val;
2110  weight0 = weight0 & 0x0000FFFF;
2111  weight = weight0 | (weight1 << 16);
2112 
2113  const_vec = __msa_fill_w((128 * weight1));
2114  const_vec <<= 6;
2115  offset_vec = __msa_fill_w(offset);
2116  rnd_vec = __msa_fill_w(rnd_val + 1);
2117  offset_vec += const_vec;
2118  weight_vec = (v8i16) __msa_fill_w(weight);
2119 
2120  filter_vec = LD_SH(filter_x);
2121  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2122 
2123  filter_vec = LD_SH(filter_y);
2124  UNPCK_R_SB_SH(filter_vec, filter_vec);
2125 
2126  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2127 
2128  mask1 = mask0 + 2;
2129  mask2 = mask0 + 4;
2130  mask3 = mask0 + 6;
2131 
2132  for (cnt = width8mult; cnt--;) {
2133  src0_ptr_tmp = src0_ptr;
2134  src1_ptr_tmp = src1_ptr;
2135  dst_tmp = dst;
2136 
2137  LD_SB7(src0_ptr_tmp, src_stride,
2138  src0, src1, src2, src3, src4, src5, src6);
2139  src0_ptr_tmp += (7 * src_stride);
2140 
2141  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2142 
2143  /* row 0 row 1 row 2 row 3 */
2144  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2145  vec0, vec1, vec2, vec3);
2146  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2147  vec4, vec5, vec6, vec7);
2148  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149  vec8, vec9, vec10, vec11);
2150  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151  vec12, vec13, vec14, vec15);
2152 
2153  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154  filt3);
2155  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2156  filt3);
2157  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2158  filt3);
2159  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2160  filt2, filt3);
2161 
2162  /* row 4 row 5 row 6 */
2163  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164  vec0, vec1, vec2, vec3);
2165  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166  vec4, vec5, vec6, vec7);
2167  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168  vec8, vec9, vec10, vec11);
2169 
2170  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2171  filt3);
2172  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2173  filt3);
2174  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2175  filt3);
2176 
2177  for (loop_cnt = height >> 1; loop_cnt--;) {
2178  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2179  XORI_B2_128_SB(src7, src8);
2180  src0_ptr_tmp += 2 * src_stride;
2181 
2182  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183  src1_ptr_tmp += (2 * src2_stride);
2184 
2185  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186  dst32_r, dst54_r, dst21_r);
2187  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188  dst32_l, dst54_l, dst21_l);
2189  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2191 
2192  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193  vec0, vec1, vec2, vec3);
2194  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2195  filt2, filt3);
2196 
2197  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2198  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2199  filt_h0, filt_h1, filt_h2, filt_h3);
2200  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2201  filt_h0, filt_h1, filt_h2, filt_h3);
2202 
2203  dst0_r >>= 6;
2204  dst0_l >>= 6;
2205 
2206  /* row 8 */
2207  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208  vec0, vec1, vec2, vec3);
2209  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2210  filt2, filt3);
2211 
2212  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2213  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2214  filt_h0, filt_h1, filt_h2, filt_h3);
2215  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2216  filt_h0, filt_h1, filt_h2, filt_h3);
2217 
2218  dst1_r >>= 6;
2219  dst1_l >>= 6;
2220 
2221  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2222  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2223  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2224  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228  SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2229  CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2230  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2233  dst_tmp += (2 * dst_stride);
2234 
2235  dst0 = dst2;
2236  dst1 = dst3;
2237  dst2 = dst4;
2238  dst3 = dst5;
2239  dst4 = dst6;
2240  dst5 = dst7;
2241  dst6 = dst8;
2242  }
2243 
2244  src0_ptr += 8;
2245  src1_ptr += 8;
2246  dst += 8;
2247  }
2248 }
2249 
/*
 * 8-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 1, so
 * that helper walks exactly one 8-column strip.  All other arguments are
 * forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                    int16_t *src1_ptr, int32_t src2_stride,
                                    uint8_t *dst, int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0, int32_t weight1,
                                    int32_t offset0, int32_t offset1,
                                    int32_t rnd_val)
{
    const int32_t strips_of_8 = 1;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2271 
2272 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2273  int32_t src_stride,
2274  int16_t *src1_ptr,
2275  int32_t src2_stride,
2276  uint8_t *dst,
2277  int32_t dst_stride,
2278  const int8_t *filter_x,
2279  const int8_t *filter_y,
2280  int32_t height,
2281  int32_t weight0,
2282  int32_t weight1,
2283  int32_t offset0,
2284  int32_t offset1,
2285  int32_t rnd_val)
2286 {
2287  uint32_t loop_cnt;
2288  uint8_t *src0_ptr_tmp, *dst_tmp;
2289  int16_t *src1_ptr_tmp;
2291  uint64_t tp0, tp1;
2292  v16u8 out;
2293  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2295  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2296  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2297  v8i16 in0 = { 0 }, in1 = { 0 };
2298  v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2299  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2300  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2301  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2302  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2303  v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2304  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2305  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2306 
2307  src0_ptr -= ((3 * src_stride) + 3);
2308 
2309  offset = (offset0 + offset1) << rnd_val;
2310  weight0 = weight0 & 0x0000FFFF;
2311  weight = weight0 | (weight1 << 16);
2312 
2313  const_vec = __msa_fill_w((128 * weight1));
2314  const_vec <<= 6;
2315  offset_vec = __msa_fill_w(offset);
2316  rnd_vec = __msa_fill_w(rnd_val + 1);
2317  offset_vec += const_vec;
2318  weight_vec = (v8i16) __msa_fill_w(weight);
2319 
2320  filter_vec = LD_SH(filter_x);
2321  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2322 
2323  filter_vec = LD_SH(filter_y);
2324  UNPCK_R_SB_SH(filter_vec, filter_vec);
2325 
2326  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2327 
2328  mask0 = LD_SB(ff_hevc_mask_arr);
2329  mask1 = mask0 + 2;
2330  mask2 = mask0 + 4;
2331  mask3 = mask0 + 6;
2332 
2333  src0_ptr_tmp = src0_ptr;
2334  src1_ptr_tmp = src1_ptr;
2335  dst_tmp = dst;
2336 
2337  LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2338  src0_ptr_tmp += (7 * src_stride);
2339  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2340 
2341  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2342  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2343  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2344  vec11);
2345  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2346  vec15);
2347  dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2348  filt3);
2349  dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2350  filt3);
2351  dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2352  filt3);
2353  dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2354  filt2, filt3);
2355  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2356  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2357  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2358  vec11);
2359  dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2360  filt3);
2361  dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2362  filt3);
2363  dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2364  filt3);
2365 
2366  for (loop_cnt = 8; loop_cnt--;) {
2367  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2368  src0_ptr_tmp += (2 * src_stride);
2369  XORI_B2_128_SB(src7, src8);
2370 
2371  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2372  src1_ptr_tmp += (2 * src2_stride);
2373 
2374  ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2375  dst10_r, dst32_r, dst54_r, dst21_r);
2376  ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2377  dst10_l, dst32_l, dst54_l, dst21_l);
2378  ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2379  ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2380 
2381  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2382  vec3);
2383  dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2384  filt3);
2385 
2386  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2387  dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2388  filt_h1, filt_h2, filt_h3);
2389  dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2390  filt_h1, filt_h2, filt_h3);
2391  dst0 >>= 6;
2392  dst1 >>= 6;
2393 
2394  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2395  vec3);
2396  dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2397  filt3);
2398 
2399  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2400  dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2401  filt_h1, filt_h2, filt_h3);
2402  dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2403  filt_h1, filt_h2, filt_h3);
2404  dst2 >>= 6;
2405  dst3 >>= 6;
2406 
2407  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2408  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2409  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2410  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2411  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2412  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2413  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2414  SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2415  CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2416  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2417  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2418  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2419  dst_tmp += (2 * dst_stride);
2420 
2421  dsth0 = dsth2;
2422  dsth1 = dsth3;
2423  dsth2 = dsth4;
2424  dsth3 = dsth5;
2425  dsth4 = dsth6;
2426  dsth5 = dsth7;
2427  dsth6 = dsth8;
2428  }
2429 
2430  src0_ptr += 8;
2431  src1_ptr += 8;
2432  dst += 8;
2433 
2434  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2435  mask5 = mask4 + 2;
2436  mask6 = mask4 + 4;
2437  mask7 = mask4 + 6;
2438 
2439  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2440  src0_ptr += (7 * src_stride);
2441  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2442 
2443  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2444  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2445  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2446  vec11);
2447  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2448  vec15);
2449  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2450  filt3);
2451  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2452  filt3);
2453  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2454  filt3);
2455  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2456  filt3);
2457  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2458  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2459  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2460 
2461  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2462 
2463  for (loop_cnt = 4; loop_cnt--;) {
2464  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2465  src0_ptr += (4 * src_stride);
2466  XORI_B4_128_SB(src7, src8, src9, src10);
2467 
2468  LD2(src1_ptr, src2_stride, tp0, tp1);
2469  INSERT_D2_SH(tp0, tp1, in0);
2470  src1_ptr += (2 * src2_stride);
2471  LD2(src1_ptr, src2_stride, tp0, tp1);
2472  INSERT_D2_SH(tp0, tp1, in1);
2473  src1_ptr += (2 * src2_stride);
2474 
2475  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2476  vec3);
2477  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2478  vec7);
2479  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2480  filt3);
2481  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2482  filt3);
2483 
2484  dst76 = __msa_ilvr_h(dst97, dst66);
2485  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2486  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2487  dst98 = __msa_ilvr_h(dst66, dst108);
2488 
2489  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2490  filt_h2, filt_h3);
2491  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2492  filt_h2, filt_h3);
2493  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2494  filt_h2, filt_h3);
2495  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2496  filt_h2, filt_h3);
2497  SRA_4V(dst0, dst1, dst2, dst3, 6);
2498  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2499  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2500  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2501  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2502  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2503  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2504  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2505  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2506  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2507  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2508  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2509  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2510  dst += (4 * dst_stride);
2511 
2512  dst10 = dst54;
2513  dst32 = dst76;
2514  dst54 = dst98;
2515  dst21 = dst65;
2516  dst43 = dst87;
2517  dst65 = dst109;
2518  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2519  }
2520 }
2521 
/*
 * 16-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 2, i.e.
 * two 8-column strips.  All other arguments are forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strips_of_8 = 2;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2543 
/*
 * 24-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 3, i.e.
 * three 8-column strips.  All other arguments are forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strips_of_8 = 3;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2565 
/*
 * 32-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 4, i.e.
 * four 8-column strips.  All other arguments are forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strips_of_8 = 4;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2587 
/*
 * 48-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 6, i.e.
 * six 8-column strips.  All other arguments are forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strips_of_8 = 6;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2609 
/*
 * 64-column entry point of the hv_biwgt_8t family.
 *
 * Delegates to hevc_hv_biwgt_8t_8multx2mult_msa() with width8mult = 8, i.e.
 * eight 8-column strips.  All other arguments are forwarded unchanged.
 */
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                     int16_t *src1_ptr, int32_t src2_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0, int32_t weight1,
                                     int32_t offset0, int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t strips_of_8 = 8;

    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, strips_of_8);
}
2631 
2632 static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2633  int32_t src_stride,
2634  int16_t *src1_ptr,
2635  int32_t src2_stride,
2636  uint8_t *dst,
2637  int32_t dst_stride,
2638  const int8_t *filter,
2639  int32_t weight0,
2640  int32_t weight1,
2641  int32_t offset0,
2642  int32_t offset1,
2643  int32_t rnd_val)
2644 {
2645  int32_t offset, weight, constant;
2646  v8i16 filt0, filt1;
2647  v16i8 src0, src1;
2648  v8i16 in0, in1;
2649  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2650  v16i8 mask1, vec0, vec1;
2651  v8i16 dst0;
2652  v4i32 dst0_r, dst0_l;
2653  v8i16 out0, filter_vec;
2654  v4i32 weight_vec, offset_vec, rnd_vec;
2655 
2656  src0_ptr -= 1;
2657 
2658  filter_vec = LD_SH(filter);
2659  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2660 
2661  mask1 = mask0 + 2;
2662 
2663  offset = (offset0 + offset1) << rnd_val;
2664  weight0 = weight0 & 0x0000FFFF;
2665  weight = weight0 | (weight1 << 16);
2666  constant = 128 * weight1;
2667  constant <<= 6;
2668  offset += constant;
2669 
2670  offset_vec = __msa_fill_w(offset);
2671  weight_vec = __msa_fill_w(weight);
2672  rnd_vec = __msa_fill_w(rnd_val + 1);
2673 
2674  LD_SB2(src0_ptr, src_stride, src0, src1);
2675  LD_SH2(src1_ptr, src2_stride, in0, in1);
2676  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2678 
2679  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2680  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2681 
2682  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2683  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2684  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2685  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2686  out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2687  CLIP_SH_0_255(out0);
2688  out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2689  ST_W2(out0, 0, 1, dst, dst_stride);
2690 }
2691 
2692 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2693  int32_t src_stride,
2694  int16_t *src1_ptr,
2695  int32_t src2_stride,
2696  uint8_t *dst,
2697  int32_t dst_stride,
2698  const int8_t *filter,
2699  int32_t weight0,
2700  int32_t weight1,
2701  int32_t offset0,
2702  int32_t offset1,
2703  int32_t rnd_val)
2704 {
2705  int32_t offset, weight, constant;
2706  v8i16 filt0, filt1;
2707  v16i8 src0, src1, src2, src3;
2708  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2709  v16i8 mask1;
2710  v8i16 dst0, dst1;
2711  v16i8 vec0, vec1;
2712  v8i16 in0, in1, in2, in3;
2713  v8i16 filter_vec;
2714  v4i32 weight_vec, offset_vec, rnd_vec;
2715 
2716  src0_ptr -= 1;
2717 
2718  /* rearranging filter */
2719  filter_vec = LD_SH(filter);
2720  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2721 
2722  mask1 = mask0 + 2;
2723 
2724  offset = (offset0 + offset1) << rnd_val;
2725  weight0 = weight0 & 0x0000FFFF;
2726  weight = weight0 | (weight1 << 16);
2727  constant = 128 * weight1;
2728  constant <<= 6;
2729  offset += constant;
2730 
2731  offset_vec = __msa_fill_w(offset);
2732  weight_vec = __msa_fill_w(weight);
2733  rnd_vec = __msa_fill_w(rnd_val + 1);
2734 
2735  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2736  XORI_B4_128_SB(src0, src1, src2, src3);
2737  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2738  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2739 
2740  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2741  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2742  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2743  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2744  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2745  weight_vec, rnd_vec, offset_vec,
2746  dst0, dst1);
2747 
2748  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2749  ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2750 }
2751 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for 4-pixel-wide blocks, eight rows per loop iteration.
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 of the vector
 *                         loaded from here are splatted into filt0 / filt1.
 * height                : only (height >> 3) groups of eight rows are done.
 * weight0 / weight1     : packed into one word, weight0 in the low 16 bits
 *                         and weight1 in the high 16 bits.
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift actually applied is rnd_val + 1.
 */
2752 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2753  int32_t src_stride,
2754  int16_t *src1_ptr,
2755  int32_t src2_stride,
2756  uint8_t *dst,
2757  int32_t dst_stride,
2758  const int8_t *filter,
2759  int32_t height,
2760  int32_t weight0,
2761  int32_t weight1,
2762  int32_t offset0,
2763  int32_t offset1,
2764  int32_t rnd_val)
2765 {
2766  uint32_t loop_cnt;
2767  int32_t weight, offset, constant;
2768  v8i16 filt0, filt1;
2769  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
/* Second 16-byte row of the mask table: its upper half holds indices 16..20,
 * which presumably address the second shuffle operand so that one shuffle
 * covers two 4-pixel rows -- TODO confirm against VSHF_B2_SB. */
2770  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2771  v16i8 mask1;
2772  v16i8 vec0, vec1;
2773  v8i16 dst0, dst1, dst2, dst3;
2774  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2775  v8i16 filter_vec;
2776  v4i32 weight_vec, offset_vec, rnd_vec;
2777 
/* Begin reading one byte to the left of the output column. */
2778  src0_ptr -= 1;
2779 
2780  filter_vec = LD_SH(filter);
2781  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2782 
/* Build the accumulator seed: combined offsets plus (128 * weight1) << 6.
 * The extra term presumably cancels the bias introduced by XOR-ing the
 * source bytes with 128 below -- TODO confirm. */
2783  offset = (offset0 + offset1) << rnd_val;
2784  weight0 = weight0 & 0x0000FFFF;
2785  weight = weight0 | (weight1 << 16);
2786  constant = 128 * weight1;
2787  constant <<= 6;
2788  offset += constant;
2789 
2790  offset_vec = __msa_fill_w(offset);
2791  weight_vec = __msa_fill_w(weight);
2792  rnd_vec = __msa_fill_w(rnd_val + 1);
2793 
/* Same shuffle pattern as mask0 with every index advanced by two. */
2794  mask1 = mask0 + 2;
2795 
2796  for (loop_cnt = (height >> 3); loop_cnt--;) {
2797  LD_SB8(src0_ptr, src_stride,
2798  src0, src1, src2, src3, src4, src5, src6, src7);
2799  src0_ptr += (8 * src_stride);
2800  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2801  src1_ptr += (4 * src2_stride);
2802  LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2803  src1_ptr += (4 * src2_stride);
/* Fold the eight 16-bit rows into four vectors, two rows per vector. */
2804  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2805  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2806  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2807 
/* Each shuffle/filter pair consumes two source rows at once. */
2808  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2809  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2810  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2811  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2812  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2813  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2814  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2815  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
/* Weight, add offset, round-shift and clip; results overwrite dst0..dst3. */
2816  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2817  in0, in1, in2, in3,
2818  weight_vec, rnd_vec, offset_vec,
2819  dst0, dst1, dst2, dst3);
2820 
/* Narrow to bytes and write word lanes 0..3 of both vectors; presumably
 * one 4-byte row per lane, eight rows in total. */
2821  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2822  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2823  dst += (8 * dst_stride);
2824  }
2825 }
2826 
/*
 * Dispatcher for 4-pixel-wide horizontal 4-tap bi-weighted prediction.
 *
 * Forwards to the kernel matching the block height: exactly 2 rows,
 * exactly 4 rows, or any multiple of 8 rows. For every other height the
 * function returns without calling anything.
 */
static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        /* Only whole groups of eight rows are supported beyond this point. */
        if ((height % 8) == 0) {
            hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
2857 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for 6-pixel-wide blocks.
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * height                : NOTE(review): never read. The loop below always
 *                         runs twice over four rows, i.e. eight rows.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
2858 static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2859  int32_t src_stride,
2860  int16_t *src1_ptr,
2861  int32_t src2_stride,
2862  uint8_t *dst,
2863  int32_t dst_stride,
2864  const int8_t *filter,
2865  int32_t height,
2866  int32_t weight0,
2867  int32_t weight1,
2868  int32_t offset0,
2869  int32_t offset1,
2870  int32_t rnd_val)
2871 {
2872  uint32_t loop_cnt;
2873  int32_t offset, weight, constant;
2874  v8i16 filt0, filt1;
2875  v16i8 src0, src1, src2, src3;
2876  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2877  v16i8 mask1;
2878  v16i8 vec0, vec1;
2879  v8i16 in0, in1, in2, in3;
2880  v8i16 dst0, dst1, dst2, dst3;
2881  v8i16 filter_vec;
2882  v4i32 weight_vec, offset_vec, rnd_vec;
2883 
/* Begin reading one byte to the left of the output column. */
2884  src0_ptr -= 1;
2885 
2886  filter_vec = LD_SH(filter);
2887  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2888 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
2889  offset = (offset0 + offset1) << rnd_val;
2890  weight0 = weight0 & 0x0000FFFF;
2891  weight = weight0 | (weight1 << 16);
2892  constant = 128 * weight1;
2893  constant <<= 6;
2894  offset += constant;
2895 
2896  offset_vec = __msa_fill_w(offset);
2897  weight_vec = __msa_fill_w(weight);
2898  rnd_vec = __msa_fill_w(rnd_val + 1);
2899 
2900  mask1 = mask0 + 2;
2901 
/* Fixed trip count: 2 iterations x 4 rows. */
2902  for (loop_cnt = 2; loop_cnt--;) {
2903  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2904  src0_ptr += (4 * src_stride);
2905  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2906  src1_ptr += (4 * src2_stride);
2907  XORI_B4_128_SB(src0, src1, src2, src3);
2908 
/* One source row per shuffle/filter pair. */
2909  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2910  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2911  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2912  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2913  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2914  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2915  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2916  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2917 
2918  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2919  in0, in1, in2, in3,
2920  weight_vec, rnd_vec, offset_vec,
2921  dst0, dst1, dst2, dst3);
2922 
/* After packing, each vector carries two rows. Every row is written as a
 * 4-byte word at dst plus a 2-byte halfword at dst + 4 (6 bytes per row);
 * the lane-to-row mapping is presumably words 0/2 and halfwords 2/6. */
2923  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2924  ST_W2(dst0, 0, 2, dst, dst_stride);
2925  ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2926  ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2927  ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928  dst += (4 * dst_stride);
2929  }
2930 }
2931 
2932 static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2933  int32_t src_stride,
2934  int16_t *src1_ptr,
2935  int32_t src2_stride,
2936  uint8_t *dst,
2937  int32_t dst_stride,
2938  const int8_t *filter,
2939  int32_t weight0,
2940  int32_t weight1,
2941  int32_t offset0,
2942  int32_t offset1,
2943  int32_t rnd_val)
2944 {
2945  int32_t offset, weight, constant;
2946  v8i16 filt0, filt1;
2947  v16i8 src0, src1;
2948  v8i16 in0, in1;
2949  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2950  v16i8 mask1, vec0, vec1;
2951  v8i16 dst0, dst1;
2952  v8i16 filter_vec;
2953  v4i32 weight_vec, offset_vec, rnd_vec;
2954 
2955  src0_ptr -= 1;
2956 
2957  filter_vec = LD_SH(filter);
2958  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2959 
2960  offset = (offset0 + offset1) << rnd_val;
2961  weight0 = weight0 & 0x0000FFFF;
2962  weight = weight0 | (weight1 << 16);
2963  constant = 128 * weight1;
2964  constant <<= 6;
2965  offset += constant;
2966 
2967  offset_vec = __msa_fill_w(offset);
2968  weight_vec = __msa_fill_w(weight);
2969  rnd_vec = __msa_fill_w(rnd_val + 1);
2970 
2971  mask1 = mask0 + 2;
2972 
2973  LD_SB2(src0_ptr, src_stride, src0, src1);
2974  LD_SH2(src1_ptr, src2_stride, in0, in1);
2976  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2977  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2978  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2979  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2981  weight_vec, rnd_vec, offset_vec,
2982  dst0, dst1);
2983 
2984  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2985  ST_D2(dst0, 0, 1, dst, dst_stride);
2986 }
2987 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for a single 8x6 block (six rows, no loop).
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
2988 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2989  int32_t src_stride,
2990  int16_t *src1_ptr,
2991  int32_t src2_stride,
2992  uint8_t *dst,
2993  int32_t dst_stride,
2994  const int8_t *filter,
2995  int32_t weight0,
2996  int32_t weight1,
2997  int32_t offset0,
2998  int32_t offset1,
2999  int32_t rnd_val)
3000 {
3001  int32_t weight, offset, constant;
3002  v8i16 filt0, filt1;
3003  v16i8 src0, src1, src2, src3, src4, src5;
3004  v8i16 in0, in1, in2, in3, in4, in5;
3005  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3006  v16i8 mask1;
3007  v16i8 vec0, vec1;
3008  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3009  v8i16 filter_vec;
3010  v4i32 weight_vec, offset_vec, rnd_vec;
3011 
/* Begin reading one byte to the left of the output column. */
3012  src0_ptr -= 1;
3013 
3014  filter_vec = LD_SH(filter);
3015  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3016 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3017  offset = (offset0 + offset1) << rnd_val;
3018  weight0 = weight0 & 0x0000FFFF;
3019  weight = weight0 | (weight1 << 16);
3020  constant = 128 * weight1;
3021  constant <<= 6;
3022  offset += constant;
3023 
3024  offset_vec = __msa_fill_w(offset);
3025  weight_vec = __msa_fill_w(weight);
3026  rnd_vec = __msa_fill_w(rnd_val + 1);
3027 
3028  mask1 = mask0 + 2;
3029 
3030  LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3031 
/* The six 16-bit rows are fetched as a group of four plus a group of two. */
3032  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3033  src1_ptr += (4 * src2_stride);
3034  LD_SH2(src1_ptr, src2_stride, in4, in5);
3035  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3036  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3037  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3038  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3039  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3040  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3041  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3043  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3045  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3046  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3047  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
/* Rows 0-3 go through the 4-wide helper, rows 4-5 through the 2-wide one. */
3048  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3049  in0, in1, in2, in3,
3050  weight_vec, rnd_vec, offset_vec,
3051  dst0, dst1, dst2, dst3);
3052  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3053  weight_vec, rnd_vec, offset_vec,
3054  dst4, dst5);
3055 
/* Narrow to bytes; the last two rows are stored starting at row 4. */
3056  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3057  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3058  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3059  ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3060 }
3061 
3062 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3063  int32_t src_stride,
3064  int16_t *src1_ptr,
3065  int32_t src2_stride,
3066  uint8_t *dst,
3067  int32_t dst_stride,
3068  const int8_t *filter,
3069  int32_t height,
3070  int32_t weight0,
3071  int32_t weight1,
3072  int32_t offset0,
3073  int32_t offset1,
3074  int32_t rnd_val)
3075 {
3076  uint32_t loop_cnt;
3077  int32_t offset, weight, constant;
3078  v8i16 filt0, filt1;
3079  v16i8 src0, src1, src2, src3;
3080  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3081  v16i8 mask1;
3082  v16i8 vec0, vec1;
3083  v8i16 in0, in1, in2, in3;
3084  v8i16 dst0, dst1, dst2, dst3;
3085  v8i16 filter_vec;
3086  v4i32 weight_vec, offset_vec, rnd_vec;
3087 
3088  src0_ptr -= 1;
3089 
3090  filter_vec = LD_SH(filter);
3091  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3092 
3093  offset = (offset0 + offset1) << rnd_val;
3094  weight0 = weight0 & 0x0000FFFF;
3095  weight = weight0 | (weight1 << 16);
3096  constant = 128 * weight1;
3097  constant <<= 6;
3098  offset += constant;
3099 
3100  offset_vec = __msa_fill_w(offset);
3101  weight_vec = __msa_fill_w(weight);
3102  rnd_vec = __msa_fill_w(rnd_val + 1);
3103 
3104  mask1 = mask0 + 2;
3105 
3106  for (loop_cnt = (height >> 2); loop_cnt--;) {
3107  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3108  src0_ptr += (4 * src_stride);
3109  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110  src1_ptr += (4 * src2_stride);
3111  XORI_B4_128_SB(src0, src1, src2, src3);
3112 
3113  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3114  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3115  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3116  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3117  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3118  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3119  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3120  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3121  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3122  in0, in1, in2, in3,
3123  weight_vec, rnd_vec, offset_vec,
3124  dst0, dst1, dst2, dst3);
3125 
3126  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3127  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128  dst += (4 * dst_stride);
3129  }
3130 }
3131 
/*
 * Dispatcher for 8-pixel-wide horizontal 4-tap bi-weighted prediction.
 *
 * Heights of exactly 2 and exactly 6 rows have dedicated kernels; any
 * height that is a multiple of 4 goes to the looped kernel. Other heights
 * return without calling anything.
 */
static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        /* Only whole groups of four rows are supported beyond this point. */
        if ((height % 4) == 0) {
            hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
3162 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for 12-pixel-wide blocks, handled as an 8-wide part plus a 4-wide part.
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * height                : NOTE(review): never read. The loop below always
 *                         runs four times over four rows, i.e. 16 rows.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3163 static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3164  int32_t src_stride,
3165  int16_t *src1_ptr,
3166  int32_t src2_stride,
3167  uint8_t *dst,
3168  int32_t dst_stride,
3169  const int8_t *filter,
3170  int32_t height,
3171  int32_t weight0,
3172  int32_t weight1,
3173  int32_t offset0,
3174  int32_t offset1,
3175  int32_t rnd_val)
3176 {
3177  uint32_t loop_cnt;
3178  int32_t offset, weight, constant;
3179  v8i16 filt0, filt1;
3180  v16i8 src0, src1, src2, src3;
3181  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3182  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
/* Shuffle pattern for the 4-wide remainder: bytes 8..12 of one operand and
 * 24..28, presumably bytes 8..12 of the other operand, so two rows share
 * one vector -- TODO confirm against VSHF_B2_SB. */
3183  v16i8 mask2 = {
3184  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3185  };
3186  v16i8 mask1, mask3;
3187  v16i8 vec0, vec1;
3188  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3189  v8i16 filter_vec;
3190  v4i32 weight_vec, offset_vec, rnd_vec;
3191 
/* Begin reading one byte to the left of the output column. */
3192  src0_ptr -= 1;
3193 
3194  filter_vec = LD_SH(filter);
3195  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3196 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3197  offset = (offset0 + offset1) << rnd_val;
3198  weight0 = weight0 & 0x0000FFFF;
3199  weight = weight0 | (weight1 << 16);
3200  constant = 128 * weight1;
3201  constant <<= 6;
3202  offset += constant;
3203 
3204  offset_vec = __msa_fill_w(offset);
3205  weight_vec = __msa_fill_w(weight);
3206  rnd_vec = __msa_fill_w(rnd_val + 1);
3207 
3208  mask1 = mask0 + 2;
3209  mask3 = mask2 + 2;
3210 
/* Fixed trip count: 4 iterations x 4 rows. */
3211  for (loop_cnt = 4; loop_cnt--;) {
3212  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3213  src0_ptr += (4 * src_stride);
3214  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* 16-bit samples for columns 8 and up, then packed two rows per vector. */
3215  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3216  src1_ptr += (4 * src2_stride);
3217  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3218  XORI_B4_128_SB(src0, src1, src2, src3);
3219 
/* 8-wide part: one row per shuffle/filter pair. */
3220  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3221  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3222  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3223  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3224  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3225  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3226  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3227  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
/* 4-wide part: two rows per shuffle/filter pair. */
3228  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3229  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3230  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3231  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3232 
3233  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3234  in0, in1, in2, in3,
3235  weight_vec, rnd_vec, offset_vec,
3236  dst0, dst1, dst2, dst3);
3237  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3238  weight_vec, rnd_vec, offset_vec,
3239  dst4, dst5);
3240 
/* Columns 0..7 are stored as 64-bit lanes, columns 8..11 as 32-bit lanes
 * at dst + 8. */
3241  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3242  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3243  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3244  ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3245  dst += (4 * dst_stride);
3246  }
3247 }
3248 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for 16-pixel-wide blocks, four rows per iteration.
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * height                : (height >> 2) groups of four rows are processed.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3249 static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3250  int32_t src_stride,
3251  int16_t *src1_ptr,
3252  int32_t src2_stride,
3253  uint8_t *dst,
3254  int32_t dst_stride,
3255  const int8_t *filter,
3256  int32_t height,
3257  int32_t weight0,
3258  int32_t weight1,
3259  int32_t offset0,
3260  int32_t offset1,
3261  int32_t rnd_val)
3262 {
3263  uint32_t loop_cnt;
3264  int32_t offset, weight, constant;
3265  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3266  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3267  v8i16 filt0, filt1;
3268  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3269  v16i8 mask1;
3270  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3271  v16i8 vec0, vec1;
3272  v8i16 filter_vec;
3273  v4i32 weight_vec, offset_vec, rnd_vec;
3274 
/* Begin reading one byte to the left of the output column. */
3275  src0_ptr -= 1;
3276 
3277  filter_vec = LD_SH(filter);
3278  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3279 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3280  offset = (offset0 + offset1) << rnd_val;
3281  weight0 = weight0 & 0x0000FFFF;
3282  weight = weight0 | (weight1 << 16);
3283  constant = 128 * weight1;
3284  constant <<= 6;
3285  offset += constant;
3286 
3287  offset_vec = __msa_fill_w(offset);
3288  weight_vec = __msa_fill_w(weight);
3289  rnd_vec = __msa_fill_w(rnd_val + 1);
3290 
3291  mask1 = mask0 + 2;
3292 
3293  for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Even-numbered registers are loaded from the row start, odd-numbered ones
 * from 8 bytes / 8 samples further right. */
3294  LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3295  LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3296  src0_ptr += (4 * src_stride);
3297  LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3298  LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3299  src1_ptr += (4 * src2_stride);
3300  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3301 
3302  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3303  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3304  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3305  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3306  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3307  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3308  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3309  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3310  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3311  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3312  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3313  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3314  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3315  dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3316  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3317  dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
/* First two rows: weight/clip in place, pack and store two full vectors. */
3318  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3319  in0, in1, in2, in3,
3320  weight_vec, rnd_vec, offset_vec,
3321  dst0, dst1, dst2, dst3);
3322 
3323  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3324  ST_SH2(dst0, dst1, dst, dst_stride);
3325  dst += (2 * dst_stride);
3326 
/* Last two rows: inputs are dst4..dst7, but the results are written into
 * dst0..dst3, which were already stored above and are free for reuse. */
3327  HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3328  in4, in5, in6, in7,
3329  weight_vec, rnd_vec, offset_vec,
3330  dst0, dst1, dst2, dst3);
3331 
3332  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3333  ST_SH2(dst0, dst1, dst, dst_stride);
3334  dst += (2 * dst_stride);
3335  }
3336 }
3337 
/*
 * Horizontal 4-tap filtering fused with bi-directional weighted prediction
 * for 24-pixel-wide blocks, two rows per iteration, handled as a 16-wide
 * part plus an 8-wide part.
 *
 * src0_ptr / src_stride : 8-bit source rows that are filtered.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * height                : NOTE(review): never read. The loop below always
 *                         runs 16 times over two rows, i.e. 32 rows.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3338 static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3339  int32_t src_stride,
3340  int16_t *src1_ptr,
3341  int32_t src2_stride,
3342  uint8_t *dst,
3343  int32_t dst_stride,
3344  const int8_t *filter,
3345  int32_t height,
3346  int32_t weight0,
3347  int32_t weight1,
3348  int32_t offset0,
3349  int32_t offset1,
3350  int32_t rnd_val)
3351 {
3352  uint32_t loop_cnt;
3353  int32_t offset, weight, constant;
3354  v16i8 src0, src1, src2, src3;
3355  v8i16 filt0, filt1;
3356  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3357  v16i8 mask1, mask2, mask3;
3358  v16i8 vec0, vec1;
3359  v8i16 dst0, dst1, dst2, dst3;
3360  v8i16 in0, in1, in2, in3, in4, in5;
3361  v8i16 filter_vec;
3362  v4i32 weight_vec, offset_vec, rnd_vec;
3363 
/* Begin reading one byte to the left of the output column. */
3364  src0_ptr -= 1;
3365 
3366  filter_vec = LD_SH(filter);
3367  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3368 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3369  offset = (offset0 + offset1) << rnd_val;
3370  weight0 = weight0 & 0x0000FFFF;
3371  weight = weight0 | (weight1 << 16);
3372  constant = 128 * weight1;
3373  constant <<= 6;
3374  offset += constant;
3375 
3376  offset_vec = __msa_fill_w(offset);
3377  weight_vec = __msa_fill_w(weight);
3378  rnd_vec = __msa_fill_w(rnd_val + 1);
3379 
/* mask2/mask3 are mask0/mask1 advanced by 8 bytes; their indices reach 16
 * and above, which presumably pulls bytes from the second shuffle operand
 * so that columns 8..15 can straddle two loaded vectors -- TODO confirm. */
3380  mask1 = mask0 + 2;
3381  mask2 = mask0 + 8;
3382  mask3 = mask0 + 10;
3383 
/* Fixed trip count: 16 iterations x 2 rows. */
3384  for (loop_cnt = 16; loop_cnt--;) {
3385  LD_SB2(src0_ptr, src_stride, src0, src2);
3386  LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3387  src0_ptr += (2 * src_stride);
3388  LD_SH2(src1_ptr, src2_stride, in0, in2);
3389  LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3390  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3391  src1_ptr += (2 * src2_stride);
3392  XORI_B4_128_SB(src0, src1, src2, src3);
3393 
/* 16-wide part: columns 0..7 from one vector, 8..15 from a vector pair. */
3394  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3395  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3396  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3397  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3398  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3399  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3400  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3401  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3402  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3403  in0, in1, in2, in3,
3404  weight_vec, rnd_vec, offset_vec,
3405  dst0, dst1, dst2, dst3);
3406 
3407  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3408  ST_SH2(dst0, dst1, dst, dst_stride);
3409 
3410  /* 8 width */
/* dst0/dst1 are reused here; the 16-wide results were stored just above. */
3411  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3412  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3413  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3414  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3415  HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3416  weight_vec, rnd_vec, offset_vec,
3417  dst0, dst1);
3418 
3419  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3420  ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3421  dst += (2 * dst_stride);
3422  }
3423 }
3424 
3425 static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3426  int32_t src_stride,
3427  int16_t *src1_ptr,
3428  int32_t src2_stride,
3429  uint8_t *dst,
3430  int32_t dst_stride,
3431  const int8_t *filter,
3432  int32_t height,
3433  int32_t weight0,
3434  int32_t weight1,
3435  int32_t offset0,
3436  int32_t offset1,
3437  int32_t rnd_val)
3438 {
3439  uint32_t loop_cnt;
3440  int32_t offset, weight, constant;
3441  v16i8 src0, src1, src2;
3442  v8i16 filt0, filt1;
3443  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3444  v16i8 mask1, mask2, mask3;
3445  v8i16 dst0, dst1, dst2, dst3;
3446  v16i8 vec0, vec1;
3447  v8i16 in0, in1, in2, in3;
3448  v8i16 filter_vec;
3449  v4i32 weight_vec, offset_vec, rnd_vec;
3450 
3451  src0_ptr -= 1;
3452 
3453  filter_vec = LD_SH(filter);
3454  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3455 
3456  offset = (offset0 + offset1) << rnd_val;
3457  weight0 = weight0 & 0x0000FFFF;
3458  weight = weight0 | (weight1 << 16);
3459  constant = 128 * weight1;
3460  constant <<= 6;
3461  offset += constant;
3462 
3463  offset_vec = __msa_fill_w(offset);
3464  weight_vec = __msa_fill_w(weight);
3465  rnd_vec = __msa_fill_w(rnd_val + 1);
3466 
3467  mask1 = mask0 + 2;
3468  mask2 = mask0 + 8;
3469  mask3 = mask0 + 10;
3470 
3471  for (loop_cnt = height; loop_cnt--;) {
3472  LD_SB2(src0_ptr, 16, src0, src1);
3473  src2 = LD_SB(src0_ptr + 24);
3474  src0_ptr += src_stride;
3475  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3476  src1_ptr += src2_stride;
3477  XORI_B3_128_SB(src0, src1, src2);
3478 
3479  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3480  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3482  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3484  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3485  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3486  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3487  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3488  in0, in1, in2, in3,
3489  weight_vec, rnd_vec, offset_vec,
3490  dst0, dst1, dst2, dst3);
3491 
3492  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3493  ST_SH2(dst0, dst1, dst, 16);
3494  dst += dst_stride;
3495  }
3496 }
3497 
/*
 * Vertical 4-tap filtering fused with bi-directional weighted prediction
 * for a single 4x2 block.
 *
 * src0_ptr / src_stride : 8-bit source; five rows are read, starting one
 *                         row above the pointer passed in.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination (two 4-byte rows).
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3498 static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3499  int32_t src_stride,
3500  int16_t *src1_ptr,
3501  int32_t src2_stride,
3502  uint8_t *dst,
3503  int32_t dst_stride,
3504  const int8_t *filter,
3505  int32_t weight0,
3506  int32_t weight1,
3507  int32_t offset0,
3508  int32_t offset1,
3509  int32_t rnd_val)
3510 {
3511  int32_t weight, offset, constant;
3512  v16i8 src0, src1, src2, src3, src4;
3513  v8i16 in0, in1, dst10;
3514  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3515  v4i32 dst10_r, dst10_l;
3516  v8i16 filt0, filt1;
3517  v8i16 filter_vec, out;
3518  v4i32 weight_vec, offset_vec, rnd_vec;
3519 
/* Begin reading one row above the output row. */
3520  src0_ptr -= src_stride;
3521 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3522  offset = (offset0 + offset1) << rnd_val;
3523  weight0 = weight0 & 0x0000FFFF;
3524  weight = weight0 | (weight1 << 16);
3525  constant = 128 * weight1;
3526  constant <<= 6;
3527  offset += constant;
3528 
3529  offset_vec = __msa_fill_w(offset);
3530  weight_vec = __msa_fill_w(weight);
3531  rnd_vec = __msa_fill_w(rnd_val + 1);
3532 
3533  filter_vec = LD_SH(filter);
3534  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3535 
/* Rows 0..2: interleave vertically adjacent rows byte-wise, then combine
 * the (1,0) and (2,1) pairs into a single vector and flip bit 7. */
3536  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3537  src0_ptr += (3 * src_stride);
3538  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3539  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3540  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3541  LD_SB2(src0_ptr, src_stride, src3, src4);
/* NOTE(review): the two pointer advances below are never used afterwards. */
3542  src0_ptr += (2 * src_stride);
3543  LD_SH2(src1_ptr, src2_stride, in0, in1);
3544  src1_ptr += (2 * src2_stride);
3545 
/* Pack both 16-bit rows into in0, and build the (3,2)/(4,3) row pairs. */
3546  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3547  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3548  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3549  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3550 
3551  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3552 
/* Weighting written out by hand for a single vector: interleave filtered
 * and 16-bit samples, dot-product-accumulate with the packed weights on
 * top of the offset, round-shift, pack, clip, narrow to bytes. */
3553  ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3554  dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3555  dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3556  SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3557  out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3558  CLIP_SH_0_255(out);
3559  out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3560  ST_W2(out, 0, 1, dst, dst_stride);
3561 }
3562 
/*
 * Vertical 4-tap filtering fused with bi-directional weighted prediction
 * for a single 4x4 block.
 *
 * src0_ptr / src_stride : 8-bit source; seven rows are read, starting one
 *                         row above the pointer passed in.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination (four 4-byte rows).
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3563 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3564  int32_t src_stride,
3565  int16_t *src1_ptr,
3566  int32_t src2_stride,
3567  uint8_t *dst,
3568  int32_t dst_stride,
3569  const int8_t *filter,
3570  int32_t weight0,
3571  int32_t weight1,
3572  int32_t offset0,
3573  int32_t offset1,
3574  int32_t rnd_val)
3575 {
3576  int32_t weight, offset, constant;
3577  v16i8 src0, src1, src2, src3, src4, src5, src6;
3578  v8i16 in0, in1, in2, in3;
3579  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580  v16i8 src2110, src4332, src6554;
3581  v8i16 dst10, dst32;
3582  v8i16 filt0, filt1;
3583  v8i16 filter_vec;
3584  v4i32 weight_vec, offset_vec, rnd_vec;
3585 
/* Begin reading one row above the output row. */
3586  src0_ptr -= src_stride;
3587 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3588  offset = (offset0 + offset1) << rnd_val;
3589  weight0 = weight0 & 0x0000FFFF;
3590  weight = weight0 | (weight1 << 16);
3591  constant = 128 * weight1;
3592  constant <<= 6;
3593  offset += constant;
3594 
3595  offset_vec = __msa_fill_w(offset);
3596  weight_vec = __msa_fill_w(weight);
3597  rnd_vec = __msa_fill_w(rnd_val + 1);
3598 
3599  filter_vec = LD_SH(filter);
3600  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601 
/* Rows 0..2: interleave vertically adjacent rows, combine the (1,0) and
 * (2,1) pairs into one vector and flip bit 7. */
3602  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3603  src0_ptr += (3 * src_stride);
3604  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3607 
3608  LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609  src0_ptr += (4 * src_stride);
3610  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611  src1_ptr += (4 * src2_stride);
/* Pack the four 16-bit rows into two vectors and build the remaining
 * row-pair vectors (3,2)/(4,3) and (5,4)/(6,5). */
3612  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3613  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614  src32_r, src43_r, src54_r, src65_r);
3615  ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616  XORI_B2_128_SB(src4332, src6554);
3617 
/* Each filter call yields two output rows. */
3618  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3619  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3620 
3621  HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3622  weight_vec, rnd_vec, offset_vec,
3623  dst10, dst32);
3624 
3625  dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626  ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
/* NOTE(review): dst is a by-value parameter and is not used after this. */
3627  dst += (4 * dst_stride);
3628 }
3629 
/*
 * Vertical 4-tap filtering fused with bi-directional weighted prediction
 * for 4-pixel-wide blocks, eight rows per loop iteration.
 *
 * src0_ptr / src_stride : 8-bit source; reading starts one row above the
 *                         pointer passed in.
 * src1_ptr / src2_stride: 16-bit samples blended with the filtered rows.
 * dst / dst_stride      : 8-bit destination.
 * filter                : tap source; 16-bit lanes 0 and 1 are splatted.
 * height                : only (height >> 3) groups of eight rows are done.
 * weight0 / weight1     : packed as (weight1 << 16) | (weight0 & 0xFFFF).
 * offset0 / offset1     : summed, then shifted left by rnd_val.
 * rnd_val               : the rounding shift applied is rnd_val + 1.
 */
3630 static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3631  int32_t src_stride,
3632  int16_t *src1_ptr,
3633  int32_t src2_stride,
3634  uint8_t *dst,
3635  int32_t dst_stride,
3636  const int8_t *filter,
3637  int32_t height,
3638  int32_t weight0,
3639  int32_t weight1,
3640  int32_t offset0,
3641  int32_t offset1,
3642  int32_t rnd_val)
3643 {
3644  uint32_t loop_cnt;
3645  int32_t weight, offset, constant;
3646  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3647  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3648  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3649  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3650  v16i8 src2110, src4332, src6554, src8776;
3651  v8i16 dst10, dst32, dst54, dst76;
3652  v8i16 filt0, filt1;
3653  v8i16 filter_vec;
3654  v4i32 weight_vec, offset_vec, rnd_vec;
3655 
/* Begin reading one row above the output row. */
3656  src0_ptr -= src_stride;
3657 
/* Accumulator seed: combined offsets plus (128 * weight1) << 6, which
 * presumably cancels the XOR-with-128 bias on the source -- TODO confirm. */
3658  offset = (offset0 + offset1) << rnd_val;
3659  weight0 = weight0 & 0x0000FFFF;
3660  weight = weight0 | (weight1 << 16);
3661  constant = 128 * weight1;
3662  constant <<= 6;
3663  offset += constant;
3664 
3665  offset_vec = __msa_fill_w(offset);
3666  weight_vec = __msa_fill_w(weight);
3667  rnd_vec = __msa_fill_w(rnd_val + 1);
3668 
3669  filter_vec = LD_SH(filter);
3670  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3671 
/* Prologue: the first three rows seed src2110, which is carried from one
 * loop iteration to the next together with src2. */
3672  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3673  src0_ptr += (3 * src_stride);
3674  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3675  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3676  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3677 
3678  for (loop_cnt = (height >> 3); loop_cnt--;) {
/* Six of the eight new source rows are fetched here, the other two below. */
3679  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3680  src0_ptr += (6 * src_stride);
3681  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3682  src1_ptr += (8 * src2_stride);
3683 
/* Fold the eight 16-bit rows into four vectors, two rows per vector. */
3684  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3685  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3686 
3687  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3688  src32_r, src43_r, src54_r, src65_r);
3689  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3690  ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3691  src4332, src6554, src8776);
3692  XORI_B3_128_SB(src4332, src6554, src8776);
3693 
3694  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3695  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3696  dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3697 
/* The last two rows are loaded as src9 and src2, so src2 and the rebuilt
 * src2110 already hold what the next iteration expects on entry. */
3698  LD_SB2(src0_ptr, src_stride, src9, src2);
3699  src0_ptr += (2 * src_stride);
3700  ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3701  src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3702  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3703 
3704  dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3705  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3706  in0, in1, in2, in3,
3707  weight_vec, rnd_vec, offset_vec,
3708  dst10, dst32, dst54, dst76);
3709 
/* Narrow to bytes and write word lanes 0..3 of both vectors; presumably
 * one 4-byte row per lane, eight rows in total. */
3710  PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3711  ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3712  dst += (8 * dst_stride);
3713  }
3714 }
3715 
/*
 * Dispatcher for 4-pixel-wide vertical 4-tap bi-weighted prediction.
 *
 * Forwards to the kernel matching the block height: exactly 2 rows,
 * exactly 4 rows, or any multiple of 8 rows. For every other height the
 * function returns without calling anything.
 */
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 4:
        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        /* Only whole groups of eight rows are supported beyond this point. */
        if ((height % 8) == 0) {
            hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                             src1_ptr, src2_stride,
                                             dst, dst_stride, filter, height,
                                             weight0, weight1,
                                             offset0, offset1, rnd_val);
        }
        break;
    }
}
3746 
3747 static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3748  int32_t src_stride,
3749  int16_t *src1_ptr,
3750  int32_t src2_stride,
3751  uint8_t *dst,
3752  int32_t dst_stride,
3753  const int8_t *filter,
3754  int32_t height,
3755  int32_t weight0,
3756  int32_t weight1,
3757  int32_t offset0,
3758  int32_t offset1,
3759  int32_t rnd_val)
3760 {
3761  uint32_t loop_cnt;
3762  int32_t offset, weight, constant;
3763  v16i8 src0, src1, src2, src3, src4;
3764  v8i16 in0, in1, in2, in3;
3765  v16i8 src10_r, src32_r, src21_r, src43_r;
3766  v8i16 tmp0, tmp1, tmp2, tmp3;
3767  v8i16 filt0, filt1;
3768  v8i16 filter_vec;
3769  v4i32 weight_vec, offset_vec, rnd_vec;
3770 
3771  src0_ptr -= src_stride;
3772 
3773  offset = (offset0 + offset1) << rnd_val;
3774  weight0 = weight0 & 0x0000FFFF;
3775  weight = weight0 | (weight1 << 16);
3776  constant = 128 * weight1;
3777  constant <<= 6;
3778  offset += constant;
3779 
3780  offset_vec = __msa_fill_w(offset);
3781  weight_vec = __msa_fill_w(weight);
3782  rnd_vec = __msa_fill_w(rnd_val + 1);
3783 
3784  filter_vec = LD_SH(filter);
3785  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3786 
3787  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3788  src0_ptr += (3 * src_stride);
3789  XORI_B3_128_SB(src0, src1, src2);
3790  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3791 
3792  for (loop_cnt = (height >> 2); loop_cnt--;) {
3793  LD_SB2(src0_ptr, src_stride, src3, src4);
3794  src0_ptr += (2 * src_stride);
3795  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3796  src1_ptr += (4 * src2_stride);
3797  XORI_B2_128_SB(src3, src4);
3798  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3799 
3800  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3801  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3802 
3803  LD_SB2(src0_ptr, src_stride, src1, src2);
3804  src0_ptr += (2 * src_stride);
3805  XORI_B2_128_SB(src1, src2);
3806  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807 
3808  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3809  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3810  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3811  in0, in1, in2, in3,
3812  weight_vec, rnd_vec, offset_vec,
3813  tmp0, tmp1, tmp2, tmp3);
3814 
3815  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3816  ST_W2(tmp0, 0, 2, dst, dst_stride);
3817  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3818  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3819  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3820  dst += (4 * dst_stride);
3821  }
3822 }
3823 
3824 static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3825  int32_t src_stride,
3826  int16_t *src1_ptr,
3827  int32_t src2_stride,
3828  uint8_t *dst,
3829  int32_t dst_stride,
3830  const int8_t *filter,
3831  int32_t weight0,
3832  int32_t weight1,
3833  int32_t offset0,
3834  int32_t offset1,
3835  int32_t rnd_val)
3836 {
3837  int32_t offset, weight, constant;
3838  v16i8 src0, src1, src2, src3, src4;
3839  v8i16 in0, in1, tmp0, tmp1;
3840  v16i8 src10_r, src32_r, src21_r, src43_r;
3841  v8i16 filt0, filt1;
3842  v8i16 filter_vec;
3843  v4i32 weight_vec, offset_vec, rnd_vec;
3844 
3845  src0_ptr -= src_stride;
3846 
3847  offset = (offset0 + offset1) << rnd_val;
3848  weight0 = weight0 & 0x0000FFFF;
3849  weight = weight0 | (weight1 << 16);
3850  constant = 128 * weight1;
3851  constant <<= 6;
3852  offset += constant;
3853 
3854  offset_vec = __msa_fill_w(offset);
3855  weight_vec = __msa_fill_w(weight);
3856  rnd_vec = __msa_fill_w(rnd_val + 1);
3857 
3858  filter_vec = LD_SH(filter);
3859  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3860 
3861  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3862  src0_ptr += (3 * src_stride);
3863  XORI_B3_128_SB(src0, src1, src2);
3864  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3865 
3866  LD_SB2(src0_ptr, src_stride, src3, src4);
3867  LD_SH2(src1_ptr, src2_stride, in0, in1);
3868  XORI_B2_128_SB(src3, src4);
3869  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3870 
3871  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3872  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3873  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3874  weight_vec, rnd_vec, offset_vec,
3875  tmp0, tmp1);
3876 
3877  tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3878  ST_D2(tmp0, 0, 1, dst, dst_stride);
3879 }
3880 
3881 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3882  int32_t src_stride,
3883  int16_t *src1_ptr,
3884  int32_t src2_stride,
3885  uint8_t *dst,
3886  int32_t dst_stride,
3887  const int8_t *filter,
3888  int32_t weight0,
3889  int32_t weight1,
3890  int32_t offset0,
3891  int32_t offset1,
3892  int32_t rnd_val)
3893 {
3894  int32_t offset, weight, constant;
3895  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3896  v8i16 in0, in1, in2, in3, in4, in5;
3897  v16i8 src10_r, src32_r, src54_r, src76_r;
3898  v16i8 src21_r, src43_r, src65_r, src87_r;
3899  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3900  v8i16 filt0, filt1;
3901  v8i16 filter_vec;
3902  v4i32 weight_vec, offset_vec, rnd_vec;
3903 
3904  src0_ptr -= src_stride;
3905 
3906  offset = (offset0 + offset1) << rnd_val;
3907  weight0 = weight0 & 0x0000FFFF;
3908  weight = weight0 | (weight1 << 16);
3909  constant = 128 * weight1;
3910  constant <<= 6;
3911  offset += constant;
3912 
3913  offset_vec = __msa_fill_w(offset);
3914  weight_vec = __msa_fill_w(weight);
3915  rnd_vec = __msa_fill_w(rnd_val + 1);
3916 
3917  filter_vec = LD_SH(filter);
3918  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3919 
3920  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3921  src0_ptr += (3 * src_stride);
3922  XORI_B3_128_SB(src0, src1, src2);
3923  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3924 
3925  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3926  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3927  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3928  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3929  src32_r, src43_r, src54_r, src65_r);
3930  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3931 
3932  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3933  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3934  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3935  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3936  tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3937  tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3938  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3939  in0, in1, in2, in3,
3940  weight_vec, rnd_vec, offset_vec,
3941  tmp0, tmp1, tmp2, tmp3);
3942  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3943  weight_vec, rnd_vec, offset_vec,
3944  tmp4, tmp5);
3945 
3946  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3947  tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3948  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3949  ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
3950 }
3951 
3952 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3953  int32_t src_stride,
3954  int16_t *src1_ptr,
3955  int32_t src2_stride,
3956  uint8_t *dst,
3957  int32_t dst_stride,
3958  const int8_t *filter,
3959  int32_t height,
3960  int32_t weight0,
3961  int32_t weight1,
3962  int32_t offset0,
3963  int32_t offset1,
3964  int32_t rnd_val)
3965 {
3966  uint32_t loop_cnt;
3967  int32_t offset, weight, constant;
3968  v16i8 src0, src1, src2, src3, src4;
3969  v8i16 in0, in1, in2, in3;
3970  v16i8 src10_r, src32_r, src21_r, src43_r;
3971  v8i16 tmp0, tmp1, tmp2, tmp3;
3972  v8i16 filt0, filt1;
3973  v8i16 filter_vec;
3974  v4i32 weight_vec, offset_vec, rnd_vec;
3975 
3976  src0_ptr -= src_stride;
3977 
3978  offset = (offset0 + offset1) << rnd_val;
3979  weight0 = weight0 & 0x0000FFFF;
3980  weight = weight0 | (weight1 << 16);
3981  constant = 128 * weight1;
3982  constant <<= 6;
3983  offset += constant;
3984 
3985  offset_vec = __msa_fill_w(offset);
3986  weight_vec = __msa_fill_w(weight);
3987  rnd_vec = __msa_fill_w(rnd_val + 1);
3988 
3989  filter_vec = LD_SH(filter);
3990  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3991 
3992  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3993  src0_ptr += (3 * src_stride);
3994  XORI_B3_128_SB(src0, src1, src2);
3995  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3996 
3997  for (loop_cnt = (height >> 2); loop_cnt--;) {
3998  LD_SB2(src0_ptr, src_stride, src3, src4);
3999  src0_ptr += (2 * src_stride);
4000  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4001  src1_ptr += (4 * src2_stride);
4002  XORI_B2_128_SB(src3, src4);
4003  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4004 
4005  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4006  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4007 
4008  LD_SB2(src0_ptr, src_stride, src1, src2);
4009  src0_ptr += (2 * src_stride);
4010  XORI_B2_128_SB(src1, src2);
4011  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4012 
4013  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4014  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4015  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4016  in0, in1, in2, in3,
4017  weight_vec, rnd_vec, offset_vec,
4018  tmp0, tmp1, tmp2, tmp3);
4019 
4020  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4021  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4022  dst += (4 * dst_stride);
4023  }
4024 }
4025 
/*
 * Width-8 entry point: forwards the call to the hevc_vt_biwgt_4t_8x*_msa
 * kernel that matches the requested row count.
 *
 * height == 2   -> hevc_vt_biwgt_4t_8x2_msa
 * height == 6   -> hevc_vt_biwgt_4t_8x6_msa
 * anything else -> hevc_vt_biwgt_4t_8x4multiple_msa (receives height)
 *
 * All remaining arguments are passed through unchanged.
 */
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    case 6:
        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride,
                                 src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1,
                                 offset0, offset1, rnd_val);
        break;
    default:
        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1,
                                         offset0, offset1, rnd_val);
        break;
    }
}
4056 
4057 static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
4058  int32_t src_stride,
4059  int16_t *src1_ptr,
4060  int32_t src2_stride,
4061  uint8_t *dst,
4062  int32_t dst_stride,
4063  const int8_t *filter,
4064  int32_t height,
4065  int32_t weight0,
4066  int32_t weight1,
4067  int32_t offset0,
4068  int32_t offset1,
4069  int32_t rnd_val)
4070 {
4071  uint32_t loop_cnt;
4072  int32_t offset, weight, constant;
4073  v16i8 src0, src1, src2, src3, src4, src5;
4074  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4075  v16i8 src10_r, src32_r, src21_r, src43_r;
4076  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4077  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4078  v16i8 src2110, src4332;
4079  v8i16 filt0, filt1;
4080  v8i16 filter_vec;
4081  v4i32 weight_vec, offset_vec, rnd_vec;
4082 
4083  src0_ptr -= (1 * src_stride);
4084 
4085  offset = (offset0 + offset1) << rnd_val;
4086  weight0 = weight0 & 0x0000FFFF;
4087  weight = weight0 | (weight1 << 16);
4088  constant = 128 * weight1;
4089  constant <<= 6;
4090  offset += constant;
4091 
4092  offset_vec = __msa_fill_w(offset);
4093  weight_vec = __msa_fill_w(weight);
4094  rnd_vec = __msa_fill_w(rnd_val + 1);
4095 
4096  filter_vec = LD_SH(filter);
4097  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4098 
4099  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4100  src0_ptr += (3 * src_stride);
4101  XORI_B3_128_SB(src0, src1, src2);
4102  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4103  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4104  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4105 
4106  for (loop_cnt = (height >> 2); loop_cnt--;) {
4107  LD_SB2(src0_ptr, src_stride, src3, src4);
4108  src0_ptr += (2 * src_stride);
4109  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4110  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4111  src1_ptr += (4 * src2_stride);
4112  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4113  XORI_B2_128_SB(src3, src4);
4114 
4115  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4116  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4117  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4118 
4119  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4120  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4121  tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4122 
4123  LD_SB2(src0_ptr, src_stride, src5, src2);
4124  src0_ptr += (2 * src_stride);
4125  XORI_B2_128_SB(src5, src2);
4126  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4127  ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4128  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4129 
4130  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4131  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4132  tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4133  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4134  in0, in1, in2, in3,
4135  weight_vec, rnd_vec, offset_vec,
4136  tmp0, tmp1, tmp2, tmp3);
4137  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4138  weight_vec, rnd_vec, offset_vec,
4139  tmp4, tmp5);
4140 
4141  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4142  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4143  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4144  ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4145  dst += (4 * dst_stride);
4146  }
4147 }
4148 
4149 static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
4150  int32_t src_stride,
4151  int16_t *src1_ptr,
4152  int32_t src2_stride,
4153  uint8_t *dst,
4154  int32_t dst_stride,
4155  const int8_t *filter,
4156  int32_t height,
4157  int32_t weight0,
4158  int32_t weight1,
4159  int32_t offset0,
4160  int32_t offset1,
4161  int32_t rnd_val)
4162 {
4163  uint32_t loop_cnt;
4164  int32_t offset, weight, constant;
4165  v16i8 src0, src1, src2, src3, src4, src5;
4166  v8i16 in0, in1, in2, in3;
4167  v16i8 src10_r, src32_r, src21_r, src43_r;
4168  v16i8 src10_l, src32_l, src21_l, src43_l;
4169  v8i16 tmp0, tmp1, tmp2, tmp3;
4170  v8i16 filt0, filt1;
4171  v8i16 filter_vec;
4172  v4i32 weight_vec, offset_vec, rnd_vec;
4173 
4174  src0_ptr -= src_stride;
4175 
4176  offset = (offset0 + offset1) << rnd_val;
4177  weight0 = weight0 & 0x0000FFFF;
4178  weight = weight0 | (weight1 << 16);
4179  constant = 128 * weight1;
4180  constant <<= 6;
4181  offset += constant;
4182 
4183  offset_vec = __msa_fill_w(offset);
4184  weight_vec = __msa_fill_w(weight);
4185  rnd_vec = __msa_fill_w(rnd_val + 1);
4186 
4187  filter_vec = LD_SH(filter);
4188  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4189 
4190  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4191  src0_ptr += (3 * src_stride);
4192  XORI_B3_128_SB(src0, src1, src2);
4193  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4194  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4195 
4196  for (loop_cnt = (height >> 2); loop_cnt--;) {
4197  LD_SB2(src0_ptr, src_stride, src3, src4);
4198  src0_ptr += (2 * src_stride);
4199  LD_SH2(src1_ptr, src2_stride, in0, in1);
4200  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4201  src1_ptr += (2 * src2_stride);
4202  XORI_B2_128_SB(src3, src4);
4203  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4204  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4205 
4206  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4207  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4208  tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4209  tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4210 
4211  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4212  in0, in1, in2, in3,
4213  weight_vec, rnd_vec, offset_vec,
4214  tmp0, tmp1, tmp2, tmp3);
4215  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4216  ST_SH2(tmp0, tmp1, dst, dst_stride);
4217  dst += (2 * dst_stride);
4218  LD_SB2(src0_ptr, src_stride, src5, src2);
4219  src0_ptr += (2 * src_stride);
4220 
4221  LD_SH2(src1_ptr, src2_stride, in0, in1);
4222  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4223  src1_ptr += (2 * src2_stride);
4224  XORI_B2_128_SB(src5, src2);
4225  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4226  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4227 
4228  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4229  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4230  tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4231  tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4232  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4233  in0, in1, in2, in3,
4234  weight_vec, rnd_vec, offset_vec,
4235  tmp0, tmp1, tmp2, tmp3);
4236 
4237  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4238  ST_SH2(tmp0, tmp1, dst, dst_stride);
4239  dst += (2 * dst_stride);
4240  }
4241 }
4242 
4243 static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4244  int32_t src_stride,
4245  int16_t *src1_ptr,
4246  int32_t src2_stride,
4247  uint8_t *dst,
4248  int32_t dst_stride,
4249  const int8_t *filter,
4250  int32_t height,
4251  int32_t weight0,
4252  int32_t weight1,
4253  int32_t offset0,
4254  int32_t offset1,
4255  int32_t rnd_val)
4256 {
4257  uint32_t loop_cnt;
4258  int32_t offset, weight, constant;
4259  v16i8 src0, src1, src2, src3, src4, src5;
4260  v16i8 src6, src7, src8, src9, src10, src11;
4261  v8i16 in0, in1, in2, in3, in4, in5;
4262  v16i8 src10_r, src32_r, src76_r, src98_r;
4263  v16i8 src10_l, src32_l, src21_l, src43_l;
4264  v16i8 src21_r, src43_r, src87_r, src109_r;
4265  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4266  v8i16 filt0, filt1;
4267  v8i16 filter_vec;
4268  v4i32 weight_vec, offset_vec, rnd_vec;
4269 
4270  src0_ptr -= src_stride;
4271 
4272  offset = (offset0 + offset1) << rnd_val;
4273  weight0 = weight0 & 0x0000FFFF;
4274  weight = weight0 | (weight1 << 16);
4275  constant = 128 * weight1;
4276  constant <<= 6;
4277  offset += constant;
4278 
4279  offset_vec = __msa_fill_w(offset);
4280  weight_vec = __msa_fill_w(weight);
4281  rnd_vec = __msa_fill_w(rnd_val + 1);
4282 
4283  filter_vec = LD_SH(filter);
4284  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4285 
4286  /* 16width */
4287  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4288  XORI_B3_128_SB(src0, src1, src2);
4289  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4290  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4291  /* 8width */
4292  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4293  src0_ptr += (3 * src_stride);
4294  XORI_B3_128_SB(src6, src7, src8);
4295  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4296 
4297  for (loop_cnt = (height >> 2); loop_cnt--;) {
4298  /* 16width */
4299  LD_SB2(src0_ptr, src_stride, src3, src4);
4300  LD_SH2(src1_ptr, src2_stride, in0, in1);
4301  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4302  XORI_B2_128_SB(src3, src4);
4303  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4304  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4305 
4306  /* 8width */
4307  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4308  src0_ptr += (2 * src_stride);
4309  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4310  src1_ptr += (2 * src2_stride);
4311  XORI_B2_128_SB(src9, src10);
4312  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4313  /* 16width */
4314  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4315  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4316  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4317  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4318  /* 8width */
4319  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4320  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4321  /* 16width */
4322  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4323  in0, in1, in2, in3,
4324  weight_vec, rnd_vec, offset_vec,
4325  tmp0, tmp1, tmp4, tmp5);
4326  /* 8width */
4327  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4328  weight_vec, rnd_vec, offset_vec,
4329  tmp2, tmp3);
4330  /* 16width */
4331  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4332  /* 8width */
4333  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4334  ST_SH2(tmp0, tmp1, dst, dst_stride);
4335  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4336  dst += (2 * dst_stride);
4337 
4338  /* 16width */
4339  LD_SB2(src0_ptr, src_stride, src5, src2);
4340  LD_SH2(src1_ptr, src2_stride, in0, in1);
4341  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4342  XORI_B2_128_SB(src5, src2);
4343  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4344  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4345  /* 8width */
4346  LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4347  src0_ptr += (2 * src_stride);
4348  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4349  src1_ptr += (2 * src2_stride);
4350  XORI_B2_128_SB(src11, src8);
4351  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4352  /* 16width */
4353  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4354  tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4355  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4356  tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4357  /* 8width */
4358  tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4359  tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4360  /* 16width */
4361  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4362  in0, in1, in2, in3,
4363  weight_vec, rnd_vec, offset_vec,
4364  tmp0, tmp1, tmp4, tmp5);
4365  /* 8width */
4366  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4367  weight_vec, rnd_vec, offset_vec,
4368  tmp2, tmp3);
4369  /* 16width */
4370  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4371 
4372  /* 8width */
4373  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4374  ST_SH2(tmp0, tmp1, dst, dst_stride);
4375  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4376  dst += (2 * dst_stride);
4377  }
4378 }
4379 
4380 static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4381  int32_t src_stride,
4382  int16_t *src1_ptr,
4383  int32_t src2_stride,
4384  uint8_t *dst,
4385  int32_t dst_stride,
4386  const int8_t *filter,
4387  int32_t height,
4388  int32_t weight0,
4389  int32_t weight1,
4390  int32_t offset0,
4391  int32_t offset1,
4392  int32_t rnd_val)
4393 {
4394  uint32_t loop_cnt;
4395  uint8_t *dst_tmp = dst + 16;
4396  int32_t offset, weight, constant;
4397  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4398  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4399  v16i8 src10_r, src32_r, src76_r, src98_r;
4400  v16i8 src21_r, src43_r, src87_r, src109_r;
4401  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4402  v16i8 src10_l, src32_l, src76_l, src98_l;
4403  v16i8 src21_l, src43_l, src87_l, src109_l;
4404  v8i16 filt0, filt1;
4405  v8i16 filter_vec;
4406  v4i32 weight_vec, offset_vec, rnd_vec;
4407 
4408  src0_ptr -= src_stride;
4409 
4410  offset = (offset0 + offset1) << rnd_val;
4411  weight0 = weight0 & 0x0000FFFF;
4412  weight = weight0 | (weight1 << 16);
4413  constant = 128 * weight1;
4414  constant <<= 6;
4415  offset += constant;
4416 
4417  offset_vec = __msa_fill_w(offset);
4418  weight_vec = __msa_fill_w(weight);
4419  rnd_vec = __msa_fill_w(rnd_val + 1);
4420 
4421  filter_vec = LD_SH(filter);
4422  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4423 
4424  /* 16width */
4425  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4426  XORI_B3_128_SB(src0, src1, src2);
4427  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4428  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4429  /* next 16width */
4430  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4431  src0_ptr += (3 * src_stride);
4432  XORI_B3_128_SB(src6, src7, src8);
4433  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4434  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4435 
4436  for (loop_cnt = (height >> 1); loop_cnt--;) {
4437  /* 16width */
4438  LD_SB2(src0_ptr, src_stride, src3, src4);
4439  LD_SH2(src1_ptr, src2_stride, in0, in1);
4440  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4441  XORI_B2_128_SB(src3, src4);
4442  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4443  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4444 
4445  /* 16width */
4446  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4447  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4448  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4449  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4450  /* 16width */
4451  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4452  in0, in1, in2, in3,
4453  weight_vec, rnd_vec, offset_vec,
4454  tmp0, tmp1, tmp4, tmp5);
4455  /* 16width */
4456  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4457  ST_SH2(tmp0, tmp1, dst, dst_stride);
4458  dst += (2 * dst_stride);
4459 
4460  src10_r = src32_r;
4461  src21_r = src43_r;
4462  src10_l = src32_l;
4463  src21_l = src43_l;
4464  src2 = src4;
4465 
4466  /* next 16width */
4467  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4468  src0_ptr += (2 * src_stride);
4469  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4470  LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4471  src1_ptr += (2 * src2_stride);
4472  XORI_B2_128_SB(src9, src10);
4473  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4474  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4475  /* next 16width */
4476  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4477  tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4478  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4479  tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4480  /* next 16width */
4481  HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4482  in4, in5, in6, in7,
4483  weight_vec, rnd_vec, offset_vec,
4484  tmp2, tmp3, tmp6, tmp7);
4485 
4486  /* next 16width */
4487  PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4488  ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4489  dst_tmp += (2 * dst_stride);
4490 
4491  src76_r = src98_r;
4492  src87_r = src109_r;
4493  src76_l = src98_l;
4494  src87_l = src109_l;
4495  src8 = src10;
4496  }
4497 }
4498 
4499 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4500  int32_t src_stride,
4501  int16_t *src1_ptr,
4502  int32_t src2_stride,
4503  uint8_t *dst,
4504  int32_t dst_stride,
4505  const int8_t *filter_x,
4506  const int8_t *filter_y,
4507  int32_t weight0,
4508  int32_t weight1,
4509  int32_t offset0,
4510  int32_t offset1,
4511  int32_t rnd_val)
4512 {
4513  uint64_t tp0, tp1;
4515  v8i16 in0 = { 0 };
4516  v16u8 out;
4517  v16i8 src0, src1, src2, src3, src4;
4518  v8i16 filt0, filt1;
4519  v8i16 filt_h0, filt_h1;
4520  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4521  v16i8 mask1;
4522  v8i16 filter_vec, tmp, weight_vec;
4523  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4524  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4525  v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4526 
4527  src0_ptr -= (src_stride + 1);
4528 
4529  filter_vec = LD_SH(filter_x);
4530  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4531 
4532  filter_vec = LD_SH(filter_y);
4533  UNPCK_R_SB_SH(filter_vec, filter_vec);
4534 
4535  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4536 
4537  mask1 = mask0 + 2;
4538 
4539  offset = (offset0 + offset1) << rnd_val;
4540  weight0 = weight0 & 0x0000FFFF;
4541  weight = weight0 | (weight1 << 16);
4542 
4543  const_vec = __msa_fill_w((128 * weight1));
4544  const_vec <<= 6;
4545  offset_vec = __msa_fill_w(offset);
4546  weight_vec = (v8i16) __msa_fill_w(weight);
4547  rnd_vec = __msa_fill_w(rnd_val + 1);
4548  offset_vec += const_vec;
4549 
4550  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4551  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4552 
4553  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4554  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4555  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4556 
4557  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4558  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4559  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4560 
4561  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4562  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4563 
4564  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4565  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4566  dst0 >>= 6;
4567  dst1 >>= 6;
4568  dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569 
4570  LD2(src1_ptr, src2_stride, tp0, tp1);
4571  INSERT_D2_SH(tp0, tp1, in0);
4572 
4573  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4574  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4575  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4576  SRAR_W2_SW(dst0, dst1, rnd_vec);
4577  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4578  CLIP_SH_0_255(tmp);
4579  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4580  ST_W2(out, 0, 1, dst, dst_stride);
4581 }
4582 
4583 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4584  int32_t src_stride,
4585  int16_t *src1_ptr,
4586  int32_t src2_stride,
4587  uint8_t *dst,
4588  int32_t dst_stride,
4589  const int8_t *filter_x,
4590  const int8_t *filter_y,
4591  int32_t weight0,
4592  int32_t weight1,
4593  int32_t offset0,
4594  int32_t offset1,
4595  int32_t rnd_val)
4596 {
4597  uint64_t tp0, tp1;
4599  v16u8 out;
4600  v8i16 in0 = { 0 }, in1 = { 0 };
4601  v16i8 src0, src1, src2, src3, src4, src5, src6;
4602  v8i16 filt0, filt1;
4603  v8i16 filt_h0, filt_h1;
4604  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4605  v16i8 mask1;
4606  v8i16 filter_vec, weight_vec;
4607  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4608  v8i16 tmp0, tmp1, tmp2, tmp3;
4609  v8i16 dst30, dst41, dst52, dst63;
4610  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4611  v4i32 offset_vec, rnd_vec, const_vec;
4612  v4i32 dst0, dst1, dst2, dst3;
4613 
4614  src0_ptr -= (src_stride + 1);
4615 
4616  filter_vec = LD_SH(filter_x);
4617  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4618 
4619  filter_vec = LD_SH(filter_y);
4620  UNPCK_R_SB_SH(filter_vec, filter_vec);
4621 
4622  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4623 
4624  mask1 = mask0 + 2;
4625 
4626  offset = (offset0 + offset1) << rnd_val;
4627  weight0 = weight0 & 0x0000FFFF;
4628  weight = weight0 | (weight1 << 16);
4629 
4630  const_vec = __msa_fill_w((128 * weight1));
4631  const_vec <<= 6;
4632  offset_vec = __msa_fill_w(offset);
4633  weight_vec = (v8i16) __msa_fill_w(weight);
4634  rnd_vec = __msa_fill_w(rnd_val + 1);
4635  offset_vec += const_vec;
4636 
4637  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4638  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4639 
4640  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4641  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4642  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4643  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4644 
4645  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4646  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4647  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4648  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4649 
4650  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4651  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4652  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4653  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4654  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4655  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4656  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4657  SRA_4V(dst0, dst1, dst2, dst3, 6);
4658  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4659 
4660  LD2(src1_ptr, src2_stride, tp0, tp1);
4661  INSERT_D2_SH(tp0, tp1, in0);
4662  src1_ptr += (2 * src2_stride);
4663  LD2(src1_ptr, src2_stride, tp0, tp1);
4664  INSERT_D2_SH(tp0, tp1, in1);
4665 
4666  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4667  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4668 
4669  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4670  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4671  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4672  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4673  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4674  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4675  CLIP_SH2_0_255(tmp0, tmp1);
4676  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4677  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4678 }
4679 
4680 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4681  int32_t src_stride,
4682  int16_t *src1_ptr,
4683  int32_t src2_stride,
4684  uint8_t *dst,
4685  int32_t dst_stride,
4686  const int8_t *filter_x,
4687  const int8_t *filter_y,
4688  int32_t height,
4689  int32_t weight0,
4690  int32_t weight1,
4691  int32_t offset0,
4692  int32_t offset1,
4693  int32_t rnd_val)
4694 {
4695  uint32_t loop_cnt;
4696  uint64_t tp0, tp1;
4698  v16u8 out0, out1;
4699  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4700  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4701  v8i16 filt0, filt1;
4702  v8i16 filt_h0, filt_h1;
4703  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4704  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4705  v16i8 mask1;
4706  v8i16 filter_vec, weight_vec;
4707  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4708  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4709  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4710  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4711  v8i16 dst98_r, dst109_r;
4712  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4713  v4i32 offset_vec, rnd_vec, const_vec;
4714 
4715  src0_ptr -= (src_stride + 1);
4716 
4717  filter_vec = LD_SH(filter_x);
4718  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4719 
4720  filter_vec = LD_SH(filter_y);
4721  UNPCK_R_SB_SH(filter_vec, filter_vec);
4722 
4723  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4724 
4725  mask1 = mask0 + 2;
4726 
4727  offset = (offset0 + offset1) << rnd_val;
4728  weight0 = weight0 & 0x0000FFFF;
4729  weight = weight0 | (weight1 << 16);
4730 
4731  const_vec = __msa_fill_w((128 * weight1));
4732  const_vec <<= 6;
4733  offset_vec = __msa_fill_w(offset);
4734  weight_vec = (v8i16) __msa_fill_w(weight);
4735  rnd_vec = __msa_fill_w(rnd_val + 1);
4736  offset_vec += const_vec;
4737 
4738  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4739  src0_ptr += (3 * src_stride);
4740  XORI_B3_128_SB(src0, src1, src2);
4741 
4742  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4743  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4744  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4745  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4746  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4747  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4748 
4749  for (loop_cnt = height >> 3; loop_cnt--;) {
4750  LD_SB8(src0_ptr, src_stride,
4751  src3, src4, src5, src6, src7, src8, src9, src10);
4752  src0_ptr += (8 * src_stride);
4753  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4754  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4755  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4756  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4757  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4758 
4759  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4760  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4761  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4762  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4763 
4764  dst32_r = __msa_ilvr_h(dst73, dst22);
4765  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4766  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4767  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4768  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4769  dst76_r = __msa_ilvr_h(dst22, dst106);
4770 
4771  LD2(src1_ptr, src2_stride, tp0, tp1);
4772  src1_ptr += 2 * src2_stride;
4773  INSERT_D2_SH(tp0, tp1, in0);
4774  LD2(src1_ptr, src2_stride, tp0, tp1);
4775  src1_ptr += 2 * src2_stride;
4776  INSERT_D2_SH(tp0, tp1, in1);
4777 
4778  LD2(src1_ptr, src2_stride, tp0, tp1);
4779  src1_ptr += 2 * src2_stride;
4780  INSERT_D2_SH(tp0, tp1, in2);
4781  LD2(src1_ptr, src2_stride, tp0, tp1);
4782  src1_ptr += 2 * src2_stride;
4783  INSERT_D2_SH(tp0, tp1, in3);
4784 
4785  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4786  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4787  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4788  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4789  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4790  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4791  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4792  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4793  SRA_4V(dst0, dst1, dst2, dst3, 6);
4794  SRA_4V(dst4, dst5, dst6, dst7, 6);
4795  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4796  dst2, dst3);
4797  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4798  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4799  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4800  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4801  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4802  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4803  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4804  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4805  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4806  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4807  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4808  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4809  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4810  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4811  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4812  tmp2, tmp3);
4813  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4814  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4815  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4816  dst += (8 * dst_stride);
4817 
4818  dst10_r = dst98_r;
4819  dst21_r = dst109_r;
4820  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4821  }
4822 }
4823 
/*
 * Height dispatcher for the 4-pixel-wide bi-weighted 4-tap HV kernels.
 *
 * A height of 2 or 4 goes to the dedicated fixed-size kernel; any other
 * height that is a multiple of 8 goes to the looping kernel. Every other
 * height falls through without writing anything.
 */
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    case 4:
        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    default:
        break;
    }

    /* Remaining supported case: heights that are a multiple of eight. */
    if ((height % 8) != 0)
        return;

    hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1,
                                     offset0, offset1, rnd_val);
}
4855 
4856 static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4857  int32_t src_stride,
4858  int16_t *src1_ptr,
4859  int32_t src2_stride,
4860  uint8_t *dst,
4861  int32_t dst_stride,
4862  const int8_t *filter_x,
4863  const int8_t *filter_y,
4864  int32_t height,
4865  int32_t weight0,
4866  int32_t weight1,
4867  int32_t offset0,
4868  int32_t offset1,
4869  int32_t rnd_val)
4870 {
4871  uint32_t tpw0, tpw1, tpw2, tpw3;
4872  uint64_t tp0, tp1;
4874  v16u8 out0, out1, out2;
4875  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877  v8i16 in4 = { 0 }, in5 = { 0 };
4878  v8i16 filt0, filt1;
4879  v8i16 filt_h0, filt_h1, filter_vec;
4880  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4881  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4882  v16i8 mask1;
4883  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892  v4i32 offset_vec, rnd_vec, const_vec;
4893 
4894  src0_ptr -= (src_stride + 1);
4895 
4896  filter_vec = LD_SH(filter_x);
4897  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4898 
4899  filter_vec = LD_SH(filter_y);
4900  UNPCK_R_SB_SH(filter_vec, filter_vec);
4901 
4902  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4903 
4904  mask1 = mask0 + 2;
4905 
4906  offset = (offset0 + offset1) << rnd_val;
4907  weight0 = weight0 & 0x0000FFFF;
4908  weight = weight0 | (weight1 << 16);
4909 
4910  const_vec = __msa_fill_w((128 * weight1));
4911  const_vec <<= 6;
4912  offset_vec = __msa_fill_w(offset);
4913  weight_vec = (v8i16) __msa_fill_w(weight);
4914  rnd_vec = __msa_fill_w(rnd_val + 1);
4915  offset_vec += const_vec;
4916 
4917  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4918  src0_ptr += (3 * src_stride);
4919  XORI_B3_128_SB(src0, src1, src2);
4920 
4921  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4922  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4923  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4924  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4925  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4926  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4927 
4928  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4929  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4930 
4931  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4932  src10);
4933  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4934 
4935  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4939 
4940  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4941  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4943  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4944 
4945  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4949 
4950  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4951  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4952  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4953  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4954 
4955  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4956  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4957  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4958  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4959  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4960  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4961  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4962  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4963  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4966 
4967  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4968  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4969  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4970  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4971  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4972  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4973  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4974  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4975  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4976  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4977  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4978  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4979  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4984 
4985  LD2(src1_ptr, src2_stride, tp0, tp1);
4986  INSERT_D2_SH(tp0, tp1, in0);
4987  LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4988  INSERT_D2_SH(tp0, tp1, in1);
4989 
4990  LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4991  INSERT_D2_SH(tp0, tp1, in2);
4992  LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4993  INSERT_D2_SH(tp0, tp1, in3);
4994 
4995  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4996  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4997  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4998  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4999  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5007  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5008  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5009  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5010  tmp2, tmp3);
5011  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5012  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5013  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5014 
5015  PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5016 
5017  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018  src1_ptr += (4 * src2_stride);
5019  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5020  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5021  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5022 
5023  ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5024  ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5025 
5026  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5030  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5031  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5032 
5033  CLIP_SH2_0_255(tmp4, tmp5);
5034  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5035  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5036 }
5037 
5038 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
5039  int32_t src_stride,
5040  int16_t *src1_ptr,
5041  int32_t src2_stride,
5042  uint8_t *dst,
5043  int32_t dst_stride,
5044  const int8_t *filter_x,
5045  const int8_t *filter_y,
5046  int32_t weight0,
5047  int32_t weight1,
5048  int32_t offset0,
5049  int32_t offset1,
5050  int32_t rnd_val)
5051 {
5053  v16u8 out;
5054  v16i8 src0, src1, src2, src3, src4;
5055  v8i16 filt0, filt1;
5056  v8i16 filt_h0, filt_h1;
5057  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5058  v16i8 mask1;
5059  v8i16 filter_vec, weight_vec;
5060  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5061  v8i16 dst0, dst1, dst2, dst3, dst4;
5062  v8i16 in0, in1;
5063  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5064  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5065  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5066  v8i16 tmp0, tmp1, tmp2, tmp3;
5067  v4i32 offset_vec, rnd_vec, const_vec;
5068 
5069  src0_ptr -= (src_stride + 1);
5070 
5071  filter_vec = LD_SH(filter_x);
5072  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5073 
5074  filter_vec = LD_SH(filter_y);
5075  UNPCK_R_SB_SH(filter_vec, filter_vec);
5076 
5077  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5078 
5079  mask1 = mask0 + 2;
5080 
5081  offset = (offset0 + offset1) << rnd_val;
5082  weight0 = weight0 & 0x0000FFFF;
5083  weight = weight0 | (weight1 << 16);
5084 
5085  const_vec = __msa_fill_w((128 * weight1));
5086  const_vec <<= 6;
5087  offset_vec = __msa_fill_w(offset);
5088  weight_vec = (v8i16) __msa_fill_w(weight);
5089  rnd_vec = __msa_fill_w(rnd_val + 1);
5090  offset_vec += const_vec;
5091 
5092  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5093  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094 
5095  LD_SH2(src1_ptr, src2_stride, in0, in1);
5096 
5097  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5098  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5099  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5100  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5101  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5102 
5103  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5104  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5105  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5106  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5107  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5108 
5109  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5110  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5111  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5112  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5113  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5114  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5115  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5116  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5117  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5118  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5119 
5120  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5121  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5122 
5123  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5124  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5125  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5126  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5127  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5128  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5129  CLIP_SH2_0_255(tmp0, tmp1);
5130  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5131  ST_D2(out, 0, 1, dst, dst_stride);
5132 }
5133 
5134 static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
5135  int32_t src_stride,
5136  int16_t *src1_ptr,
5137  int32_t src2_stride,
5138  uint8_t *dst,
5139  int32_t dst_stride,
5140  const int8_t *filter_x,
5141  const int8_t *filter_y,
5142  int32_t weight0,
5143  int32_t weight1,
5144  int32_t offset0,
5145  int32_t offset1,
5146  int32_t rnd_val,
5147  int32_t width8mult)
5148 {
5150  uint32_t cnt;
5151  v16u8 out0, out1;
5152  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5153  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5154  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5155  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5156  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5157  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5158  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5159  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5160  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5161  v4i32 offset_vec, rnd_vec, const_vec;
5162 
5163  src0_ptr -= (src_stride + 1);
5164 
5165  filter_vec = LD_SH(filter_x);
5166  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5167 
5168  filter_vec = LD_SH(filter_y);
5169  UNPCK_R_SB_SH(filter_vec, filter_vec);
5170 
5171  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5172 
5173  mask0 = LD_SB(ff_hevc_mask_arr);
5174  mask1 = mask0 + 2;
5175 
5176  offset = (offset0 + offset1) << rnd_val;
5177  weight0 = weight0 & 0x0000FFFF;
5178  weight = weight0 | (weight1 << 16);
5179 
5180  const_vec = __msa_fill_w((128 * weight1));
5181  const_vec <<= 6;
5182  offset_vec = __msa_fill_w(offset);
5183  rnd_vec = __msa_fill_w(rnd_val + 1);
5184  offset_vec += const_vec;
5185  weight_vec = (v8i16) __msa_fill_w(weight);
5186 
5187  for (cnt = width8mult; cnt--;) {
5188  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5189  src0_ptr += 8;
5190  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5191 
5192  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5193  src1_ptr += 8;
5194 
5195  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5196  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5197  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5198 
5199  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202 
5203  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5204  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5205 
5206  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5207  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5208  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5209  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5210 
5211  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5212  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5213  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5214  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5215 
5216  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5217  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5218  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5219  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5220 
5221  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5222  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5223  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5224  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5225  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5226  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5227  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5228  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5229 
5230  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5231  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5232  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5233  dst3_r, dst0, dst1, dst2, dst3);
5234 
5235  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5236  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5237  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5238  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5239  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5240  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5241  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5242  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5243  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5244  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5245  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5246  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5247  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5248  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5249  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5250  tmp0, tmp1, tmp2, tmp3);
5251  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5252  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5253  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5254  dst += 8;
5255  }
5256 }
5257 
5258 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
5259  int32_t src_stride,
5260  int16_t *src1_ptr,
5261  int32_t src2_stride,
5262  uint8_t *dst,
5263  int32_t dst_stride,
5264  const int8_t *filter_x,
5265  const int8_t *filter_y,
5266  int32_t weight0,
5267  int32_t weight1,
5268  int32_t offset0,
5269  int32_t offset1,
5270  int32_t rnd_val)
5271 {
5272  uint32_t offset, weight;
5273  v16u8 out0, out1, out2;
5274  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5275  v8i16 filt0, filt1;
5276  v8i16 filt_h0, filt_h1;
5277  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5278  v16i8 mask1;
5279  v8i16 filter_vec, weight_vec;
5280  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5281  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5282  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5283  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5284  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5285  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5286  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5287  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5288  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5289  v8i16 in0, in1, in2, in3, in4, in5;
5290  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5291  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5292  v4i32 offset_vec, rnd_vec, const_vec;
5293 
5294  src0_ptr -= (src_stride + 1);
5295 
5296  filter_vec = LD_SH(filter_x);
5297  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5298 
5299  filter_vec = LD_SH(filter_y);
5300  UNPCK_R_SB_SH(filter_vec, filter_vec);
5301 
5302  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5303 
5304  mask1 = mask0 + 2;
5305 
5306  offset = (offset0 + offset1) << rnd_val;
5307  weight0 = weight0 & 0x0000FFFF;
5308  weight = weight0 | (weight1 << 16);
5309 
5310  const_vec = __msa_fill_w((128 * weight1));
5311  const_vec <<= 6;
5312  offset_vec = __msa_fill_w(offset);
5313  weight_vec = (v8i16) __msa_fill_w(weight);
5314  rnd_vec = __msa_fill_w(rnd_val + 1);
5315  offset_vec += const_vec;
5316 
5317  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5318  src0_ptr += (5 * src_stride);
5319  LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5320 
5321  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5322  XORI_B4_128_SB(src5, src6, src7, src8);
5323 
5324  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5325 
5326  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5327  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5328  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5329  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5330  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5331  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5332  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5333  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5334  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5335 
5336  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5337  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5338  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5339  dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5340  dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5341  dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5342  dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5343  dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5344  dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5345 
5346  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5347  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5348  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5349  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5350  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5351  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5352  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5353  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5354 
5355  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5356  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5357  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5358  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5359  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5360  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5361  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5362  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5363  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5364  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5365  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5366  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5367 
5368  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5369  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5370  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5371  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5372  dst0, dst1, dst2, dst3);
5373 
5374  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5375  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5376  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5377  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5378  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5379  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5380  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5381  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5382  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5383  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5384  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5385  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5386  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5387  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5388  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5389  tmp0, tmp1, tmp2, tmp3);
5390  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5391  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5392 
5393  PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5394  ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5395  ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5396  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5397  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5398  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5399  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5400  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5401  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5402  CLIP_SH2_0_255(tmp4, tmp5);
5403  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5404  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5405  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5406 }
5407 
5408 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5409  int32_t src_stride,
5410  int16_t *src1_ptr,
5411  int32_t src2_stride,
5412  uint8_t *dst,
5413  int32_t dst_stride,
5414  const int8_t *filter_x,
5415  const int8_t *filter_y,
5416  int32_t height,
5417  int32_t weight0,
5418  int32_t weight1,
5419  int32_t offset0,
5420  int32_t offset1,
5421  int32_t rnd_val,
5422  int32_t width)
5423 {
5424  uint32_t loop_cnt;
5425  uint32_t cnt;
5427  uint8_t *src0_ptr_tmp;
5428  int16_t *src1_ptr_tmp;
5429  uint8_t *dst_tmp;
5430  v16u8 out0, out1;
5431  v16i8 src0, src1, src2, src3, src4, src5, src6;
5432  v8i16 in0, in1, in2, in3;
5433  v8i16 filt0, filt1;
5434  v8i16 filt_h0, filt_h1;
5435  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5436  v16i8 mask1;
5437  v8i16 filter_vec;
5438  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5439  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5440  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5441  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5442  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5444  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5445  v4i32 offset_vec, rnd_vec, const_vec;
5446 
5447  src0_ptr -= (src_stride + 1);
5448 
5449  filter_vec = LD_SH(filter_x);
5450  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5451 
5452  filter_vec = LD_SH(filter_y);
5453  UNPCK_R_SB_SH(filter_vec, filter_vec);
5454 
5455  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5456 
5457  mask1 = mask0 + 2;
5458 
5459  offset = (offset0 + offset1) << rnd_val;
5460  weight0 = weight0 & 0x0000FFFF;
5461  weight = weight0 | (weight1 << 16);
5462 
5463  const_vec = __msa_fill_w((128 * weight1));
5464  const_vec <<= 6;
5465  offset_vec = __msa_fill_w(offset);
5466  weight_vec = (v8i16) __msa_fill_w(weight);
5467  rnd_vec = __msa_fill_w(rnd_val + 1);
5468  offset_vec += const_vec;
5469 
5470  for (cnt = width >> 3; cnt--;) {
5471  src0_ptr_tmp = src0_ptr;
5472  src1_ptr_tmp = src1_ptr;
5473  dst_tmp = dst;
5474 
5475  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5476  src0_ptr_tmp += (3 * src_stride);
5477  XORI_B3_128_SB(src0, src1, src2);
5478 
5479  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5480  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5481  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5482  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5483  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5484  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5485 
5486  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5487  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5488 
5489  for (loop_cnt = height >> 2; loop_cnt--;) {
5490  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5491  src0_ptr_tmp += (4 * src_stride);
5492  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5493  src1_ptr_tmp += (4 * src2_stride);
5494  XORI_B4_128_SB(src3, src4, src5, src6);
5495 
5496  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5497  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5498  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5499  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5500 
5501  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5502  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5503  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5504  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5505 
5506  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5507  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5508  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5509  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5510 
5511  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5512  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5513  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5514  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5515  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5516  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5517  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5518  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5519 
5520  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5521  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5522  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5523  dst3_r, dst0, dst1, dst2, dst3);
5524  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5525  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5526  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5527  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5528  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5529  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5530  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5531  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5532  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5533  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5534  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5535  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5536  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5537  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5538  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5539  tmp0, tmp1, tmp2, tmp3);
5540  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5541  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5542  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5543  dst_tmp += (4 * dst_stride);
5544 
5545  dst10_r = dst54_r;
5546  dst10_l = dst54_l;
5547  dst21_r = dst65_r;
5548  dst21_l = dst65_l;
5549  dsth2 = dsth6;
5550  }
5551 
5552  src0_ptr += 8;
5553  dst += 8;
5554  src1_ptr += 8;
5555  }
5556 }
5557 
/*
 * 8-pixel-wide 4-tap HV bi-weighted prediction: picks the specialised kernel
 * for the requested height.
 *
 * Heights 2, 4 and 6 have dedicated routines; any other multiple of 4 goes to
 * the generic 8-column x 4-row tiled kernel. A height that matches none of
 * these is ignored and nothing is written.
 */
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    switch (height) {
    case 2:
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    case 4:
        /* Trailing 1: presumably the number of 8-column strips - the callee
         * is defined elsewhere in this file. */
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
        return;
    case 6:
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
        return;
    default:
        break;
    }

    /* Only whole groups of four rows are supported by the generic kernel. */
    if (height % 4 != 0)
        return;

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0,
                                     weight1, offset0, offset1, rnd_val, 8);
}
5594 
5595 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5596  int32_t src_stride,
5597  int16_t *src1_ptr,
5598  int32_t src2_stride,
5599  uint8_t *dst,
5600  int32_t dst_stride,
5601  const int8_t *filter_x,
5602  const int8_t *filter_y,
5603  int32_t height,
5604  int32_t weight0,
5605  int32_t weight1,
5606  int32_t offset0,
5607  int32_t offset1,
5608  int32_t rnd_val)
5609 {
5610  uint32_t loop_cnt;
5611  uint64_t tp0, tp1;
5613  uint8_t *src0_ptr_tmp, *dst_tmp;
5614  int16_t *src1_ptr_tmp;
5615  v16u8 out0, out1;
5616  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5617  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5618  v16i8 mask0, mask1, mask2, mask3;
5619  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5620  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5621  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5622  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5623  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5624  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5625  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5626  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5627  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5628  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5629  v4i32 offset_vec, rnd_vec, const_vec;
5630 
5631  src0_ptr -= (src_stride + 1);
5632 
5633  filter_vec = LD_SH(filter_x);
5634  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5635 
5636  filter_vec = LD_SH(filter_y);
5637  UNPCK_R_SB_SH(filter_vec, filter_vec);
5638 
5639  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5640 
5641  mask0 = LD_SB(ff_hevc_mask_arr);
5642  mask1 = mask0 + 2;
5643 
5644  offset = (offset0 + offset1) << rnd_val;
5645  weight0 = weight0 & 0x0000FFFF;
5646  weight = weight0 | (weight1 << 16);
5647 
5648  const_vec = __msa_fill_w((128 * weight1));
5649  const_vec <<= 6;
5650  offset_vec = __msa_fill_w(offset);
5651  rnd_vec = __msa_fill_w(rnd_val + 1);
5652  offset_vec += const_vec;
5653  weight_vec = (v8i16) __msa_fill_w(weight);
5654 
5655  src0_ptr_tmp = src0_ptr;
5656  dst_tmp = dst;
5657  src1_ptr_tmp = src1_ptr;
5658 
5659  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5660  src0_ptr_tmp += (3 * src_stride);
5661 
5662  XORI_B3_128_SB(src0, src1, src2);
5663 
5664  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5665  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5666  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5667 
5668  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5669  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5670  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5671 
5672  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5673  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5674 
5675  for (loop_cnt = 4; loop_cnt--;) {
5676  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5677  src0_ptr_tmp += (4 * src_stride);
5678  XORI_B4_128_SB(src3, src4, src5, src6);
5679 
5680  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5681  src1_ptr_tmp += (4 * src2_stride);
5682 
5683  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5684  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5685  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5686  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5687 
5688  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5689  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5690  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5691  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5692 
5693  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5694  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5695  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5696  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5697 
5698  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5699  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5700  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5701  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5702  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5703  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5704  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5705  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5706 
5707  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5708  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5709  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5710  dst3_r, dst0, dst1, dst2, dst3);
5711  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5712  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5713  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5714  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5715  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5716  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5717  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5718  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5719  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5720  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5721  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5722  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5723  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5724  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5725  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5726  tmp0, tmp1, tmp2, tmp3);
5727  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5728  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5729  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5730  dst_tmp += (4 * dst_stride);
5731 
5732  dst10_r = dst54_r;
5733  dst10_l = dst54_l;
5734  dst21_r = dst65_r;
5735  dst21_l = dst65_l;
5736  dsth2 = dsth6;
5737  }
5738 
5739  src0_ptr += 8;
5740  dst += 8;
5741  src1_ptr += 8;
5742 
5743  mask2 = LD_SB(ff_hevc_mask_arr + 16);
5744  mask3 = mask2 + 2;
5745 
5746  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5747  src0_ptr += (3 * src_stride);
5748  XORI_B3_128_SB(src0, src1, src2);
5749  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5750  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5751 
5752  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5753  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5754 
5755  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5756  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5757 
5758  for (loop_cnt = 2; loop_cnt--;) {
5759  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5760  src10);
5761  src0_ptr += (8 * src_stride);
5762  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5763  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5764  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5765  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5766  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5767 
5768  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5769  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5770  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5771  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5772 
5773  dst32_r = __msa_ilvr_h(dst73, dst22);
5774  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5775  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5776  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5777  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5778  dst76_r = __msa_ilvr_h(dst22, dst106);
5779 
5780  LD2(src1_ptr, src2_stride, tp0, tp1);
5781  src1_ptr += 2 * src2_stride;
5782  INSERT_D2_SH(tp0, tp1, in0);
5783  LD2(src1_ptr, src2_stride, tp0, tp1);
5784  src1_ptr += 2 * src2_stride;
5785  INSERT_D2_SH(tp0, tp1, in1);
5786 
5787  LD2(src1_ptr, src2_stride, tp0, tp1);
5788  src1_ptr += 2 * src2_stride;
5789  INSERT_D2_SH(tp0, tp1, in2);
5790  LD2(src1_ptr, src2_stride, tp0, tp1);
5791  src1_ptr += 2 * src2_stride;
5792  INSERT_D2_SH(tp0, tp1, in3);
5793 
5794  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5795  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5796  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5797  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5798  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5799  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5800  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5801  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5802 
5803  SRA_4V(dst0, dst1, dst2, dst3, 6);
5804  SRA_4V(dst4, dst5, dst6, dst7, 6);
5805  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5806  dst0, dst1, dst2, dst3);
5807  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5808  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5809  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5810  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5811  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5812  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5813  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5814  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5815  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5816  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5817  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5818  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5819  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5820  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5821  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5822  tmp0, tmp1, tmp2, tmp3);
5823  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5824  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5825  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5826  dst += (8 * dst_stride);
5827 
5828  dst10_r = dst98_r;
5829  dst21_r = dst109_r;
5830  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5831  }
5832 }
5833 
/*
 * 16-pixel-wide 4-tap HV bi-weighted prediction.
 *
 * A height of exactly 4 uses the single-pass 4-row kernel (trailing argument
 * 2; presumably the count of 8-column strips - the callee is defined
 * elsewhere). Every other height is handed to the generic tiled kernel with
 * a width of 16.
 */
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    if (height != 4) {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                         src2_stride, dst, dst_stride,
                                         filter_x, filter_y, height, weight0,
                                         weight1, offset0, offset1, rnd_val,
                                         16);
        return;
    }

    hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                 src2_stride, dst, dst_stride, filter_x,
                                 filter_y, weight0, weight1, offset0,
                                 offset1, rnd_val, 2);
}
5861 
/*
 * 24-pixel-wide 4-tap HV bi-weighted prediction: forwards every argument
 * unchanged to the generic tiled kernel, adding a block width of 24.
 */
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 24;

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y,
                                     height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, block_width);
}
5883 
/*
 * 32-pixel-wide 4-tap HV bi-weighted prediction: forwards every argument
 * unchanged to the generic tiled kernel, adding a block width of 32.
 */
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    const int32_t block_width = 32;

    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y,
                                     height,
                                     weight0, weight1,
                                     offset0, offset1,
                                     rnd_val, block_width);
}
5905 
5906 #define BI_W_MC_COPY(WIDTH) \
5907 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5908  ptrdiff_t dst_stride, \
5909  uint8_t *src, \
5910  ptrdiff_t src_stride, \
5911  int16_t *src_16bit, \
5912  int height, \
5913  int denom, \
5914  int weight0, \
5915  int weight1, \
5916  int offset0, \
5917  int offset1, \
5918  intptr_t mx, \
5919  intptr_t my, \
5920  int width) \
5921 { \
5922  int shift = 14 + 1 - 8; \
5923  int log2Wd = denom + shift - 1; \
5924  \
5925  hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5926  dst, dst_stride, height, \
5927  weight0, weight1, offset0, \
5928  offset1, log2Wd); \
5929 }
5930 
5931 BI_W_MC_COPY(4);
5932 BI_W_MC_COPY(6);
5933 BI_W_MC_COPY(8);
5934 BI_W_MC_COPY(12);
5935 BI_W_MC_COPY(16);
5936 BI_W_MC_COPY(24);
5937 BI_W_MC_COPY(32);
5938 BI_W_MC_COPY(48);
5939 BI_W_MC_COPY(64);
5940 
5941 #undef BI_W_MC_COPY
5942 
5943 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5944 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5945  ptrdiff_t \
5946  dst_stride, \
5947  uint8_t *src, \
5948  ptrdiff_t \
5949  src_stride, \
5950  int16_t *src_16bit, \
5951  int height, \
5952  int denom, \
5953  int weight0, \
5954  int weight1, \
5955  int offset0, \
5956  int offset1, \
5957  intptr_t mx, \
5958  intptr_t my, \
5959  int width) \
5960 { \
5961  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5962  int log2Wd = denom + 14 - 8; \
5963  \
5964  hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5965  MAX_PB_SIZE, dst, dst_stride, \
5966  filter, height, weight0, \
5967  weight1, offset0, offset1, \
5968  log2Wd); \
5969 }
5970 
5971 BI_W_MC(qpel, h, 4, 8, hz, mx);
5972 BI_W_MC(qpel, h, 8, 8, hz, mx);
5973 BI_W_MC(qpel, h, 12, 8, hz, mx);
5974 BI_W_MC(qpel, h, 16, 8, hz, mx);
5975 BI_W_MC(qpel, h, 24, 8, hz, mx);
5976 BI_W_MC(qpel, h, 32, 8, hz, mx);
5977 BI_W_MC(qpel, h, 48, 8, hz, mx);
5978 BI_W_MC(qpel, h, 64, 8, hz, mx);
5979 
5980 BI_W_MC(qpel, v, 4, 8, vt, my);
5981 BI_W_MC(qpel, v, 8, 8, vt, my);
5982 BI_W_MC(qpel, v, 12, 8, vt, my);
5983 BI_W_MC(qpel, v, 16, 8, vt, my);
5984 BI_W_MC(qpel, v, 24, 8, vt, my);
5985 BI_W_MC(qpel, v, 32, 8, vt, my);
5986 BI_W_MC(qpel, v, 48, 8, vt, my);
5987 BI_W_MC(qpel, v, 64, 8, vt, my);
5988 
5989 BI_W_MC(epel, h, 4, 4, hz, mx);
5990 BI_W_MC(epel, h, 8, 4, hz, mx);
5991 BI_W_MC(epel, h, 6, 4, hz, mx);
5992 BI_W_MC(epel, h, 12, 4, hz, mx);
5993 BI_W_MC(epel, h, 16, 4, hz, mx);
5994 BI_W_MC(epel, h, 24, 4, hz, mx);
5995 BI_W_MC(epel, h, 32, 4, hz, mx);
5996 
5997 BI_W_MC(epel, v, 4, 4, vt, my);
5998 BI_W_MC(epel, v, 8, 4, vt, my);
5999 BI_W_MC(epel, v, 6, 4, vt, my);
6000 BI_W_MC(epel, v, 12, 4, vt, my);
6001 BI_W_MC(epel, v, 16, 4, vt, my);
6002 BI_W_MC(epel, v, 24, 4, vt, my);
6003 BI_W_MC(epel, v, 32, 4, vt, my);
6004 
6005 #undef BI_W_MC
6006 
6007 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6008 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6009  ptrdiff_t dst_stride, \
6010  uint8_t *src, \
6011  ptrdiff_t src_stride, \
6012  int16_t *src_16bit, \
6013  int height, \
6014  int denom, \
6015  int weight0, \
6016  int weight1, \
6017  int offset0, \
6018  int offset1, \
6019  intptr_t mx, \
6020  intptr_t my, \
6021  int width) \
6022 { \
6023  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6024  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6025  int log2Wd = denom + 14 - 8; \
6026  \
6027  hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6028  MAX_PB_SIZE, dst, dst_stride, \
6029  filter_x, filter_y, height, \
6030  weight0, weight1, offset0, \
6031  offset1, log2Wd); \
6032 }
6033 
6034 BI_W_MC_HV(qpel, 4, 8);
6035 BI_W_MC_HV(qpel, 8, 8);
6036 BI_W_MC_HV(qpel, 12, 8);
6037 BI_W_MC_HV(qpel, 16, 8);
6038 BI_W_MC_HV(qpel, 24, 8);
6039 BI_W_MC_HV(qpel, 32, 8);
6040 BI_W_MC_HV(qpel, 48, 8);
6041 BI_W_MC_HV(qpel, 64, 8);
6042 
6043 BI_W_MC_HV(epel, 4, 4);
6044 BI_W_MC_HV(epel, 8, 4);
6045 BI_W_MC_HV(epel, 6, 4);
6046 BI_W_MC_HV(epel, 12, 4);
6047 BI_W_MC_HV(epel, 16, 4);
6048 BI_W_MC_HV(epel, 24, 4);
6049 BI_W_MC_HV(epel, 32, 4);
6050 
6051 #undef BI_W_MC_HV
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
hevc_vt_biwgt_8t_24w_msa
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1834
hevc_hv_biwgt_4t_4multx8mult_msa
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4680
LD_SH2
#define LD_SH2(...)
Definition: generic_macros_msa.h:280
hevc_vt_biwgt_4t_8x6_msa
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3881
hevc_hz_biwgt_4t_4x8multiple_msa
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2752
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
LD_SH4
#define LD_SH4(...)
Definition: generic_macros_msa.h:299
out
FILE * out
Definition: movenc.c:54
hevc_biwgt_copy_32w_msa
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:491
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
ILVL_B4_SH
#define ILVL_B4_SH(...)
Definition: generic_macros_msa.h:1276
hevc_hz_biwgt_8t_4w_msa
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:659
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
INSERT_W4_SH
#define INSERT_W4_SH(...)
Definition: generic_macros_msa.h:1155
HEVC_BIW_RND_CLIP4_MAX_SATU
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:72
hevc_hv_biwgt_4t_8w_msa
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5558
BI_W_MC_HV
#define BI_W_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_biw_msa.c:6007
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
hevc_biwgt_copy_12w_msa
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:313
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 *2]
Definition: hevc_mc_biw_msa.c:25
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
hevc_biwgt_copy_6w_msa
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:171
hevc_hz_biwgt_8t_32w_msa
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1112
SLLI_2V
#define SLLI_2V(in0, in1, shift)
Definition: generic_macros_msa.h:1916
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
filter
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output at least one per link. The additional frames can be left buffered in the filter.
Definition: filter_design.txt:228
hevc_vt_biwgt_8t_32w_msa
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1859
hevc_hz_biwgt_8t_64w_msa
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1302
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
SRAR_W2_SW
#define SRAR_W2_SW(...)
Definition: generic_macros_msa.h:2034
hevc_hz_biwgt_8t_12w_msa
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:811
hevc_biwgt_copy_24w_msa
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:427
hevc_vt_biwgt_8t_8w_msa
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1499
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
hevc_hz_biwgt_8t_24w_msa
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:990
hevc_vt_biwgt_4t_16w_msa
static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4149
hevc_vt_biwgt_4t_4x2_msa
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3498
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
hevc_hv_biwgt_8t_12w_msa
static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2272
hevc_hv_biwgt_8t_4w_msa
static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1922
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
hevc_hz_biwgt_4t_8x2_msa
static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2932
hevc_hz_biwgt_4t_4x2_msa
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2632
hevc_biwgt_copy_64w_msa
static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:604
PCKEV_H4_SW
#define PCKEV_H4_SW(...)
Definition: generic_macros_msa.h:1769
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
hevc_hv_biwgt_8t_8w_msa
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2250
ILVL_W2_SB
#define ILVL_W2_SB(...)
Definition: generic_macros_msa.h:1319
aligned
static int aligned(int val)
Definition: dashdec.c:169
hevc_vt_biwgt_8t_64w_msa
static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1901
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
hevc_hv_biwgt_4t_8multx4mult_msa
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:5408
hevc_hv_biwgt_8t_32w_msa
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2566
width
#define width
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_hv_biwgt_4t_32w_msa
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5884
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
hevc_hv_biwgt_4t_4x4_msa
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4583
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
hevc_hv_biwgt_8t_16w_msa
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2522
INSERT_D2_SB
#define INSERT_D2_SB(...)
Definition: generic_macros_msa.h:1170
hevc_hz_biwgt_8t_48w_msa
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1198
hevc_hz_biwgt_4t_4x4_msa
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2692
hevc_hv_biwgt_4t_4x2_msa
static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4499
hevc_macros_msa.h
hevc_hz_biwgt_4t_8x6_msa
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2988
hevc_vt_biwgt_4t_32w_msa
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4380
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
hevc_hz_biwgt_4t_6w_msa
static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2858
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
INSERT_W4_SB
#define INSERT_W4_SB(...)
Definition: generic_macros_msa.h:1154
hevc_hz_biwgt_4t_8x4multiple_msa
static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3062
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_hz_biwgt_4t_12w_msa
static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3163
hevc_hv_biwgt_8t_48w_msa
static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2588
hevc_hz_biwgt_8t_8w_msa
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:731
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
hevc_vt_biwgt_4t_4x8multiple_msa
static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3630
hevc_vt_biwgt_4t_6w_msa
static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3747
hevc_vt_biwgt_4t_24w_msa
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4243
weight
static int weight(int i, int blen, int offset)
Definition: diracdec.c:1561
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
ILVR_D4_SB
#define ILVR_D4_SB(...)
Definition: generic_macros_msa.h:1460
hevc_vt_biwgt_4t_8w_msa
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4026
CLIP_SH2_0_255
#define CLIP_SH2_0_255(in0, in1)
Definition: generic_macros_msa.h:941
CLIP_SW4_0_255
#define CLIP_SW4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:978
hevcdsp_mips.h
hevc_hz_biwgt_4t_24w_msa
static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3338
SLLI_4V
#define SLLI_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1921
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
BI_W_MC_COPY
#define BI_W_MC_COPY(WIDTH)
Definition: hevc_mc_biw_msa.c:5906
hevc_vt_biwgt_8t_16w_msa
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1813
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
height
#define height
hevc_hz_biwgt_4t_32w_msa
static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3425
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
LW4
#define LW4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:202
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
hevc_hz_biwgt_4t_16w_msa
static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3249
LD_SB6
#define LD_SB6(...)
Definition: generic_macros_msa.h:316
hevc_hv_biwgt_4t_6w_msa
static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4856
src0
#define src0
Definition: h264pred.c:139
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
ILVRL_H2_SW
#define ILVRL_H2_SW(...)
Definition: generic_macros_msa.h:1509
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
XORI_B6_128_SB
#define XORI_B6_128_SB(...)
Definition: generic_macros_msa.h:1866
src1
#define src1
Definition: h264pred.c:140
CLIP_SH4_0_255
#define CLIP_SH4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:947
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
HEVC_BIW_RND_CLIP4
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, wgt, rnd, offset, out0, out1, out2, out3)
Definition: hevc_mc_biw_msa.c:49
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:338
hevc_biwgt_copy_16w_msa
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:372
CLIP_SH_0_255
#define CLIP_SH_0_255(in)
Definition: generic_macros_msa.h:935
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
hevc_vt_biwgt_4t_8x2_msa
static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3824
PCKEV_B3_UB
#define PCKEV_B3_UB(...)
Definition: generic_macros_msa.h:1729
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
hevc_hv_biwgt_4t_4w_msa
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4824
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
hevc_hv_biwgt_4t_12w_msa
static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5595
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
HEVC_BIW_RND_CLIP2
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:31
hevc_hz_biwgt_4t_8w_msa
static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3132
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
hevc_biwgt_copy_4w_msa
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:82
HEVC_BIW_RND_CLIP2_MAX_SATU
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1)
Definition: hevc_mc_biw_msa.c:56
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_hv_biwgt_8t_24w_msa
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2544
BI_W_MC
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_biw_msa.c:5943
hevc_vt_biwgt_4t_4x4_msa
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3563
hevc_vt_biwgt_8t_4w_msa
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1401
hevc_biwgt_copy_48w_msa
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:552
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1445
hevc_vt_biwgt_8t_16multx2mult_msa
static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
Definition: hevc_mc_biw_msa.c:1697
hevc_vt_biwgt_4t_4w_msa
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3716
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
hevc_hz_biwgt_8t_16w_msa
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:906
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
hevc_vt_biwgt_4t_8x4multiple_msa
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:3952
LD_SH6
#define LD_SH6(...)
Definition: generic_macros_msa.h:318
zero
#define zero
Definition: regdef.h:64
hevc_hv_biwgt_4t_24w_msa
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5862
hevc_hv_biwgt_8t_8multx2mult_msa
static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:2067
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
hevc_hv_biwgt_4t_8x2_msa
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5038
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1498
ST_SH
#define ST_SH(...)
Definition: generic_macros_msa.h:43
hevc_vt_biwgt_8t_48w_msa
static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1880
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:55
hevc_hz_biwgt_4t_4w_msa
static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2827
int32_t
int32_t
Definition: audioconvert.c:56
hevc_hv_biwgt_8t_64w_msa
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:2610
h
h
Definition: vp9dsp_template.c:2038
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
hevc_vt_biwgt_4t_12w_msa
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:4057
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
INSERT_D2_SH
#define INSERT_D2_SH(...)
Definition: generic_macros_msa.h:1171
hevc_vt_biwgt_8t_12w_msa
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:1587
SD
#define SD
Definition: ccaption_dec.c:928
hevc_hv_biwgt_4t_8x6_msa
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5258
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
hevc_biwgt_copy_8w_msa
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:225
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
SRAR_W4_SW
#define SRAR_W4_SW(...)
Definition: generic_macros_msa.h:2041
LW2
#define LW2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:210
hevc_hv_biwgt_4t_16w_msa
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
Definition: hevc_mc_biw_msa.c:5834
hevc_hv_biwgt_4t_8multx4_msa
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width8mult)
Definition: hevc_mc_biw_msa.c:5134