FFmpeg
h264qpel_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h264dsp_mips.h"
23 
/*
 * Byte-shuffle index tables for the luma 6-tap filter.
 *
 * The array holds six rows of 16 indices (96 bytes, 64-byte aligned).
 * Callers in this file load three consecutive rows at a time with
 * LD_SB3(&luma_mask_arr[N], 16, mask0, mask1, mask2): N = 0 picks the
 * "8 width" rows, N = 48 picks the "4 width" rows.
 *
 * Within each group the rows list index pairs that share a filter
 * coefficient:
 *   row 0: (i,     i + 5)  - used by AVC_HORZ_FILTER_SH with a plain pairwise add
 *   row 1: (i + 1, i + 4)  - used with the -5 multiplier
 *   row 2: (i + 2, i + 3)  - used with the +20 multiplier
 *
 * In the "4 width" rows the second eight entries are the first eight plus 16.
 * NOTE(review): indices >= 16 presumably select bytes from the other source
 * vector of the shuffle, so one vector can cover two 4-pixel rows - confirm
 * against the VSHF.B operand order.
 */
24 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
25  /* 8 width cases */
26  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
29 
30  /* 4 width cases */
31  0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32  1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33  2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
34 };
35 
/*
 * Accumulate a 6-input filter over six byte vectors into two result
 * vectors, out1 and out2.
 *
 * The inputs are combined in three symmetric pairs:
 *   (vec0, vec5) - interleaved, then summed by HADD_SB2_SH (weight 1)
 *   (vec1, vec4) - interleaved, then multiply-accumulated with -5
 *   (vec2, vec3) - interleaved, then multiply-accumulated with +20
 *
 * out1 and out2 are overwritten by the first step and accumulated into by
 * the next two. No rounding, shifting or saturation is applied here.
 *
 * NOTE(review): ILVRL_B2_SB, HADD_SB2_SH and DPADD_SB2_SH are defined outside
 * this file; the description above assumes they interleave the right/left
 * halves and widen signed bytes to 16-bit sums - confirm.
 */
36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
37  out1, out2) \
38 { \
39  v16i8 tmp0_m, tmp1_m; \
40  v16i8 minus5b_m = __msa_ldi_b(-5); \
41  v16i8 plus20b_m = __msa_ldi_b(20); \
42  \
43  ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
44  HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
45  ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
46  DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
47  ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
48  DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
49 }
50 
/*
 * Horizontal 6-tap filter on signed bytes, yielding a v8i16 value
 * (GNU statement expression).
 *
 * in0, in1            : source byte vectors handed to the byte shuffle.
 * mask0, mask1, mask2 : shuffle index vectors; callers in this file load
 *                       them from three consecutive rows of luma_mask_arr.
 *
 * Three shuffles gather byte pairs from the inputs:
 *   mask0 pairs are added together (weight 1),
 *   mask1 pairs are multiply-accumulated with -5,
 *   mask2 pairs are multiply-accumulated with +20.
 *
 * The result is the raw 16-bit sum: rounding, shifting and saturation are
 * left to the caller. Each argument is expanded more than once, so pass
 * side-effect-free expressions.
 */
51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
52 ( { \
53  v8i16 out0_m; \
54  v16i8 tmp0_m; \
55  v16i8 minus5b = __msa_ldi_b(-5); \
56  v16i8 plus20b = __msa_ldi_b(20); \
57  \
58  tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
59  out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
60  \
61  tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
62  out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
63  \
64  tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
65  out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
66  \
67  out0_m; \
68 } )
69 
/*
 * Three-term signed dot product from bytes to halfwords, yielding a v8i16
 * value (GNU statement expression):
 *
 *   dotp(in0, coeff0) + dotp(in1, coeff1) + dotp(in2, coeff2)
 *
 * All six arguments are reinterpreted as v16i8. The first term initialises
 * the accumulator and the other two are added with __msa_dpadd_s_h.
 * No rounding or saturation is applied.
 */
70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
71 ( { \
72  v8i16 out0_m; \
73  \
74  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
75  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
76  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
77  \
78  out0_m; \
79 } )
80 
/*
 * Three-term signed dot product from halfwords to words, yielding a v4i32
 * value (GNU statement expression).
 *
 * All six arguments are reinterpreted as v8i16. After the three terms are
 * accumulated, the sum is passed through __msa_srari_w with a shift of 10
 * and then through __msa_sat_s_w with an argument of 7, so unlike
 * AVC_DOT_SH3_SH the value returned here is already rounded and saturated.
 */
81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
82 ( { \
83  v4i32 out0_m; \
84  \
85  out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
86  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
87  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
88  out0_m = __msa_srari_w(out0_m, 10); \
89  out0_m = __msa_sat_s_w(out0_m, 7); \
90  out0_m; \
91 } )
92 
/*
 * Write a 4x4 block that is the rounded average of a horizontally filtered
 * block and a vertically filtered block.
 *
 * src_x  : first of the 4 rows fed to the horizontal filter.
 * src_y  : first of the 9 rows (5 + 4) fed to the vertical filter.
 * dst    : destination; four rows are written, each `stride` bytes apart.
 * stride : row pitch in bytes, used for src_x, src_y and dst alike.
 *
 * Both filter results are rounded with a shift of 5 and saturated before
 * being summed and halved with rounding.
 *
 * NOTE(review): any offset needed to centre the 6-tap windows must already
 * be applied to src_x and src_y by the caller; none is applied here.
 */
93 static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
94  uint8_t *dst, int32_t stride)
95 {
    /*
     * Vertical-filter coefficients packed two bytes per halfword:
     * 0xfb01 -> bytes 0x01 and 0xfb (1 and -5), 0x1414 -> 20 and 20,
     * 0x01fb -> bytes 0xfb and 0x01 (-5 and 1).
     * NOTE(review): 0xfb01 (64257) does not fit in int16_t, so this
     * initialiser relies on implementation-defined narrowing to keep the
     * bit pattern.
     */
96  const int16_t filt_const0 = 0xfb01;
97  const int16_t filt_const1 = 0x1414;
98  const int16_t filt_const2 = 0x1fb;
99  v16u8 out;
100  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102  v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103  v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104  v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
105 
    /* Replicate each packed coefficient pair across a whole vector. */
106  filt0 = (v16i8) __msa_fill_h(filt_const0);
107  filt1 = (v16i8) __msa_fill_h(filt_const1);
108  filt2 = (v16i8) __msa_fill_h(filt_const2);
109 
    /* Offset 48 selects the "4 width" shuffle rows of luma_mask_arr. */
110  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
111 
    /* First five rows of the vertical-filter input. */
112  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
113  src_y += (5 * stride);
114 
    /*
     * Put word 0 of the following row into word 1 of each row, so every
     * vector carries the 4 pixels of two consecutive rows.
     */
115  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
119 
    /* NOTE(review): presumably flips the sign bit so pixels can be treated
     * as signed bytes; PCKEV_XORI128_UB at the end would undo it - confirm. */
120  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
121 
    /* Horizontal pass: two source rows are filtered per call. */
122  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
123  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
124  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
125  hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
126 
    /* Round with a shift of 5, then saturate. */
127  SRARI_H2_SH(hz_out0, hz_out1, 5);
128  SAT_SH2_SH(hz_out0, hz_out1, 7);
129 
    /* Remaining four rows of the vertical-filter input (rows 5..8). */
130  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
131 
132  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
136 
137  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    /* Interleave neighbouring rows so each dot product consumes a row pair
     * together with one packed coefficient pair. */
138  ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139  ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
    /* Vertical pass: three row pairs (six taps) per output vector. */
140  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
141  filt2);
142  vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
143  filt2);
144  SRARI_H2_SH(vt_out0, vt_out1, 5);
145  SAT_SH2_SH(vt_out0, vt_out1, 7);
146 
    /* Rounded average of the two passes: (hz + vt + 1) >> 1. */
147  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
149 
150  SAT_SH2_SH(out0, out1, 7);
151  out = PCKEV_XORI128_UB(out0, out1);
    /* Store word elements 0..3 of `out`, one per destination row. */
152  ST_W4(out, 0, 1, 2, 3, dst, stride);
153 }
154 
/*
 * Write an 8x8 block that is the rounded average of a horizontally filtered
 * block and a vertically filtered block.
 *
 * src_x  : first of the 8 rows fed to the horizontal filter.
 * src_y  : first of the 13 rows (5 + 4 + 4) fed to the vertical filter.
 * dst    : destination; eight rows are written, each `stride` bytes apart.
 * stride : row pitch in bytes, used for src_x, src_y and dst alike.
 *
 * The block is produced in two batches of four rows. Row pairs interleaved
 * for the first batch (src_vt54_r .. src_vt87_r) are reused by the second.
 *
 * NOTE(review): any offset needed to centre the 6-tap windows must already
 * be applied to src_x and src_y by the caller; none is applied here.
 */
155 static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
156  uint8_t *dst, int32_t stride)
157 {
    /*
     * Vertical-filter coefficients packed two bytes per halfword:
     * 0xfb01 -> bytes 0x01 and 0xfb (1 and -5), 0x1414 -> 20 and 20,
     * 0x01fb -> bytes 0xfb and 0x01 (-5 and 1).
     * NOTE(review): 0xfb01 (64257) does not fit in int16_t, so this
     * initialiser relies on implementation-defined narrowing to keep the
     * bit pattern.
     */
158  const int16_t filt_const0 = 0xfb01;
159  const int16_t filt_const1 = 0x1414;
160  const int16_t filt_const2 = 0x1fb;
161  v16u8 out0, out1;
162  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164  v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167  v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169  v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
170 
    /* Replicate each packed coefficient pair across a whole vector. */
171  filt0 = (v16i8) __msa_fill_h(filt_const0);
172  filt1 = (v16i8) __msa_fill_h(filt_const1);
173  filt2 = (v16i8) __msa_fill_h(filt_const2);
174 
    /* Offset 0 selects the "8 width" shuffle rows of luma_mask_arr. */
175  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* First five rows of the vertical-filter input. */
176  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
177  src_y += (5 * stride);
178 
    /* NOTE(review): presumably flips the sign bit so pixels can be treated
     * as signed bytes; PCKEV_XORI128_UB would undo it - confirm. */
179  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
180 
    /* ---- Rows 0..3: horizontal pass, one source row per call. ---- */
181  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
182  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
183  src_x += (4 * stride);
184 
185  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
186  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
187  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
188  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
189 
    /* Round with a shift of 5, then saturate. */
190  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
192 
    /* Vertical-filter rows 5..8. */
193  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
194  src_y += (4 * stride);
195  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
196 
    /* Interleave each row with its successor so one dot product consumes a
     * row pair together with one packed coefficient pair. */
197  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198  src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200  src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
    /* Vertical pass: output row k uses the pairs starting at input row k. */
201  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
202  filt2);
203  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
204  filt2);
205  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
206  filt2);
207  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
208  filt2);
209  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
211 
    /* Rounded average of the two passes: (hz + vt + 1) >> 1. */
212  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
216 
    /* The horizontal input for rows 4..7 is loaded before the first batch
     * is stored. */
217  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
218  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
219 
220  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
221  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
222  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    /* Store doubleword elements 0 and 1 of out0 and out1: four 8-byte rows. */
223  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
224  dst += (4 * stride);
225 
    /* ---- Rows 4..7: vertical-filter rows 9..12. ---- */
226  LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
227  XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
228 
229  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
230  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
231  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
232  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
233 
234  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
236 
    /* Only the new row pairs are interleaved; the 54/65/76/87 pairs from
     * the first batch are reused below. */
237  ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238  src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
239  src_vt1211_r);
240  vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
241  filt2);
242  vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
243  filt2);
244  vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
245  filt2);
246  vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
247  filt1, filt2);
248  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
250 
251  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
255 
256  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
257  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
258  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
259  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
260 }
261 
/*
 * Write a 16x16 block that is the rounded average of a horizontally
 * filtered block and a vertically filtered block.
 *
 * src_x  : first of the 16 rows fed to the horizontal filter.
 * src_y  : first of the 21 rows (5 + 4 * 4) fed to the vertical filter.
 * dst    : destination; sixteen rows are written, each `stride` bytes apart.
 * stride : row pitch in bytes, used for src_x, src_y and dst alike.
 *
 * The block is processed as two 8-column halves (outer loop). Each half is
 * produced four rows at a time (inner loop), carrying the last five
 * vertical-filter rows over to the next iteration.
 *
 * NOTE(review): any offset needed to centre the 6-tap windows must already
 * be applied to src_x and src_y by the caller; none is applied here.
 */
262 static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
263  const uint8_t *src_y, uint8_t *dst,
264  int32_t stride)
265 {
    /*
     * Vertical-filter coefficients packed two bytes per halfword:
     * 0xfb01 -> bytes 0x01 and 0xfb (1 and -5), 0x1414 -> 20 and 20,
     * 0x01fb -> bytes 0xfb and 0x01 (-5 and 1).
     * NOTE(review): 0xfb01 (64257) does not fit in int16_t, so this
     * initialiser relies on implementation-defined narrowing to keep the
     * bit pattern.
     */
266  const int16_t filt_const0 = 0xfb01;
267  const int16_t filt_const1 = 0x1414;
268  const int16_t filt_const2 = 0x1fb;
    /* Column cursors: advanced by 8 after each half is finished. */
269  const uint8_t *src_x_tmp = src_x;
270  const uint8_t *src_y_tmp = src_y;
271  uint8_t *dst_tmp = dst;
272  uint32_t multiple8_cnt, loop_cnt;
273  v16u8 tmp0, tmp1;
274  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
275  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
276  v16i8 src_vt7, src_vt8;
277  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
278  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
279  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
280  v8i16 vt_out3, out0, out1, out2, out3;
281 
    /* Replicate each packed coefficient pair across a whole vector. */
282  filt0 = (v16i8) __msa_fill_h(filt_const0);
283  filt1 = (v16i8) __msa_fill_h(filt_const1);
284  filt2 = (v16i8) __msa_fill_h(filt_const2);
285 
    /* Offset 0 selects the "8 width" shuffle rows of luma_mask_arr. */
286  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
287 
    /* Two passes: columns 0..7, then columns 8..15. */
288  for (multiple8_cnt = 2; multiple8_cnt--;) {
289  src_x = src_x_tmp;
290  src_y = src_y_tmp;
291  dst = dst_tmp;
292 
    /* Prime the vertical filter with the first five rows of this half. */
293  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
294  src_y += (5 * stride);
295 
    /* NOTE(review): presumably flips the sign bit so pixels can be treated
     * as signed bytes; PCKEV_XORI128_UB would undo it - confirm. */
296  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
297 
    /* Four iterations of four output rows each. */
298  for (loop_cnt = 4; loop_cnt--;) {
    /* Horizontal pass, one source row per call. */
299  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
300  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
301  src_x += (4 * stride);
302 
303  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
304  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
305  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
306  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
    /* Round with a shift of 5, then saturate. */
307  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
308  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
309 
    /* Four new vertical-filter rows for this iteration. */
310  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
311  src_y += (4 * stride);
312 
313  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
    /* Interleave each row with its successor so one dot product consumes
     * a row pair together with one packed coefficient pair. */
314  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
315  src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
316  src_vt43_r);
317  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
318  src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
319  src_vt87_r);
    /* Vertical pass: output row k uses the pairs starting at row k. */
320  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
321  filt1, filt2);
322  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
323  filt1, filt2);
324  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
325  filt1, filt2);
326  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
327  filt1, filt2);
328  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
329  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
330 
    /* Rounded average of the two passes: (hz + vt + 1) >> 1. */
331  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
332  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
333  out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
334  out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
335 
336  SAT_SH4_SH(out0, out1, out2, out3, 7);
337  tmp0 = PCKEV_XORI128_UB(out0, out1);
338  tmp1 = PCKEV_XORI128_UB(out2, out3);
    /* Store doubleword elements 0 and 1 of tmp0 and tmp1: four 8-byte rows. */
339  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
340  dst += (4 * stride);
341 
    /* Slide the window: the last five rows become the first five of the
     * next iteration, so they are not reloaded. */
342  src_vt0 = src_vt4;
343  src_vt1 = src_vt5;
344  src_vt2 = src_vt6;
345  src_vt3 = src_vt7;
346  src_vt4 = src_vt8;
347  }
348 
    /* Move on to the right-hand 8 columns. */
349  src_x_tmp += 8;
350  src_y_tmp += 8;
351  dst_tmp += 8;
352  }
353 }
354 
356  const uint8_t *src_y,
357  uint8_t *dst,
358  int32_t stride)
359 {
360  uint32_t tp0, tp1, tp2, tp3;
361  const int16_t filt_const0 = 0xfb01;
362  const int16_t filt_const1 = 0x1414;
363  const int16_t filt_const2 = 0x1fb;
364  v16u8 res, dst0 = { 0 };
365  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
366  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
367  v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
368  v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
369  v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
370 
371  filt0 = (v16i8) __msa_fill_h(filt_const0);
372  filt1 = (v16i8) __msa_fill_h(filt_const1);
373  filt2 = (v16i8) __msa_fill_h(filt_const2);
374 
375  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
376 
377  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
378  src_y += (5 * stride);
379 
380  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
381  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
382  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
383  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
384 
385  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
386 
387  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
388  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
389  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
390  hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
391 
392  SRARI_H2_SH(hz_out0, hz_out1, 5);
393  SAT_SH2_SH(hz_out0, hz_out1, 7);
394 
395  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
396 
397  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
398  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
399  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
400  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
401 
402  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
403  ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
404  ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
405  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
406  filt2);
407  vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
408  filt2);
409  SRARI_H2_SH(vt_out0, vt_out1, 5);
410  SAT_SH2_SH(vt_out0, vt_out1, 7);
411  LW4(dst, stride, tp0, tp1, tp2, tp3);
412  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
413 
414  res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
415  res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
416 
417  SAT_SH2_SH(res0, res1, 7);
418  res = PCKEV_XORI128_UB(res0, res1);
419  dst0 = __msa_aver_u_b(res, dst0);
420 
421  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
422 }
423 
425  const uint8_t *src_y,
426  uint8_t *dst,
427  int32_t stride)
428 {
429  const int16_t filt_const0 = 0xfb01;
430  const int16_t filt_const1 = 0x1414;
431  const int16_t filt_const2 = 0x1fb;
432  uint64_t tp0, tp1, tp2, tp3;
433  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
434  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
435  v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
436  v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
437  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
438  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
439  v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
440  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
441  v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
442 
443  filt0 = (v16i8) __msa_fill_h(filt_const0);
444  filt1 = (v16i8) __msa_fill_h(filt_const1);
445  filt2 = (v16i8) __msa_fill_h(filt_const2);
446 
447  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
448  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
449  src_y += (5 * stride);
450 
451  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
452 
453  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
454  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
455  src_x += (4 * stride);
456 
457  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
458  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
459  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
460  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
461 
462  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
463  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
464 
465  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
466  src_y += (4 * stride);
467  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
468 
469  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
470  src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
471  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
472  src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
473  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
474  filt2);
475  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
476  filt2);
477  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
478  filt2);
479  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
480  filt2);
481  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
482  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
483 
484  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
485  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
486  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
487  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
488 
489  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
490  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
491 
492  LD4(dst, stride, tp0, tp1, tp2, tp3);
493  INSERT_D2_UB(tp0, tp1, dst0);
494  INSERT_D2_UB(tp2, tp3, dst1);
495 
496  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
497  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
498  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
499  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
500  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
501  dst += (4 * stride);
502 
503  LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
504  XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
505 
506  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
507  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
508  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
509  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
510 
511  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
512  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
513 
514  ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
515  src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
516  src_vt1211_r);
517  vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
518  filt2);
519  vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
520  filt2);
521  vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
522  filt2);
523  vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
524  filt1, filt2);
525  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
526  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
527 
528  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
529  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
530  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
531  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
532 
533  LD4(dst, stride, tp0, tp1, tp2, tp3);
534  INSERT_D2_UB(tp0, tp1, dst0);
535  INSERT_D2_UB(tp2, tp3, dst1);
536 
537  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
538  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
539  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
540  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
541  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
542 }
543 
545  const uint8_t *src_y,
546  uint8_t *dst,
547  int32_t stride)
548 {
549  const int16_t filt_const0 = 0xfb01;
550  const int16_t filt_const1 = 0x1414;
551  const int16_t filt_const2 = 0x1fb;
552  const uint8_t *src_x_tmp = src_x;
553  const uint8_t *src_y_tmp = src_y;
554  uint8_t *dst_tmp = dst;
555  uint32_t multiple8_cnt, loop_cnt;
556  uint64_t tp0, tp1, tp2, tp3;
557  v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
558  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
559  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
560  v16i8 src_vt7, src_vt8;
561  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
562  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
563  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
564  v8i16 vt_out3, out0, out1, out2, out3;
565 
566  filt0 = (v16i8) __msa_fill_h(filt_const0);
567  filt1 = (v16i8) __msa_fill_h(filt_const1);
568  filt2 = (v16i8) __msa_fill_h(filt_const2);
569 
570  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
571 
572  for (multiple8_cnt = 2; multiple8_cnt--;) {
573  src_x = src_x_tmp;
574  src_y = src_y_tmp;
575  dst = dst_tmp;
576 
577  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
578  src_y += (5 * stride);
579 
580  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
581 
582  for (loop_cnt = 4; loop_cnt--;) {
583  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
584  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
585  src_x += (4 * stride);
586 
587  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
588  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
589  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
590  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
591  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
592  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
593 
594  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
595  src_y += (4 * stride);
596 
597  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
598  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
599  src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
600  src_vt43_r);
601  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
602  src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
603  src_vt87_r);
604  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
605  filt1, filt2);
606  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
607  filt1, filt2);
608  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
609  filt1, filt2);
610  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
611  filt1, filt2);
612  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
613  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
614 
615  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
616  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
617  out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
618  out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
619 
620  LD4(dst, stride, tp0, tp1, tp2, tp3);
621  INSERT_D2_UB(tp0, tp1, dst0);
622  INSERT_D2_UB(tp2, tp3, dst1);
623 
624  SAT_SH4_SH(out0, out1, out2, out3, 7);
625  tmp0 = PCKEV_XORI128_UB(out0, out1);
626  tmp1 = PCKEV_XORI128_UB(out2, out3);
627  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
628  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
629  dst += (4 * stride);
630 
631  src_vt0 = src_vt4;
632  src_vt1 = src_vt5;
633  src_vt2 = src_vt6;
634  src_vt3 = src_vt7;
635  src_vt4 = src_vt8;
636  }
637 
638  src_x_tmp += 8;
639  src_y_tmp += 8;
640  dst_tmp += 8;
641  }
642 }
643 
645  ptrdiff_t stride)
646 {
647  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
648  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
649 
650  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
651  src += (8 * stride);
652  LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
653 
654  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
655  dst += (8 * stride);
656  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
657 }
658 
660  ptrdiff_t stride)
661 {
662  uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
663 
664  LD4(src, stride, src0, src1, src2, src3);
665  src += 4 * stride;
666  LD4(src, stride, src4, src5, src6, src7);
667  SD4(src0, src1, src2, src3, dst, stride);
668  dst += 4 * stride;
669  SD4(src4, src5, src6, src7, dst, stride);
670 }
671 
673  ptrdiff_t stride)
674 {
675  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
676  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
677 
678  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
679  src += (8 * stride);
680  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
681 
682  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
683  dst2, dst3);
684  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
685  dst6, dst7);
686  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
687  dst += (8 * stride);
688 
689  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
690  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
691 
692  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
693  dst2, dst3);
694  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
695  dst6, dst7);
696  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
697 }
698 
700  ptrdiff_t stride)
701 {
702  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
703  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
704  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
705 
706  LD4(src, stride, tp0, tp1, tp2, tp3);
707  src += 4 * stride;
708  LD4(src, stride, tp4, tp5, tp6, tp7);
709  INSERT_D2_UB(tp0, tp1, src0);
710  INSERT_D2_UB(tp2, tp3, src1);
711  INSERT_D2_UB(tp4, tp5, src2);
712  INSERT_D2_UB(tp6, tp7, src3);
713 
714  LD4(dst, stride, tp0, tp1, tp2, tp3);
715  LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
716  INSERT_D2_UB(tp0, tp1, dst0);
717  INSERT_D2_UB(tp2, tp3, dst1);
718  INSERT_D2_UB(tp4, tp5, dst2);
719  INSERT_D2_UB(tp6, tp7, dst3);
720 
721  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
722  dst2, dst3);
723 
724  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
725 }
726 
728  ptrdiff_t stride)
729 {
730  uint32_t tp0, tp1, tp2, tp3;
731  v16u8 src0 = { 0 }, dst0 = { 0 };
732 
733  LW4(src, stride, tp0, tp1, tp2, tp3);
734  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
735  LW4(dst, stride, tp0, tp1, tp2, tp3);
736  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
737 
738  dst0 = __msa_aver_u_b(src0, dst0);
739 
740  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
741 }
742 
744  ptrdiff_t stride)
745 {
746  uint32_t loop_cnt;
747  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
748  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
749  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
750  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
751  v16i8 minus5b = __msa_ldi_b(-5);
752  v16i8 plus20b = __msa_ldi_b(20);
753 
754  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
755  mask3 = mask0 + 8;
756  mask4 = mask1 + 8;
757  mask5 = mask2 + 8;
758  src -= 2;
759 
760  for (loop_cnt = 4; loop_cnt--;) {
761  LD_SB2(src, 16, src0, src1);
762  src += stride;
763  LD_SB2(src, 16, src2, src3);
764  src += stride;
765  LD_SB2(src, 16, src4, src5);
766  src += stride;
767  LD_SB2(src, 16, src6, src7);
768  src += stride;
769 
770  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
771  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
772  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
773  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
774  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
775  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
776  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
777  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
778  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
779  minus5b, res0, res1, res2, res3);
780  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
781  plus20b, res0, res1, res2, res3);
782  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
783  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
784  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
785  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
786  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
787  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
788  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
789  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
790  minus5b, res4, res5, res6, res7);
791  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
792  plus20b, res4, res5, res6, res7);
793  SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
794  src0, src2, src4, src6);
795  SRARI_H4_SH(res0, res1, res2, res3, 5);
796  SRARI_H4_SH(res4, res5, res6, res7, 5);
797  SAT_SH4_SH(res0, res1, res2, res3, 7);
798  SAT_SH4_SH(res4, res5, res6, res7, 7);
799  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
800  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
801  dst0 = __msa_aver_s_b(dst0, src0);
802  dst1 = __msa_aver_s_b(dst1, src2);
803  dst2 = __msa_aver_s_b(dst2, src4);
804  dst3 = __msa_aver_s_b(dst3, src6);
805  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
806  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
807  dst += (4 * stride);
808  }
809 }
810 
812  ptrdiff_t stride)
813 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel16_mc30_msa -- confirm against upstream.
  *
  * 16-wide horizontal 6-tap filter over 16 rows (4 iterations x 4 rows),
  * averaged with source bytes slid by 3 and stored to dst.
  */
814  uint32_t loop_cnt;
815  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
816  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
817  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
818  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
819  v16i8 minus5b = __msa_ldi_b(-5);
820  v16i8 plus20b = __msa_ldi_b(20);
821
 /*
  * "8 width" shuffle masks from luma_mask_arr: mask0 pairs bytes (i, i+5),
  * mask1 pairs (i+1, i+4), mask2 pairs (i+2, i+3). mask3..5 are the same
  * patterns with every index raised by 8, used for the right-hand 8 outputs.
  */
822  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
823  mask3 = mask0 + 8;
824  mask4 = mask1 + 8;
825  mask5 = mask2 + 8;
 /* Back up two bytes so the filter window starts left of the output pixel. */
826  src -= 2;
827
828  for (loop_cnt = 4; loop_cnt--;) {
 /* Four rows per iteration; each row is read as two loads 16 bytes apart. */
829  LD_SB2(src, 16, src0, src1);
830  src += stride;
831  LD_SB2(src, 16, src2, src3);
832  src += stride;
833  LD_SB2(src, 16, src4, src5);
834  src += stride;
835  LD_SB2(src, 16, src6, src7);
836  src += stride;
837
 /* Presumably flips the sign bit so bytes can be treated as signed -- see macro. */
838  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0 and 1: gather tap pairs, then accumulate 1 / -5 / 20 weighted sums. */
839  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
840  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
841  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
842  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
843  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
844  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
845  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
846  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
847  minus5b, res0, res1, res2, res3);
848  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
849  plus20b, res0, res1, res2, res3);
 /* Rows 2 and 3: same sequence, results go to res4..res7. */
850  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
851  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
852  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
853  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
854  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
855  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
856  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
857  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
858  minus5b, res4, res5, res6, res7);
859  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
860  plus20b, res4, res5, res6, res7);
 /*
  * Slide each row by 3 bytes; with src already moved back by 2 this
  * presumably lines up the pixel one to the right of the output position.
  */
861  SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
862  src0, src2, src4, src6);
 /* Round-shift by 5 and saturate (macro semantics defined outside this view). */
863  SRARI_H4_SH(res0, res1, res2, res3, 5);
864  SRARI_H4_SH(res4, res5, res6, res7, 5);
865  SAT_SH4_SH(res0, res1, res2, res3, 7);
866  SAT_SH4_SH(res4, res5, res6, res7, 7);
 /* Pack the halfword results back to bytes, one 16-byte vector per row. */
867  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
868  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
 /* Signed average of filtered result and the slid source row. */
869  dst0 = __msa_aver_s_b(dst0, src0);
870  dst1 = __msa_aver_s_b(dst1, src2);
871  dst2 = __msa_aver_s_b(dst2, src4);
872  dst3 = __msa_aver_s_b(dst3, src6);
 /* Undo the earlier XOR-with-128 bias before storing. */
873  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
874  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
875  dst += (4 * stride);
876  }
877 }
878 
880  ptrdiff_t stride)
881 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel8_mc10_msa -- confirm against upstream.
  *
  * 8-wide horizontal 6-tap filter over 8 rows, averaged with source bytes
  * slid by 2, written as eight 8-byte rows. No loop: all rows are handled
  * in one straight-line pass.
  */
882  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
883  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
884  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
885  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
886  v16i8 minus5b = __msa_ldi_b(-5);
887  v16i8 plus20b = __msa_ldi_b(20);
888
 /* "8 width" masks: byte pairs (i, i+5), (i+1, i+4), (i+2, i+3). */
889  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Eight rows, each starting two bytes left of src. */
890  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
891  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0..3 -> res0..res3. */
892  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
893  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
894  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
895  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
896  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
897  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
898  res0, res1, res2, res3);
899  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
900  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
901  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
902  res0, res1, res2, res3);
 /* Rows 4..7 -> res4..res7. */
903  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
904  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
905  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
906  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
907  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
908  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
909  res4, res5, res6, res7);
910  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
911  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
912  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
913  res4, res5, res6, res7);
 /*
  * Slide every row by 2 bytes (undoing the src - 2 load offset, so this
  * presumably lines up the pixel at the output position itself).
  */
914  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
915  src0, src1, src2, src3);
916  SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
917  src4, src5, src6, src7);
 /* Merge row pairs so each vector holds two 8-byte rows (rows 0/1, 2/3, ...). */
918  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
919  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
920  SRARI_H4_SH(res0, res1, res2, res3, 5);
921  SRARI_H4_SH(res4, res5, res6, res7, 5);
922  SAT_SH4_SH(res0, res1, res2, res3, 7);
923  SAT_SH4_SH(res4, res5, res6, res7, 7);
924  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
925  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
 /* Signed average of filtered rows with the slid source rows. */
926  tmp0 = __msa_aver_s_b(tmp0, src0);
927  tmp1 = __msa_aver_s_b(tmp1, src1);
928  tmp2 = __msa_aver_s_b(tmp2, src4);
929  tmp3 = __msa_aver_s_b(tmp3, src5);
 /* Undo the XOR-with-128 bias, then store both doublewords of each vector. */
930  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
931  ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
932 }
933 
935  ptrdiff_t stride)
936 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel8_mc30_msa -- confirm against upstream.
  *
  * 8-wide horizontal 6-tap filter over 8 rows, averaged with source bytes
  * slid by 3 (one byte further right than the slide-by-2 variant that
  * precedes it in this file), written as eight 8-byte rows.
  */
937  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
938  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
939  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
940  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
941  v16i8 minus5b = __msa_ldi_b(-5);
942  v16i8 plus20b = __msa_ldi_b(20);
943
 /* "8 width" masks: byte pairs (i, i+5), (i+1, i+4), (i+2, i+3). */
944  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Eight rows, each starting two bytes left of src. */
945  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
946  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0..3 -> res0..res3. */
947  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
948  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
949  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
950  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
951  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
952  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
953  res0, res1, res2, res3);
954  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
955  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
956  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
957  res0, res1, res2, res3);
 /* Rows 4..7 -> res4..res7. */
958  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
959  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
960  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
961  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
962  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
963  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
964  res4, res5, res6, res7);
965  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
966  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
967  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
968  res4, res5, res6, res7);
 /*
  * Slide every row by 3 bytes; relative to the src - 2 load this presumably
  * lines up the pixel one to the right of the output position.
  */
969  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
970  src0, src1, src2, src3);
971  SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
972  src4, src5, src6, src7);
 /* Merge row pairs so each vector holds two 8-byte rows (rows 0/1, 2/3, ...). */
973  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
974  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
975  SRARI_H4_SH(res0, res1, res2, res3, 5);
976  SRARI_H4_SH(res4, res5, res6, res7, 5);
977  SAT_SH4_SH(res0, res1, res2, res3, 7);
978  SAT_SH4_SH(res4, res5, res6, res7, 7);
979  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
980  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
 /* Signed average of filtered rows with the slid source rows. */
981  tmp0 = __msa_aver_s_b(tmp0, src0);
982  tmp1 = __msa_aver_s_b(tmp1, src1);
983  tmp2 = __msa_aver_s_b(tmp2, src4);
984  tmp3 = __msa_aver_s_b(tmp3, src5);
 /* Undo the XOR-with-128 bias, then store both doublewords of each vector. */
985  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
986  ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
987 }
988 
990  ptrdiff_t stride)
991 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel4_mc10_msa -- confirm against upstream.
  *
  * 4-wide horizontal 6-tap filter over 4 rows, averaged with source bytes
  * slid by 2, written as four 4-byte rows.
  */
992  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
993  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
994  v8i16 res0, res1;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
995  v16i8 minus5b = __msa_ldi_b(-5);
996  v16i8 plus20b = __msa_ldi_b(20);
997
 /*
  * Offset 48 selects the "4 width" masks of luma_mask_arr. Their upper
  * eight indices are 16 or larger, so one shuffle presumably draws from
  * both input vectors, i.e. handles two rows at once.
  */
998  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
999  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1000  XORI_B4_128_SB(src0, src1, src2, src3);
 /* res0 accumulates rows 0/1, res1 accumulates rows 2/3. */
1001  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1002  HADD_SB2_SH(vec0, vec1, res0, res1);
1003  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1004  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1005  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1006  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1007  SRARI_H2_SH(res0, res1, 5);
1008  SAT_SH2_SH(res0, res1, 7);
 /* All four filtered rows packed into one byte vector. */
1009  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
 /* Slide by 2 to undo the src - 2 load offset. */
1010  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
1011  src0, src1, src2, src3);
 /* Gather the first word of each slid row into one vector: rows 0,1,2,3. */
1012  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1013  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1014  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1015  res = __msa_aver_s_b(res, src0);
 /* Undo the XOR-with-128 bias before storing one word per row. */
1016  res = (v16i8) __msa_xori_b((v16u8) res, 128);
1017  ST_W4(res, 0, 1, 2, 3, dst, stride);
1018 }
1019 
1021  ptrdiff_t stride)
1022 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel4_mc30_msa -- confirm against upstream.
  *
  * 4-wide horizontal 6-tap filter over 4 rows, averaged with source bytes
  * slid by 3, written as four 4-byte rows.
  */
1023  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
1024  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1025  v8i16 res0, res1;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
1026  v16i8 minus5b = __msa_ldi_b(-5);
1027  v16i8 plus20b = __msa_ldi_b(20);
1028
 /*
  * Offset 48 selects the "4 width" masks of luma_mask_arr. Their upper
  * eight indices are 16 or larger, so one shuffle presumably draws from
  * both input vectors, i.e. handles two rows at once.
  */
1029  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1030  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1031  XORI_B4_128_SB(src0, src1, src2, src3);
 /* res0 accumulates rows 0/1, res1 accumulates rows 2/3. */
1032  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1033  HADD_SB2_SH(vec0, vec1, res0, res1);
1034  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1035  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1036  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1037  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1038  SRARI_H2_SH(res0, res1, 5);
1039  SAT_SH2_SH(res0, res1, 7);
 /* All four filtered rows packed into one byte vector. */
1040  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
 /* Slide by 3: one byte past the position the src - 2 offset would restore. */
1041  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
1042  src0, src1, src2, src3);
 /* Gather the first word of each slid row into one vector: rows 0,1,2,3. */
1043  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1044  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1045  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1046  res = __msa_aver_s_b(res, src0);
 /* Undo the XOR-with-128 bias before storing one word per row. */
1047  res = (v16i8) __msa_xori_b((v16u8) res, 128);
1048  ST_W4(res, 0, 1, 2, 3, dst, stride);
1049 }
1050 
1052  ptrdiff_t stride)
1053 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel16_mc20_msa -- confirm against upstream.
  *
  * 16-wide horizontal 6-tap filter over 16 rows (4 iterations x 4 rows).
  * Unlike the averaging variants earlier in this file, the filtered result
  * is stored directly with no blend against the source.
  */
1054  uint32_t loop_cnt;
1055  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1056  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1057  v16i8 vec11;
1058  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
1059  v16i8 minus5b = __msa_ldi_b(-5);
1060  v16i8 plus20b = __msa_ldi_b(20);
1061
 /* "8 width" masks: byte pairs (i, i+5), (i+1, i+4), (i+2, i+3). */
1062  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Back up two bytes so the filter window starts left of the output pixel. */
1063  src -= 2;
1064
1065  for (loop_cnt = 4; loop_cnt--;) {
 /*
  * Each row is read as two overlapping loads 8 bytes apart, so each
  * vector can be shuffled against itself for one 8-pixel half.
  */
1066  LD_SB2(src, 8, src0, src1);
1067  src += stride;
1068  LD_SB2(src, 8, src2, src3);
1069  src += stride;
1070  LD_SB2(src, 8, src4, src5);
1071  src += stride;
1072  LD_SB2(src, 8, src6, src7);
1073  src += stride;
1074
1075  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0 and 1 -> res0..res3 (left half, right half per row). */
1076  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1077  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1078  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1079  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1080  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1081  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1082  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1083  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1084  minus5b, res0, res1, res2, res3);
1085  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1086  plus20b, res0, res1, res2, res3);
 /* Rows 2 and 3 -> res4..res7. */
1087  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1088  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1089  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1090  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1091  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1092  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1093  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1094  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1095  minus5b, res4, res5, res6, res7);
1096  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1097  plus20b, res4, res5, res6, res7);
 /* Round-shift by 5, saturate, pack to bytes, undo the 128 bias, store. */
1098  SRARI_H4_SH(res0, res1, res2, res3, 5);
1099  SRARI_H4_SH(res4, res5, res6, res7, 5);
1100  SAT_SH4_SH(res0, res1, res2, res3, 7);
1101  SAT_SH4_SH(res4, res5, res6, res7, 7);
 /* vec0..vec3 are reused here as the four packed output rows. */
1102  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1103  vec2, vec3);
1104  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1105  ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
1106  dst += (4 * stride);
1107  }
1108 }
1109 
1111  ptrdiff_t stride)
1112 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel8_mc20_msa -- confirm against upstream.
  *
  * 8-wide horizontal 6-tap filter over 8 rows; the filtered result is
  * stored directly (no averaging with the source).
  */
1113  v16u8 out0, out1, out2, out3;
1114  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1115  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1116  v16i8 vec11;
1117  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
1118  v16i8 minus5b = __msa_ldi_b(-5);
1119  v16i8 plus20b = __msa_ldi_b(20);
1120
 /* "8 width" masks: byte pairs (i, i+5), (i+1, i+4), (i+2, i+3). */
1121  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Eight rows, each starting two bytes left of src. */
1122  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1123  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0..3 -> res0..res3. */
1124  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1125  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1126  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1127  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1128  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1129  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1130  res0, res1, res2, res3);
1131  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1132  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1133  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1134  plus20b, res0, res1, res2, res3);
 /* Rows 4..7 -> res4..res7. */
1135  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1136  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1137  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1138  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1139  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1140  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1141  res4, res5, res6, res7);
1142  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1143  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1144  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1145  plus20b, res4, res5, res6, res7);
1146  SRARI_H4_SH(res0, res1, res2, res3, 5);
1147  SRARI_H4_SH(res4, res5, res6, res7, 5);
1148  SAT_SH4_SH(res0, res1, res2, res3, 7);
1149  SAT_SH4_SH(res4, res5, res6, res7, 7);
 /*
  * PCKEV_XORI128_UB is defined outside this view; by name it packs two
  * halfword vectors to bytes and undoes the 128 bias -- TODO confirm.
  */
1150  out0 = PCKEV_XORI128_UB(res0, res1);
1151  out1 = PCKEV_XORI128_UB(res2, res3);
1152  out2 = PCKEV_XORI128_UB(res4, res5);
1153  out3 = PCKEV_XORI128_UB(res6, res7);
 /* Both doublewords of each of the four vectors: eight 8-byte rows. */
1154  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1155 }
1156 
1158  ptrdiff_t stride)
1159 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel4_mc20_msa -- confirm against upstream.
  *
  * 4-wide horizontal 6-tap filter over 4 rows; the filtered result is
  * stored directly (no averaging with the source).
  */
1160  v16u8 out;
1161  v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
1162  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1163  v8i16 res0, res1;
 /* Tap weights; the outer pair has weight 1 and is summed with HADD below. */
1164  v16i8 minus5b = __msa_ldi_b(-5);
1165  v16i8 plus20b = __msa_ldi_b(20);
1166
 /* Offset 48 selects the "4 width" masks (two rows per shuffle, presumably). */
1167  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1168  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1169  XORI_B4_128_SB(src0, src1, src2, src3);
 /* res0 accumulates rows 0/1, res1 accumulates rows 2/3. */
1170  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1171  HADD_SB2_SH(vec0, vec1, res0, res1);
1172  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1173  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1174  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1175  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1176  SRARI_H2_SH(res0, res1, 5);
1177  SAT_SH2_SH(res0, res1, 7);
 /* Pack to bytes and undo the 128 bias (macro defined outside this view). */
1178  out = PCKEV_XORI128_UB(res0, res1);
 /* One 32-bit word per output row. */
1179  ST_W4(out, 0, 1, 2, 3, dst, stride);
1180 }
1181 
1183  ptrdiff_t stride)
1184 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel16_mc01_msa -- confirm against upstream.
  *
  * 16-wide vertical 6-tap filter over 16 rows (4 iterations x 4 rows),
  * averaged with the source rows at offsets 0..3 of each group and stored
  * to dst.
  */
1185  int32_t loop_cnt;
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1186  int16_t filt_const0 = 0xfb01;
1187  int16_t filt_const1 = 0x1414;
1188  int16_t filt_const2 = 0x1fb;
1189  v16u8 res0, res1, res2, res3;
1190  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1191  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1192  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1193  v16i8 src65_l, src87_l, filt0, filt1, filt2;
1194  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1195
1196  filt0 = (v16i8) __msa_fill_h(filt_const0);
1197  filt1 = (v16i8) __msa_fill_h(filt_const1);
1198  filt2 = (v16i8) __msa_fill_h(filt_const2);
1199
 /* Start two rows above the first output row; src0 is row -2, src2 is row 0. */
1200  src -= (stride * 2);
1201
 /* Prime the window with the first five rows. */
1202  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1203  src += (5 * stride);
1204
1205  XORI_B5_128_SB(src0, src1, src2, src3, src4);
 /* srcBA_r / srcBA_l: rows A and B byte-interleaved, right and left halves. */
1206  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1207  src32_r, src43_r);
1208  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1209  src32_l, src43_l);
1210
1211  for (loop_cnt = 4; loop_cnt--;) {
 /* Four new rows extend the window to nine rows (src0..src8). */
1212  LD_SB4(src, stride, src5, src6, src7, src8);
1213  src += (4 * stride);
1214
1215  XORI_B4_128_SB(src5, src6, src7, src8);
1216  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1217  src65_r, src76_r, src87_r);
1218  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1219  src65_l, src76_l, src87_l);
 /* Each output row combines three interleaved row pairs with filt0..2. */
1220  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1221  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1222  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1223  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1224  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1225  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1226  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1227  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1228  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1229  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1230  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1231  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1232  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1233  out3_r, res0, res1, res2, res3);
 /* Average with the rows at the output positions (src2..src5). */
1234  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1235  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1236  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1237  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
 /* Undo the XOR-with-128 bias before storing. */
1238  XORI_B4_128_UB(res0, res1, res2, res3);
1239  ST_UB4(res0, res1, res2, res3, dst, stride);
1240  dst += (4 * stride);
1241
 /* Slide the window down four rows for the next iteration. */
1242  src10_r = src54_r;
1243  src32_r = src76_r;
1244  src21_r = src65_r;
1245  src43_r = src87_r;
1246  src10_l = src54_l;
1247  src32_l = src76_l;
1248  src21_l = src65_l;
1249  src43_l = src87_l;
 /* Keep the rows still needed for averaging and interleaving. */
1250  src2 = src6;
1251  src3 = src7;
1252  src4 = src8;
1253  }
1254 }
1255 
1257  ptrdiff_t stride)
1258 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel16_mc03_msa -- confirm against upstream.
  *
  * 16-wide vertical 6-tap filter over 16 rows (4 iterations x 4 rows),
  * averaged with the source rows one below each output row and stored
  * to dst.
  */
1259  int32_t loop_cnt;
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1260  int16_t filt_const0 = 0xfb01;
1261  int16_t filt_const1 = 0x1414;
1262  int16_t filt_const2 = 0x1fb;
1263  v16u8 res0, res1, res2, res3;
1264  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1265  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1266  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1267  v16i8 src65_l, src87_l, filt0, filt1, filt2;
1268  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1269
1270  filt0 = (v16i8) __msa_fill_h(filt_const0);
1271  filt1 = (v16i8) __msa_fill_h(filt_const1);
1272  filt2 = (v16i8) __msa_fill_h(filt_const2);
1273
 /* Start two rows above the first output row; src0 is row -2, src3 is row +1. */
1274  src -= (stride * 2);
1275
 /* Prime the window with the first five rows. */
1276  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1277  src += (5 * stride);
1278
1279  XORI_B5_128_SB(src0, src1, src2, src3, src4);
 /* srcBA_r / srcBA_l: rows A and B byte-interleaved, right and left halves. */
1280  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1281  src32_r, src43_r);
1282  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1283  src32_l, src43_l);
1284
1285  for (loop_cnt = 4; loop_cnt--;) {
 /* Four new rows extend the window to nine rows (src0..src8). */
1286  LD_SB4(src, stride, src5, src6, src7, src8);
1287  src += (4 * stride);
1288
1289  XORI_B4_128_SB(src5, src6, src7, src8);
1290  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1291  src65_r, src76_r, src87_r);
1292  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1293  src65_l, src76_l, src87_l);
 /* Each output row combines three interleaved row pairs with filt0..2. */
1294  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1295  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1296  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1297  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1298  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1299  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1300  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1301  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1302  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1303  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1304  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1305  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1306  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1307  out3_r, res0, res1, res2, res3);
 /* Average with the rows one below the output positions (src3..src6). */
1308  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1309  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1310  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1311  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
 /* Undo the XOR-with-128 bias before storing. */
1312  XORI_B4_128_UB(res0, res1, res2, res3);
1313  ST_UB4(res0, res1, res2, res3, dst, stride);
1314  dst += (4 * stride);
1315
 /* Slide the window down four rows for the next iteration. */
1316  src10_r = src54_r;
1317  src32_r = src76_r;
1318  src21_r = src65_r;
1319  src43_r = src87_r;
1320  src10_l = src54_l;
1321  src32_l = src76_l;
1322  src21_l = src65_l;
1323  src43_l = src87_l;
 /* Only src3 and src4 are read before being reloaded, so only they carry over. */
1324  src3 = src7;
1325  src4 = src8;
1326  }
1327 }
1328 
1330  ptrdiff_t stride)
1331 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel8_mc01_msa -- confirm against upstream.
  *
  * 8-wide vertical 6-tap filter over 8 rows in one straight-line pass,
  * averaged with the source rows at the output positions (src2..src9).
  */
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1332  const int16_t filt_const0 = 0xfb01;
1333  const int16_t filt_const1 = 0x1414;
1334  const int16_t filt_const2 = 0x1fb;
1335  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1336  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1337  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1338  v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1339  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1340
1341  filt0 = (v16i8) __msa_fill_h(filt_const0);
1342  filt1 = (v16i8) __msa_fill_h(filt_const1);
1343  filt2 = (v16i8) __msa_fill_h(filt_const2);
1344
 /* Start two rows above the first output row; src0 is row -2, src2 is row 0. */
1345  src -= (stride * 2);
1346
 /* 13 rows in total: 8 output rows plus the 5 extra rows a 6-tap window needs. */
1347  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1348  src += (5 * stride);
1349  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1350  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1351  XORI_B5_128_SB(src0, src1, src2, src3, src4);
 /* srcBA_r: rows A and B byte-interleaved (right half only; 8 pixels wide). */
1352  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1353  src32_r, src43_r);
1354  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1355  src76_r, src87_r);
1356  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1357  src109_r, src1110_r, src1211_r);
 /* One dot product per output row, each over three interleaved row pairs. */
1358  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1359  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1360  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1361  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1362  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1363  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1364  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1365  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
 /* Pair up source rows 2/3, 4/5, 6/7, 8/9 to match the packed output layout. */
1366  PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
1367  PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
1368  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1369  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1370  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1371  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1372  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1373  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 /* Signed average of filtered rows with the paired source rows. */
1374  out0 = __msa_aver_s_b(out0, tmp0);
1375  out1 = __msa_aver_s_b(out1, tmp1);
1376  out2 = __msa_aver_s_b(out2, tmp2);
1377  out3 = __msa_aver_s_b(out3, tmp3);
 /* Undo the XOR-with-128 bias, then store both doublewords of each vector. */
1378  XORI_B4_128_SB(out0, out1, out2, out3);
1379  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1380 }
1381 
1383  ptrdiff_t stride)
1384 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel8_mc03_msa -- confirm against upstream.
  *
  * 8-wide vertical 6-tap filter over 8 rows in one straight-line pass,
  * averaged with the source rows one below each output row (src3..src10).
  */
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1385  const int16_t filt_const0 = 0xfb01;
1386  const int16_t filt_const1 = 0x1414;
1387  const int16_t filt_const2 = 0x1fb;
1388  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1389  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1390  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1391  v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1392  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1393
1394  filt0 = (v16i8) __msa_fill_h(filt_const0);
1395  filt1 = (v16i8) __msa_fill_h(filt_const1);
1396  filt2 = (v16i8) __msa_fill_h(filt_const2);
1397
 /* Start two rows above the first output row; src0 is row -2, src3 is row +1. */
1398  src -= (stride * 2);
1399
 /* 13 rows in total: 8 output rows plus the 5 extra rows a 6-tap window needs. */
1400  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1401  src += (5 * stride);
1402  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1403  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1404  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
 /* srcBA_r: rows A and B byte-interleaved (right half only; 8 pixels wide). */
1405  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1406  src32_r, src43_r);
1407  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1408  src76_r, src87_r);
1409  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1410  src109_r, src1110_r, src1211_r);
 /* One dot product per output row, each over three interleaved row pairs. */
1411  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1412  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1413  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1414  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1415  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1416  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1417  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1418  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
 /* Pair up source rows 3/4, 5/6, 7/8, 9/10 to match the packed output layout. */
1419  PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
1420  PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
1421  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1422  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1423  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1424  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1425  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1426  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 /* Signed average of filtered rows with the paired source rows. */
1427  out0 = __msa_aver_s_b(out0, tmp0);
1428  out1 = __msa_aver_s_b(out1, tmp1);
1429  out2 = __msa_aver_s_b(out2, tmp2);
1430  out3 = __msa_aver_s_b(out3, tmp3);
 /* Undo the XOR-with-128 bias, then store both doublewords of each vector. */
1431  XORI_B4_128_SB(out0, out1, out2, out3);
1432  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1433 }
1434 
1436  ptrdiff_t stride)
1437 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel4_mc01_msa -- confirm against upstream.
  *
  * 4-wide vertical 6-tap filter over 4 rows, averaged with the source rows
  * at the output positions (src2..src5), written as four 4-byte rows.
  */
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1438  int16_t filt_const0 = 0xfb01;
1439  int16_t filt_const1 = 0x1414;
1440  int16_t filt_const2 = 0x1fb;
1441  v16u8 out;
1442  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1443  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1444  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1445  v8i16 out10, out32;
1446
1447  filt0 = (v16i8) __msa_fill_h(filt_const0);
1448  filt1 = (v16i8) __msa_fill_h(filt_const1);
1449  filt2 = (v16i8) __msa_fill_h(filt_const2);
1450
 /* Start two rows above the first output row; src0 is row -2, src2 is row 0. */
1451  src -= (stride * 2);
1452
1453  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1454  src += (5 * stride);
1455  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1456  src32_r, src43_r);
 /* Combine two interleaved row pairs per vector so one dot product yields two rows. */
1457  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 /*
  * The 128 bias is applied to the combined vectors only; src0..src8
  * themselves stay unbiased, which the final unsigned average relies on.
  */
1458  XORI_B2_128_SB(src2110, src4332);
1459  LD_SB4(src, stride, src5, src6, src7, src8);
1460  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1461  src76_r, src87_r);
1462  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1463  XORI_B2_128_SB(src6554, src8776);
 /* out10 holds output rows 0 and 1, out32 holds rows 2 and 3. */
1464  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1465  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1466  SRARI_H2_SH(out10, out32, 5);
1467  SAT_SH2_SH(out10, out32, 7);
 /* Pack to bytes and undo the 128 bias (macro defined outside this view). */
1468  out = PCKEV_XORI128_UB(out10, out32);
 /* Gather the first word of rows 2,3,4,5 into one vector (names are reused). */
1469  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1470  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1471  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 /* Unsigned average: both operands are unbiased bytes at this point. */
1472  out = __msa_aver_u_b(out, (v16u8) src32_r);
1473  ST_W4(out, 0, 1, 2, 3, dst, stride);
1474 }
1475 
1477  ptrdiff_t stride)
1478 {
 /*
  * NOTE(review): the opening line of this definition (return type, name and
  * leading parameters) is not present in this listing. By position it is
  * presumably ff_put_h264_qpel4_mc03_msa -- confirm against upstream.
  *
  * 4-wide vertical 6-tap filter over 4 rows, averaged with the source rows
  * one below each output row (src3..src6), written as four 4-byte rows.
  */
 /*
  * Each halfword packs two byte taps: 0xfb is -5 as a signed byte and
  * 0x14 is 20. Low/high bytes are (1, -5), (20, 20) and (-5, 1).
  */
1479  int16_t filt_const0 = 0xfb01;
1480  int16_t filt_const1 = 0x1414;
1481  int16_t filt_const2 = 0x1fb;
1482  v16u8 out;
1483  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1484  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1485  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1486  v8i16 out10, out32;
1487
1488  filt0 = (v16i8) __msa_fill_h(filt_const0);
1489  filt1 = (v16i8) __msa_fill_h(filt_const1);
1490  filt2 = (v16i8) __msa_fill_h(filt_const2);
1491
 /* Start two rows above the first output row; src0 is row -2, src3 is row +1. */
1492  src -= (stride * 2);
1493
1494  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1495  src += (5 * stride);
1496  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1497  src32_r, src43_r);
 /* Combine two interleaved row pairs per vector so one dot product yields two rows. */
1498  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
 /*
  * The 128 bias is applied to the combined vectors only; src0..src8
  * themselves stay unbiased, which the final unsigned average relies on.
  */
1499  XORI_B2_128_SB(src2110, src4332);
1500  LD_SB4(src, stride, src5, src6, src7, src8);
1501  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1502  src76_r, src87_r);
1503  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1504  XORI_B2_128_SB(src6554, src8776);
 /* out10 holds output rows 0 and 1, out32 holds rows 2 and 3. */
1505  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1506  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1507  SRARI_H2_SH(out10, out32, 5);
1508  SAT_SH2_SH(out10, out32, 7);
 /* Pack to bytes and undo the 128 bias (macro defined outside this view). */
1509  out = PCKEV_XORI128_UB(out10, out32);
 /* Gather the first word of rows 3,4,5,6 into one vector (names are reused). */
1510  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1511  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1512  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
 /* Unsigned average: both operands are unbiased bytes at this point. */
1513  out = __msa_aver_u_b(out, (v16u8) src32_r);
1514  ST_W4(out, 0, 1, 2, 3, dst, stride);
1515 }
1516 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel16_mc11_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_16x16_msa with src - 2 (two bytes to the left)
 * as the first source pointer and src - 2 * stride (two rows up) as the
 * second.  The helper is defined outside this listing.
 */
1518  ptrdiff_t stride)
1519 {
1520  avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
1521 }
1522 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel16_mc31_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_16x16_msa with src - 2 as the first source
 * pointer and src - 2 * stride + 1 (two rows up, one byte right) as the
 * second.  The helper is defined outside this listing.
 */
1524  ptrdiff_t stride)
1525 {
1526  avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1527 }
1528 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel16_mc13_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_16x16_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride (two rows
 * up) as the second.  The helper is defined outside this listing.
 */
1530  ptrdiff_t stride)
1531 {
1532  avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
1533  stride);
1534 }
1535 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel16_mc33_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_16x16_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride + 1 (two
 * rows up, one byte right) as the second.  The helper is defined outside this
 * listing.
 */
1537  ptrdiff_t stride)
1538 {
1539  avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1540  stride);
1541 }
1542 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel8_mc11_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_8x8_msa with src - 2 (two bytes to the left)
 * as the first source pointer and src - 2 * stride (two rows up) as the
 * second.  The helper is defined outside this listing.
 */
1544  ptrdiff_t stride)
1545 {
1546  avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
1547 }
1548 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel8_mc31_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_8x8_msa with src - 2 as the first source
 * pointer and src - 2 * stride + 1 (two rows up, one byte right) as the
 * second.  The helper is defined outside this listing.
 */
1550  ptrdiff_t stride)
1551 {
1552  avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1553 }
1554 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel8_mc13_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_8x8_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride (two rows
 * up) as the second.  The helper is defined outside this listing.
 */
1556  ptrdiff_t stride)
1557 {
1558  avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
1559 }
1560 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel8_mc33_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_8x8_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride + 1 (two
 * rows up, one byte right) as the second.  The helper is defined outside this
 * listing.
 */
1562  ptrdiff_t stride)
1563 {
1564  avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1565  stride);
1566 }
1567 
1568 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel4_mc11_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_4x4_msa with src - 2 (two bytes to the left)
 * as the first source pointer and src - 2 * stride (two rows up) as the
 * second.  The helper is defined outside this listing.
 */
1570  ptrdiff_t stride)
1571 {
1572  avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
1573 }
1574 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel4_mc31_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_4x4_msa with src - 2 as the first source
 * pointer and src - 2 * stride + 1 (two rows up, one byte right) as the
 * second.  The helper is defined outside this listing.
 */
1576  ptrdiff_t stride)
1577 {
1578  avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1579 }
1580 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel4_mc13_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_4x4_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride (two rows
 * up) as the second.  The helper is defined outside this listing.
 */
1582  ptrdiff_t stride)
1583 {
1584  avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
1585 }
1586 
/*
 * NOTE(review): the signature's first line (return type, name, dst/src) is
 * missing from this listing; by position this looks like
 * ff_put_h264_qpel4_mc33_msa - confirm upstream.
 * Delegates to avc_luma_hv_qrt_4x4_msa with src + stride - 2 (one row down,
 * two bytes left) as the first source pointer and src - 2 * stride + 1 (two
 * rows up, one byte right) as the second.  The helper is defined outside this
 * listing.
 */
1588  ptrdiff_t stride)
1589 {
1590  avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1591  stride);
1592 }
1593 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel16_mc21_msa - confirm against the upstream file.
 *
 * Writes a 16x16 block as two 8-column halves (outer loop), four rows per
 * inner iteration.  Every source row goes through AVC_HORZ_FILTER_SH; the
 * 16-bit row results are then combined vertically with AVC_DOT_SW3_SW and
 * averaged with the rounded horizontal-only result of hz_out2..hz_out5.
 * Since reading starts two rows above src, those are the rows aligned with
 * the four output rows (assuming LD_SB* load consecutive rows).
 */
1595  ptrdiff_t stride)
1596 {
1597  uint8_t *dst_tmp = dst;
      /* Two rows up and two bytes left: context needed by both filter passes. */
1598  const uint8_t *src_tmp = src - (2 * stride) - 2;
1599  uint32_t multiple8_cnt, loop_cnt;
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
1600  const int32_t filt_const0 = 0xfffb0001;
1601  const int32_t filt_const1 = 0x140014;
1602  const int32_t filt_const2 = 0x1fffb;
1603  v16u8 out0, out1;
1604  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1605  v16i8 mask2;
1606  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1607  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1608  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1609  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1610  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1611  v8i16 hz_out87_l, filt0, filt1, filt2;
1612  v4i32 tmp0, tmp1;
1613 
1614  filt0 = (v8i16) __msa_fill_w(filt_const0);
1615  filt1 = (v8i16) __msa_fill_w(filt_const1);
1616  filt2 = (v8i16) __msa_fill_w(filt_const2);
1617 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
1618  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1619 
      /* Two passes, each covering one 8-column half of the block. */
1620  for (multiple8_cnt = 2; multiple8_cnt--;) {
1621  dst = dst_tmp;
1622  src = src_tmp;
1623 
      /* Prime the window with the first five rows. */
1624  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1625  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1626  src += (5 * stride);
1627 
1628  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1629  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1630  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1631  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1632  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1633 
      /* Four iterations x four rows = 16 output rows. */
1634  for (loop_cnt = 4; loop_cnt--;) {
1635  LD_SB4(src, stride, src5, src6, src7, src8);
1636  src += (4 * stride);
1637 
1638  XORI_B4_128_SB(src5, src6, src7, src8);
1639 
1640  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1641  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1642  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1643  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1644 
      /* Interleave vertically adjacent rows (right and left halves) so each
       * dot product sees a pair of rows against a pair of taps. */
1645  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1646  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1647  hz_out43_r);
1648  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1649  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1650  hz_out43_l);
1651  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1652  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1653  hz_out87_r);
1654  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1655  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1656  hz_out87_l);
1657 
      /* dst0/dst2/dst4/dst6: vertically filtered rows 0..3 of this group,
       * narrowed from 32-bit to 16-bit lanes by the even-halfword pack. */
1658  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1659  filt1, filt2);
1660  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1661  filt1, filt2);
1662  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1663  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1664  filt1, filt2);
1665  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1666  filt1, filt2);
1667  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1668  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1669  filt1, filt2);
1670  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1671  filt1, filt2);
1672  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1673  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1674  filt1, filt2);
1675  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1676  filt1, filt2);
1677  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1678 
      /* Horizontal-only result of the rows aligned with the four output
       * rows (hz_out2..hz_out5), rounded by 5 bits and saturated. */
1679  dst1 = __msa_srari_h(hz_out2, 5);
1680  dst3 = __msa_srari_h(hz_out3, 5);
1681  dst5 = __msa_srari_h(hz_out4, 5);
1682  dst7 = __msa_srari_h(hz_out5, 5);
1683  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1684 
      /* Average the two-pass result with the horizontal-only result. */
1685  dst0 = __msa_aver_s_h(dst0, dst1);
1686  dst1 = __msa_aver_s_h(dst2, dst3);
1687  dst2 = __msa_aver_s_h(dst4, dst5);
1688  dst3 = __msa_aver_s_h(dst6, dst7);
1689 
1690  out0 = PCKEV_XORI128_UB(dst0, dst1);
1691  out1 = PCKEV_XORI128_UB(dst2, dst3);
1692  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1693  dst += (4 * stride);
1694 
      /* Slide the five-row window down by four rows for the next group. */
1695  hz_out0 = hz_out4;
1696  hz_out1 = hz_out5;
1697  hz_out2 = hz_out6;
1698  hz_out3 = hz_out7;
1699  hz_out4 = hz_out8;
1700  }
1701 
      /* Move on to the right-hand 8 columns. */
1702  src_tmp += 8;
1703  dst_tmp += 8;
1704  }
1705 }
1706 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel16_mc23_msa - confirm against the upstream file.
 *
 * Writes a 16x16 block as two 8-column halves (outer loop), four rows per
 * inner iteration.  Every source row goes through AVC_HORZ_FILTER_SH; the
 * 16-bit row results are then combined vertically with AVC_DOT_SW3_SW and
 * averaged with the rounded horizontal-only result of hz_out3..hz_out6.
 * Since reading starts two rows above src, those are the rows one below each
 * of the four output rows (assuming LD_SB* load consecutive rows).
 */
1708  ptrdiff_t stride)
1709 {
1710  uint8_t *dst_tmp = dst;
      /* Two rows up and two bytes left: context needed by both filter passes. */
1711  const uint8_t *src_tmp = src - (2 * stride) - 2;
1712  uint32_t multiple8_cnt, loop_cnt;
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
1713  const int32_t filt_const0 = 0xfffb0001;
1714  const int32_t filt_const1 = 0x140014;
1715  const int32_t filt_const2 = 0x1fffb;
1716  v16u8 out0, out1;
1717  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1718  v16i8 mask2;
1719  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1720  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1721  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1722  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1723  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1724  v8i16 hz_out87_l, filt0, filt1, filt2;
1725  v4i32 tmp0, tmp1;
1726 
1727  filt0 = (v8i16) __msa_fill_w(filt_const0);
1728  filt1 = (v8i16) __msa_fill_w(filt_const1);
1729  filt2 = (v8i16) __msa_fill_w(filt_const2);
1730 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
1731  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1732 
      /* Two passes, each covering one 8-column half of the block. */
1733  for (multiple8_cnt = 2; multiple8_cnt--;) {
1734  dst = dst_tmp;
1735  src = src_tmp;
1736 
      /* Prime the window with the first five rows. */
1737  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1738  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1739  src += (5 * stride);
1740 
1741  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1742  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1743  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1744  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1745  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1746 
      /* Four iterations x four rows = 16 output rows. */
1747  for (loop_cnt = 4; loop_cnt--;) {
1748  LD_SB4(src, stride, src5, src6, src7, src8);
1749  src += (4 * stride);
1750 
1751  XORI_B4_128_SB(src5, src6, src7, src8);
1752 
1753  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1754  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1755  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1756  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1757 
      /* Interleave vertically adjacent rows (right and left halves) so each
       * dot product sees a pair of rows against a pair of taps. */
1758  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1759  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1760  hz_out43_r);
1761  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1762  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1763  hz_out43_l);
1764  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1765  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1766  hz_out87_r);
1767  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1768  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1769  hz_out87_l);
1770 
      /* dst0/dst2/dst4/dst6: vertically filtered rows 0..3 of this group,
       * narrowed from 32-bit to 16-bit lanes by the even-halfword pack. */
1771  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1772  filt1, filt2);
1773  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1774  filt1, filt2);
1775  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1776  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1777  filt1, filt2);
1778  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1779  filt1, filt2);
1780  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1781  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1782  filt1, filt2);
1783  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1784  filt1, filt2);
1785  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1786  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1787  filt1, filt2);
1788  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1789  filt1, filt2);
1790  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1791 
      /* Horizontal-only result of the rows one below each output row
       * (hz_out3..hz_out6), rounded by 5 bits and saturated. */
1792  dst1 = __msa_srari_h(hz_out3, 5);
1793  dst3 = __msa_srari_h(hz_out4, 5);
1794  dst5 = __msa_srari_h(hz_out5, 5);
1795  dst7 = __msa_srari_h(hz_out6, 5);
1796  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1797 
      /* Average the two-pass result with the horizontal-only result. */
1798  dst0 = __msa_aver_s_h(dst0, dst1);
1799  dst1 = __msa_aver_s_h(dst2, dst3);
1800  dst2 = __msa_aver_s_h(dst4, dst5);
1801  dst3 = __msa_aver_s_h(dst6, dst7);
1802 
1803  out0 = PCKEV_XORI128_UB(dst0, dst1);
1804  out1 = PCKEV_XORI128_UB(dst2, dst3);
1805  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1806  dst += (4 * stride);
1807 
      /* Slide the five-row window down by four rows for the next group. */
1808  hz_out0 = hz_out4;
1809  hz_out1 = hz_out5;
1810  hz_out2 = hz_out6;
1811  hz_out3 = hz_out7;
1812  hz_out4 = hz_out8;
1813  }
1814 
      /* Move on to the right-hand 8 columns. */
1815  src_tmp += 8;
1816  dst_tmp += 8;
1817  }
1818 }
1819 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel8_mc21_msa - confirm against the upstream file.
 *
 * Writes an 8x8 block as two unrolled groups of four rows.  Thirteen source
 * rows (5 + 4 + 4), starting two rows above and two bytes left of src, go
 * through AVC_HORZ_FILTER_SH; the 16-bit row results are combined vertically
 * with AVC_DOT_SW3_SW and averaged with the rounded horizontal-only result of
 * hz_out2..hz_out9, i.e. the rows aligned with the eight output rows
 * (assuming LD_SB* load consecutive rows).
 */
1821  ptrdiff_t stride)
1822 {
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
1823  const int32_t filt_const0 = 0xfffb0001;
1824  const int32_t filt_const1 = 0x140014;
1825  const int32_t filt_const2 = 0x1fffb;
1826  v16u8 out0, out1;
1827  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1828  v16i8 src11, src12, mask0, mask1, mask2;
1829  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1830  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1831  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1832  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1833  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1834  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1835  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1836  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1837  v4i32 tmp0, tmp1;
1838 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
1839  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1840 
1841  filt0 = (v8i16) __msa_fill_w(filt_const0);
1842  filt1 = (v8i16) __msa_fill_w(filt_const1);
1843  filt2 = (v8i16) __msa_fill_w(filt_const2);
1844 
      /* Two rows up and two bytes left: context needed by both filter passes. */
1845  src -= ((2 * stride) + 2);
1846 
1847  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1848  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1849  src += (5 * stride);
1850 
1851  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1852  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1853  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1854  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1855  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1856 
1857  LD_SB4(src, stride, src5, src6, src7, src8);
1858  src += (4 * stride);
1859  XORI_B4_128_SB(src5, src6, src7, src8);
1860 
1861  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1862  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1863  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1864  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1865 
      /* Interleave vertically adjacent rows before any hz_out* is rounded in
       * place below; the second group reuses hz_out54/65/76/87 from here. */
1866  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1867  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1868  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1870  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1871  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1872  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
1874 
      /* Output rows 0..3: vertical pass over the horizontal results. */
1875  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1876  filt2);
1877  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1878  filt2);
1879  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1880  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1881  filt2);
1882  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1883  filt2);
1884  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1885  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1886  filt2);
1887  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1888  filt2);
1889  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1890  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1891  filt2);
1892  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1893  filt2);
1894  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1895 
      /* Round and saturate the horizontal-only rows 2..5 in place, then
       * average them with the two-pass result. */
1896  SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1897  SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1898 
1899  dst0 = __msa_aver_s_h(dst0, hz_out2);
1900  dst1 = __msa_aver_s_h(dst1, hz_out3);
1901  dst2 = __msa_aver_s_h(dst2, hz_out4);
1902  dst3 = __msa_aver_s_h(dst3, hz_out5);
1903 
1904  out0 = PCKEV_XORI128_UB(dst0, dst1);
1905  out1 = PCKEV_XORI128_UB(dst2, dst3);
1906  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1907  dst += (4 * stride);
1908 
      /* Second group of four output rows. */
1909  LD_SB4(src, stride, src9, src10, src11, src12);
1910  XORI_B4_128_SB(src9, src10, src11, src12);
1911  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
1912  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
1913  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
1914  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /* Naming quirk: hz_out89_* and hz_out910_* are built with the same
       * (later row, earlier row) operand order as hz_out1110_* and
       * hz_out1211_*, despite the reversed digits in their names. */
1915  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1916  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1917  hz_out1211_r);
1918  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1919  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1920  hz_out1211_l);
1921  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1922  filt2);
1923  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1924  filt2);
1925  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1926  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1927  filt2);
1928  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1929  filt2);
1930  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1931  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1932  filt2);
1933  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1934  filt2);
1935  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1936  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1937  filt2);
1938  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1939  filt2);
1940  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1941 
      /* Horizontal-only rows 6..9, rounded and saturated in place. */
1942  SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1943  SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1944 
1945  dst0 = __msa_aver_s_h(dst0, hz_out6);
1946  dst1 = __msa_aver_s_h(dst1, hz_out7);
1947  dst2 = __msa_aver_s_h(dst2, hz_out8);
1948  dst3 = __msa_aver_s_h(dst3, hz_out9);
1949 
1950  out0 = PCKEV_XORI128_UB(dst0, dst1);
1951  out1 = PCKEV_XORI128_UB(dst2, dst3);
1952  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1953 }
1954 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel8_mc23_msa - confirm against the upstream file.
 *
 * Writes an 8x8 block as two unrolled groups of four rows.  Thirteen source
 * rows (5 + 4 + 4), starting two rows above and two bytes left of src, go
 * through AVC_HORZ_FILTER_SH; the 16-bit row results are combined vertically
 * with AVC_DOT_SW3_SW and averaged with the rounded horizontal-only result of
 * hz_out3..hz_out10, i.e. the rows one below each of the eight output rows
 * (assuming LD_SB* load consecutive rows).
 */
1956  ptrdiff_t stride)
1957 {
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
1958  const int32_t filt_const0 = 0xfffb0001;
1959  const int32_t filt_const1 = 0x140014;
1960  const int32_t filt_const2 = 0x1fffb;
1961  v16u8 out0, out1;
1962  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1963  v16i8 src11, src12, mask0, mask1, mask2;
1964  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1965  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1966  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1967  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1968  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1969  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1970  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1971  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1972  v4i32 tmp0, tmp1;
1973 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
1974  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1975 
1976  filt0 = (v8i16) __msa_fill_w(filt_const0);
1977  filt1 = (v8i16) __msa_fill_w(filt_const1);
1978  filt2 = (v8i16) __msa_fill_w(filt_const2);
1979 
      /* Two rows up and two bytes left: context needed by both filter passes. */
1980  src -= ((2 * stride) + 2);
1981 
1982  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1983  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1984  src += (5 * stride);
1985 
1986  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1987  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1988  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1989  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1990  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1991 
1992  LD_SB4(src, stride, src5, src6, src7, src8);
1993  src += (4 * stride);
1994  XORI_B4_128_SB(src5, src6, src7, src8);
1995 
1996  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1997  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1998  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1999  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2000 
      /* Interleave vertically adjacent rows before any hz_out* is rounded in
       * place below; the second group reuses hz_out54/65/76/87 from here. */
2001  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2002  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2003  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2005  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2006  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2007  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
2009 
      /* Output rows 0..3: vertical pass over the horizontal results. */
2010  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2011  filt2);
2012  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2013  filt2);
2014  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2015  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2016  filt2);
2017  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2018  filt2);
2019  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2020  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2021  filt2);
2022  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2023  filt2);
2024  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2025  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2026  filt2);
2027  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2028  filt2);
2029  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2030 
      /* Round and saturate the horizontal-only rows 3..6 in place, then
       * average them with the two-pass result. */
2031  SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2032  SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2033 
2034  dst0 = __msa_aver_s_h(dst0, hz_out3);
2035  dst1 = __msa_aver_s_h(dst1, hz_out4);
2036  dst2 = __msa_aver_s_h(dst2, hz_out5);
2037  dst3 = __msa_aver_s_h(dst3, hz_out6);
2038 
2039  out0 = PCKEV_XORI128_UB(dst0, dst1);
2040  out1 = PCKEV_XORI128_UB(dst2, dst3);
2041  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2042  dst += (4 * stride);
2043 
      /* Second group of four output rows. */
2044  LD_SB4(src, stride, src9, src10, src11, src12);
2045  XORI_B4_128_SB(src9, src10, src11, src12);
2046  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
2047  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
2048  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
2049  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /* Naming quirk: hz_out89_* and hz_out910_* are built with the same
       * (later row, earlier row) operand order as hz_out1110_* and
       * hz_out1211_*, despite the reversed digits in their names.  The
       * interleaves read hz_out8..hz_out10 before they are rounded below. */
2050  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2051  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2052  hz_out1211_r);
2053  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2054  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2055  hz_out1211_l);
2056  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2057  filt2);
2058  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2059  filt2);
2060  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2061  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2062  filt2);
2063  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2064  filt2);
2065  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2066  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2067  filt2);
2068  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2069  filt2);
2070  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2071  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2072  filt2);
2073  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2074  filt2);
2075  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2076 
      /* Horizontal-only rows 7..10, rounded and saturated in place. */
2077  SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2078  SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2079 
2080  dst0 = __msa_aver_s_h(dst0, hz_out7);
2081  dst1 = __msa_aver_s_h(dst1, hz_out8);
2082  dst2 = __msa_aver_s_h(dst2, hz_out9);
2083  dst3 = __msa_aver_s_h(dst3, hz_out10);
2084 
2085  out0 = PCKEV_XORI128_UB(dst0, dst1);
2086  out1 = PCKEV_XORI128_UB(dst2, dst3);
2087  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2088 }
2089 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel4_mc21_msa - confirm against the upstream file.
 *
 * Writes a 4x4 block.  Nine source rows, starting two rows above and two
 * bytes left of src, go through AVC_HORZ_FILTER_SH two rows per call; the
 * 16-bit results are combined vertically with AVC_DOT_SW3_SW and averaged
 * with the rounded horizontal-only result held in hz_out2 and hz_out4.
 * Presumably those hold rows 2-3 and 4-5, i.e. the rows aligned with the four
 * output rows - confirm against the AVC_HORZ_FILTER_SH definition.
 */
2091  ptrdiff_t stride)
2092 {
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
2093  const int32_t filt_const0 = 0xfffb0001;
2094  const int32_t filt_const1 = 0x140014;
2095  const int32_t filt_const2 = 0x1fffb;
2096  v16u8 res;
2097  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2098  v16i8 mask0, mask1, mask2;
2099  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2100  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2101  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2102  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2103  v4i32 tmp0, tmp1;
2104 
      /* Offset 48 selects the "4 width cases" rows of luma_mask_arr, whose
       * upper eight indices (16..24) address a second input vector. */
2105  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2106 
2107  filt0 = (v8i16) __msa_fill_w(filt_const0);
2108  filt1 = (v8i16) __msa_fill_w(filt_const1);
2109  filt2 = (v8i16) __msa_fill_w(filt_const2);
2110 
      /* Two rows up and two bytes left: context needed by both filter passes. */
2111  src -= ((2 * stride) + 2);
2112 
2113  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2114  src += (5 * stride);
2115  LD_SB4(src, stride, src5, src6, src7, src8);
2116 
2117  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2118  XORI_B4_128_SB(src5, src6, src7, src8);
2119 
      /* Each call filters a pair of rows; the odd row of each pair is then
       * split out of the upper doubleword by PCKOD_D2_SH (judging by the
       * macro name; its definition is outside this listing). */
2120  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2121  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2122  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2123  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2124  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2125  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2126  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2127 
2128  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2129  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2130  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2131  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2132 
      /* dst0 packs output rows 0 and 1, dst1 packs output rows 2 and 3. */
2133  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2134  filt2);
2135  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2136  filt2);
2137  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2138  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2139  filt2);
2140  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2141  filt2);
2142  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2143 
      /* hz_out2 and hz_out4 still hold their row pairs (2-3 and 4-5), which
       * line up with dst0 and dst1; round and saturate them in place. */
2144  SRARI_H2_SH(hz_out2, hz_out4, 5);
2145  SAT_SH2_SH(hz_out2, hz_out4, 7);
2146 
2147  dst0 = __msa_aver_s_h(dst0, hz_out2);
2148  dst1 = __msa_aver_s_h(dst1, hz_out4);
2149 
2150  res = PCKEV_XORI128_UB(dst0, dst1);
2151  ST_W4(res, 0, 1, 2, 3, dst, stride);
2152 }
2153 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel4_mc23_msa - confirm against the upstream file.
 *
 * Writes a 4x4 block.  Nine source rows, starting two rows above and two
 * bytes left of src, go through AVC_HORZ_FILTER_SH two rows per call; the
 * 16-bit results are combined vertically with AVC_DOT_SW3_SW and averaged
 * with the rounded horizontal-only result repacked from hz_out3..hz_out6.
 * Presumably those are rows 3..6, i.e. the rows one below each of the four
 * output rows - confirm against the AVC_HORZ_FILTER_SH definition.
 */
2155  ptrdiff_t stride)
2156 {
      /* Each 32-bit constant packs two signed 16-bit taps:
       * 0xfffb0001 -> {1, -5}, 0x00140014 -> {20, 20}, 0x0001fffb -> {-5, 1}. */
2157  const int32_t filt_const0 = 0xfffb0001;
2158  const int32_t filt_const1 = 0x140014;
2159  const int32_t filt_const2 = 0x1fffb;
2160  v16u8 res;
2161  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2162  v16i8 mask0, mask1, mask2;
2163  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2164  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2165  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2166  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2167  v4i32 tmp0, tmp1;
2168 
      /* Offset 48 selects the "4 width cases" rows of luma_mask_arr, whose
       * upper eight indices (16..24) address a second input vector. */
2169  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2170 
2171  filt0 = (v8i16) __msa_fill_w(filt_const0);
2172  filt1 = (v8i16) __msa_fill_w(filt_const1);
2173  filt2 = (v8i16) __msa_fill_w(filt_const2);
2174 
      /* Two rows up and two bytes left: context needed by both filter passes. */
2175  src -= ((2 * stride) + 2);
2176 
2177  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2178  src += (5 * stride);
2179  LD_SB4(src, stride, src5, src6, src7, src8);
2180 
2181  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2182  XORI_B4_128_SB(src5, src6, src7, src8);
2183 
      /* Each call filters a pair of rows; the odd row of each pair is then
       * split out of the upper doubleword by PCKOD_D2_SH (judging by the
       * macro name; its definition is outside this listing). */
2184  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2185  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2186  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2187  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2188  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2189  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2190  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2191 
2192  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2193  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2194  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2195  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2196 
      /* dst0 packs output rows 0 and 1, dst1 packs output rows 2 and 3. */
2197  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2198  filt2);
2199  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2200  filt2);
2201  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2202  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2203  filt2);
2204  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2205  filt2);
2206  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2207 
      /* Repack rows 3-4 into hz_out0 and rows 5-6 into hz_out1 so they line
       * up with dst0 and dst1; hz_out0/hz_out1 are free to reuse because the
       * interleaves above have already consumed their old contents. */
2208  PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
2209  SRARI_H2_SH(hz_out0, hz_out1, 5);
2210  SAT_SH2_SH(hz_out0, hz_out1, 7);
2211 
2212  dst0 = __msa_aver_s_h(dst0, hz_out0);
2213  dst1 = __msa_aver_s_h(dst1, hz_out1);
2214 
2215  res = PCKEV_XORI128_UB(dst0, dst1);
2216  ST_W4(res, 0, 1, 2, 3, dst, stride);
2217 }
2218 
/*
 * NOTE(review): the line holding the return type, the function name and the
 * dst/src parameters is absent from this listing; by position this looks like
 * ff_put_h264_qpel16_mc02_msa - confirm against the upstream file.
 *
 * Writes a 16x16 block using only a vertical pass: rows are read starting two
 * rows above src, neighbouring rows are byte-interleaved (right and left
 * halves) and combined with AVC_DOT_SH3_SH, then rounded, saturated, packed
 * to bytes and stored four rows per loop iteration.  No averaging with any
 * other sample is performed here.
 */
2220  ptrdiff_t stride)
2221 {
2222  int32_t loop_cnt;
      /* Each 16-bit constant packs two signed 8-bit taps:
       * 0xfb01 -> {1, -5}, 0x1414 -> {20, 20}, 0x01fb -> {-5, 1}. */
2223  int16_t filt_const0 = 0xfb01;
2224  int16_t filt_const1 = 0x1414;
2225  int16_t filt_const2 = 0x1fb;
2226  v16u8 res0, res1, res2, res3;
2227  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2228  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2229  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2230  v16i8 src65_l, src87_l, filt0, filt1, filt2;
2231  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2232 
2233  filt0 = (v16i8) __msa_fill_h(filt_const0);
2234  filt1 = (v16i8) __msa_fill_h(filt_const1);
2235  filt2 = (v16i8) __msa_fill_h(filt_const2);
      /* Start two rows above the block so the filter has its upper context. */
2236  src -= (stride * 2);
2237 
      /* Prime the window with the first five rows. */
2238  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2239  src += (5 * stride);
2240 
2241  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2242  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2243  src32_r, src43_r);
2244  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2245  src32_l, src43_l);
2246 
      /* Four iterations x four rows = 16 output rows. */
2247  for (loop_cnt = 4; loop_cnt--;) {
2248  LD_SB4(src, stride, src5, src6, src7, src8);
2249  src += (4 * stride);
2250 
2251  XORI_B4_128_SB(src5, src6, src7, src8);
2252  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2253  src65_r, src76_r, src87_r);
2254  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2255  src65_l, src76_l, src87_l);
      /* _r results cover the right (low) half of each row, _l the left
       * (high) half; both are needed for the full 16 columns. */
2256  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2257  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2258  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2259  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2260  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2261  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2262  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2263  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2264  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2265  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2266  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2267  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2268  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2269  out3_r, res0, res1, res2, res3);
      /* Undo the xor-128 bias applied to the inputs before storing. */
2270  XORI_B4_128_UB(res0, res1, res2, res3);
2271  ST_UB4(res0, res1, res2, res3, dst, stride);
2272  dst += (4 * stride);
2273 
      /* Carry the last four interleaved row pairs (and the last row) forward
       * so the next iteration only has to load four new rows. */
2274  src10_r = src54_r;
2275  src32_r = src76_r;
2276  src21_r = src65_r;
2277  src43_r = src87_r;
2278  src10_l = src54_l;
2279  src32_l = src76_l;
2280  src21_l = src65_l;
2281  src43_l = src87_l;
2282  src4 = src8;
2283  }
2284 }
2285 
/*
 * Vertical 6-tap luma filter; all eight result rows are written with one
 * ST_D8 call at the end.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 *
 * Every output row is built from six consecutive input rows, the first one
 * lying two rows above `src`.  13 input rows are loaded in total.
 */
2287  ptrdiff_t stride)
2288 {
 /* Each constant packs two signed byte taps into one halfword:
  * 0xfb01 -> (0x01, 0xfb = -5), 0x1414 -> (20, 20), 0x01fb -> (0xfb = -5, 0x01).
  * Together they form the tap set 1, -5, 20, 20, -5, 1. */
2289  const int16_t filt_const0 = 0xfb01;
2290  const int16_t filt_const1 = 0x1414;
2291  const int16_t filt_const2 = 0x1fb;
2292  v16u8 out0, out1, out2, out3;
2293  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294  v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2295  v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2296  v16i8 filt0, filt1, filt2;
2297  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2298 
2299  filt0 = (v16i8) __msa_fill_h(filt_const0);
2300  filt1 = (v16i8) __msa_fill_h(filt_const1);
2301  filt2 = (v16i8) __msa_fill_h(filt_const2);
2302 
 /* Step back two rows so the filter window starts above the block. */
2303  src -= (stride * 2);
2304 
2305  LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2306  src += (8 * stride);
2307  LD_SB5(src, stride, src8, src9, src10, src11, src12);
 /* Interleave each row with the row above it.  Beware of the names in the
  * second and third group: src76_r..src109_r receive the pairs
  * (src5,src4)..(src8,src7), and src89_r..src1211_r receive
  * (src9,src8)..(src12,src11). */
2308  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2309  src32_r, src43_r);
2310  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2311  src98_r, src109_r);
2312  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2313  src910_r, src1110_r, src1211_r);
 /* XOR with 128 (per the macro name); presumably maps the unsigned pixels
  * into signed byte range for the signed dot products -- TODO confirm. */
2314  XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
2315  XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
2316  XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
 /* One call per output row: three row pairs, two rows apart, each paired
  * with filt0/filt1/filt2. */
2317  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2318  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2319  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2320  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2321  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2322  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2323  out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2324  out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
 /* Rounding shift right by 5, then saturation (macro argument 7). */
2325  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2326  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2327  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2328  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 /* Pack two result rows per vector and undo the earlier XOR (per the
  * macro name). */
2329  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
2330  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
2331  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
2332  out3 = PCKEV_XORI128_UB(out6_r, out7_r);
2333  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
2334 }
2335 
/*
 * Vertical 6-tap luma filter for a narrow block; the result is written as
 * four words with a single ST_W4 call.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 *
 * Nine input rows are loaded, beginning two rows above `src`.
 */
2337  ptrdiff_t stride)
2338 {
 /* Each constant packs two signed byte taps into one halfword:
  * 0xfb01 -> (0x01, 0xfb = -5), 0x1414 -> (20, 20), 0x01fb -> (0xfb = -5, 0x01). */
2339  const int16_t filt_const0 = 0xfb01;
2340  const int16_t filt_const1 = 0x1414;
2341  const int16_t filt_const2 = 0x1fb;
2342  v16u8 out;
2343  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2344  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2345  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2346  v8i16 out10, out32;
2347 
2348  filt0 = (v16i8) __msa_fill_h(filt_const0);
2349  filt1 = (v16i8) __msa_fill_h(filt_const1);
2350  filt2 = (v16i8) __msa_fill_h(filt_const2);
2351 
 /* Step back two rows so the filter window starts above the block. */
2352  src -= (stride * 2);
2353 
2354  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2355  src += (5 * stride);
2356  LD_SB4(src, stride, src5, src6, src7, src8);
2357 
 /* Interleave each row with the row above it. */
2358  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2359  src32_r, src43_r);
2360  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2361  src76_r, src87_r);
 /* Merge two neighbouring row pairs into one vector (e.g. src21_r with
  * src10_r -> src2110), so that each dot product below yields two output
  * rows at once. */
2362  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2363  src76_r, src2110, src4332, src6554, src8776);
 /* XOR with 128 (per the macro name); presumably maps the unsigned pixels
  * into signed byte range -- TODO confirm. */
2364  XORI_B4_128_SB(src2110, src4332, src6554, src8776);
2365  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2366  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 /* Rounding shift right by 5, then saturation (macro argument 7). */
2367  SRARI_H2_SH(out10, out32, 5);
2368  SAT_SH2_SH(out10, out32, 7);
2369  out = PCKEV_XORI128_UB(out10, out32);
2370  ST_W4(out, 0, 1, 2, 3, dst, stride);
2371 }
2372 
/*
 * Two-dimensional luma interpolation, 16 bytes stored per row for 16 rows:
 * a vertical 6-tap pass, a horizontal 6-tap pass over the vertical
 * intermediates, and a final average of that result with the rounded
 * vertical-only value taken from the EVEN halfword of each centre pair.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2374  ptrdiff_t stride)
2375 {
2376  uint32_t row;
2377  v16u8 out;
2378  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2379  v16i8 src11;
2380  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2381  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2382  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2383  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2384  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2385  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2386  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2387  v8i16 minus5h = __msa_ldi_h(-5);
2388  v8i16 plus20h = __msa_ldi_h(20);
2389 
 /* Same tap pairs, shifted by four columns. */
2390  mask3 = mask0 + 4;
2391  mask4 = mask1 + 4;
2392  mask5 = mask2 + 4;
2393 
 /* Start two rows above and two columns left of the block. */
2394  src -= ((2 * stride) + 2);
2395 
 /* Prime the vertical window with five rows for each 8-byte-offset load. */
2396  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2397  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2398  src += (5 * stride);
2399  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2400  XORI_B5_128_SB(src7, src8, src9, src10, src11);
2401 
 /* One output row per iteration. */
2402  for (row = 16; row--;) {
 /* Second argument is the step between the two loads: src5 comes from
  * src, src6 from src + 8 (same row). */
2403  LD_SB2(src, 8, src5, src6);
2404  src += stride;
2405  XORI_B2_128_SB(src5, src6);
2406 
 /* Vertical pass: the macro sums the outer rows and accumulates the
  * inner pairs weighted by -5 and +20, giving 16-bit intermediates. */
2407  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2408  vt_res0, vt_res1);
2409  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2410  vt_res2, vt_res3);
 /* Gather the horizontal tap pairs from the vertical intermediates. */
2411  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2412  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2413  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2414  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2415  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2416  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2417  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2418  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32-bit lanes: outer pair summed with weight 1,
  * then the -5 and +20 pairs accumulated on top. */
2419  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2420  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2421  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2422  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2423  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2424  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2425  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2426  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two filter passes -> rounding shift by 10, then saturate. */
2427  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2428  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /* Vertical-only values: the centre pairs (mask2/mask5) rounded by 5. */
2429  dst0 = __msa_srari_h(shf_vec2, 5);
2430  dst1 = __msa_srari_h(shf_vec5, 5);
2431  dst2 = __msa_srari_h(shf_vec8, 5);
2432  dst3 = __msa_srari_h(shf_vec11, 5);
2433  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the even halfword of every centre pair, i.e. the first column
  * listed in mask2/mask5. */
2434  PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2435  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the 2-D result with the vertical-only value. */
2436  dst0 = __msa_aver_s_h(dst2, dst0);
2437  dst1 = __msa_aver_s_h(dst3, dst1);
2438  out = PCKEV_XORI128_UB(dst0, dst1);
2439  ST_UB(out, dst);
2440  dst += stride;
2441 
 /* Slide both vertical windows down by one row. */
2442  src0 = src1;
2443  src1 = src2;
2444  src2 = src3;
2445  src3 = src4;
2446  src4 = src5;
2447  src7 = src8;
2448  src8 = src9;
2449  src9 = src10;
2450  src10 = src11;
2451  src11 = src6;
2452  }
2453 }
2454 
/*
 * Two-dimensional luma interpolation, 16 bytes stored per row for 16 rows:
 * a vertical 6-tap pass, a horizontal 6-tap pass over the vertical
 * intermediates, and a final average of that result with the rounded
 * vertical-only value taken from the ODD halfword of each centre pair
 * (__msa_pckod_h), i.e. one column further right than the even-halfword
 * variant of this routine.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2456  ptrdiff_t stride)
2457 {
2458  uint32_t row;
2459  v16u8 out;
2460  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2461  v16i8 src11;
2462  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2463  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2464  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2465  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2466  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2467  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2468  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2469  v8i16 minus5h = __msa_ldi_h(-5);
2470  v8i16 plus20h = __msa_ldi_h(20);
2471 
 /* Same tap pairs, shifted by four columns. */
2472  mask3 = mask0 + 4;
2473  mask4 = mask1 + 4;
2474  mask5 = mask2 + 4;
2475 
 /* Start two rows above and two columns left of the block. */
2476  src -= ((2 * stride) + 2);
2477 
 /* Prime the vertical window with five rows for each 8-byte-offset load. */
2478  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2479  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2480  src += (5 * stride);
2481  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2482  XORI_B5_128_SB(src7, src8, src9, src10, src11);
2483 
 /* One output row per iteration. */
2484  for (row = 16; row--;) {
 /* Second argument is the step between the two loads: src5 comes from
  * src, src6 from src + 8 (same row). */
2485  LD_SB2(src, 8, src5, src6);
2486  src += stride;
2487  XORI_B2_128_SB(src5, src6);
2488 
 /* Vertical pass: the macro sums the outer rows and accumulates the
  * inner pairs weighted by -5 and +20, giving 16-bit intermediates. */
2489  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2490  vt_res0, vt_res1);
2491  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2492  vt_res2, vt_res3);
 /* Gather the horizontal tap pairs from the vertical intermediates. */
2493  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2494  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2495  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2496  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2497  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2498  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2499  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2500  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32-bit lanes: outer pair summed with weight 1,
  * then the -5 and +20 pairs accumulated on top. */
2501  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2502  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2503  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2504  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2505  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2506  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2507  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2508  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two filter passes -> rounding shift by 10, then saturate. */
2509  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2510  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /* Vertical-only values: the centre pairs (mask2/mask5) rounded by 5. */
2511  dst0 = __msa_srari_h(shf_vec2, 5);
2512  dst1 = __msa_srari_h(shf_vec5, 5);
2513  dst2 = __msa_srari_h(shf_vec8, 5);
2514  dst3 = __msa_srari_h(shf_vec11, 5);
2515  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the odd halfword of every centre pair, i.e. the second column
  * listed in mask2/mask5. */
2516  dst0 = __msa_pckod_h(dst2, dst0);
2517  dst1 = __msa_pckod_h(dst3, dst1);
2518  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the 2-D result with the vertical-only value. */
2519  dst0 = __msa_aver_s_h(dst2, dst0);
2520  dst1 = __msa_aver_s_h(dst3, dst1);
2521  out = PCKEV_XORI128_UB(dst0, dst1);
2522  ST_UB(out, dst);
2523  dst += stride;
2524 
 /* Slide both vertical windows down by one row. */
2525  src0 = src1;
2526  src1 = src2;
2527  src2 = src3;
2528  src3 = src4;
2529  src4 = src5;
2530  src7 = src8;
2531  src8 = src9;
2532  src9 = src10;
2533  src10 = src11;
2534  src11 = src6;
2535  }
2536 }
2537 
/*
 * Two-dimensional luma interpolation producing two rows per loop iteration
 * over four iterations (ST_D2 stores): a vertical 6-tap pass, a horizontal
 * 6-tap pass over the vertical intermediates, and a final average of that
 * result with the rounded vertical-only value taken from the EVEN halfword
 * of each centre pair.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2539  ptrdiff_t stride)
2540 {
2541  uint32_t row;
2542  v16u8 out;
2543  v16i8 src0, src1, src2, src3, src4, src5, src6;
2544  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2545  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2546  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2547  v8i16 mask3, mask4, mask5;
2548  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2549  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2550  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2551  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2552  v8i16 minus5h = __msa_ldi_h(-5);
2553  v8i16 plus20h = __msa_ldi_h(20);
2554 
 /* Same tap pairs, shifted by four columns. */
2555  mask3 = mask0 + 4;
2556  mask4 = mask1 + 4;
2557  mask5 = mask2 + 4;
2558 
 /* Start two rows above and two columns left of the block. */
2559  src -= ((2 * stride) + 2);
2560 
 /* Prime the vertical window with five rows. */
2561  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2562  src += (5 * stride);
2563  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2564 
 /* Two output rows per iteration. */
2565  for (row = 4; row--;) {
2566  LD_SB2(src, stride, src5, src6);
2567  src += (2 * stride);
2568  XORI_B2_128_SB(src5, src6);
2569 
 /* Vertical pass for two consecutive rows; the second window is the
  * first shifted down by one row.  The macro sums the outer rows and
  * accumulates the inner pairs weighted by -5 and +20. */
2570  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2571  vt_res0, vt_res1);
2572  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2573  vt_res2, vt_res3);
 /* Gather the horizontal tap pairs from the vertical intermediates. */
2574  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2575  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2576  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2577  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2578  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2579  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2580  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2581  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32-bit lanes: outer pair summed with weight 1,
  * then the -5 and +20 pairs accumulated on top. */
2582  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2583  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2584  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2585  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2586  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2587  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2588  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2589  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two filter passes -> rounding shift by 10, then saturate. */
2590  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2591  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /* Vertical-only values: the centre pairs (mask2/mask5) rounded by 5. */
2592  dst0 = __msa_srari_h(shf_vec2, 5);
2593  dst1 = __msa_srari_h(shf_vec5, 5);
2594  dst2 = __msa_srari_h(shf_vec8, 5);
2595  dst3 = __msa_srari_h(shf_vec11, 5);
2596  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the even halfword of every centre pair, i.e. the first column
  * listed in mask2/mask5. */
2597  PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2598  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the 2-D result with the vertical-only value. */
2599  dst0 = __msa_aver_s_h(dst2, dst0);
2600  dst1 = __msa_aver_s_h(dst3, dst1);
2601  out = PCKEV_XORI128_UB(dst0, dst1);
2602  ST_D2(out, 0, 1, dst, stride);
2603  dst += (2 * stride);
2604 
 /* Slide the vertical window down by two rows. */
2605  src0 = src2;
2606  src1 = src3;
2607  src2 = src4;
2608  src3 = src5;
2609  src4 = src6;
2610  }
2611 }
2612 
/*
 * Two-dimensional luma interpolation producing two rows per loop iteration
 * over four iterations (ST_D2 stores): a vertical 6-tap pass, a horizontal
 * 6-tap pass over the vertical intermediates, and a final average of that
 * result with the rounded vertical-only value taken from the ODD halfword
 * of each centre pair (__msa_pckod_h), i.e. one column further right than
 * the even-halfword variant of this routine.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2614  ptrdiff_t stride)
2615 {
2616  uint32_t row;
2617  v16u8 out;
2618  v16i8 src0, src1, src2, src3, src4, src5, src6;
2619  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2620  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2621  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2622  v8i16 mask3, mask4, mask5;
2623  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2624  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2625  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2626  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2627  v8i16 minus5h = __msa_ldi_h(-5);
2628  v8i16 plus20h = __msa_ldi_h(20);
2629 
 /* Same tap pairs, shifted by four columns. */
2630  mask3 = mask0 + 4;
2631  mask4 = mask1 + 4;
2632  mask5 = mask2 + 4;
2633 
 /* Start two rows above and two columns left of the block. */
2634  src -= ((2 * stride) + 2);
2635 
 /* Prime the vertical window with five rows. */
2636  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2637  src += (5 * stride);
2638  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2639 
 /* Two output rows per iteration. */
2640  for (row = 4; row--;) {
2641  LD_SB2(src, stride, src5, src6);
2642  src += (2 * stride);
2643  XORI_B2_128_SB(src5, src6);
2644 
 /* Vertical pass for two consecutive rows; the second window is the
  * first shifted down by one row.  The macro sums the outer rows and
  * accumulates the inner pairs weighted by -5 and +20. */
2645  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2646  vt_res0, vt_res1);
2647  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2648  vt_res2, vt_res3);
 /* Gather the horizontal tap pairs from the vertical intermediates. */
2649  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2650  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2651  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2652  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2653  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2654  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2655  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2656  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32-bit lanes: outer pair summed with weight 1,
  * then the -5 and +20 pairs accumulated on top. */
2657  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2658  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2659  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2660  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2661  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2662  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2663  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2664  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two filter passes -> rounding shift by 10, then saturate. */
2665  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2666  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /* Vertical-only values: the centre pairs (mask2/mask5) rounded by 5. */
2667  dst0 = __msa_srari_h(shf_vec2, 5);
2668  dst1 = __msa_srari_h(shf_vec5, 5);
2669  dst2 = __msa_srari_h(shf_vec8, 5);
2670  dst3 = __msa_srari_h(shf_vec11, 5);
2671  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the odd halfword of every centre pair, i.e. the second column
  * listed in mask2/mask5. */
2672  dst0 = __msa_pckod_h(dst2, dst0);
2673  dst1 = __msa_pckod_h(dst3, dst1);
2674  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the 2-D result with the vertical-only value. */
2675  dst0 = __msa_aver_s_h(dst2, dst0);
2676  dst1 = __msa_aver_s_h(dst3, dst1);
2677  out = PCKEV_XORI128_UB(dst0, dst1);
2678  ST_D2(out, 0, 1, dst, stride);
2679  dst += (2 * stride);
2680 
 /* Slide the vertical window down by two rows. */
2681  src0 = src2;
2682  src1 = src3;
2683  src2 = src4;
2684  src3 = src5;
2685  src4 = src6;
2686  }
2687 }
2688 
/*
 * Two-dimensional luma interpolation for a narrow block, written as four
 * words with one ST_W4 call.  The vertical 6-tap pass uses byte dot
 * products (AVC_DOT_SH3_SH), the horizontal pass runs on the 16-bit
 * intermediates in 32-bit lanes, and the result is averaged with the
 * rounded vertical-only value taken from the EVEN halfword of each centre
 * pair (ILVEV_H2_SH against zeros).
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2690  ptrdiff_t stride)
2691 {
 /* Each constant packs two signed byte taps into one halfword:
  * 0xfb01 -> (0x01, 0xfb = -5), 0x1414 -> (20, 20), 0x01fb -> (0xfb = -5, 0x01). */
2692  const int16_t filt_const0 = 0xfb01;
2693  const int16_t filt_const1 = 0x1414;
2694  const int16_t filt_const2 = 0x1fb;
2695  v16u8 out;
2696  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2697  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2698  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2699  v16i8 src76_l, src87_l, filt0, filt1, filt2;
2700  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2701  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2702  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2703  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2704  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2705  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2706  v8i16 minus5h = __msa_ldi_h(-5);
2707  v8i16 plus20h = __msa_ldi_h(20);
2708  v8i16 zeros = { 0 };
2709 
2710  filt0 = (v16i8) __msa_fill_h(filt_const0);
2711  filt1 = (v16i8) __msa_fill_h(filt_const1);
2712  filt2 = (v16i8) __msa_fill_h(filt_const2);
2713 
 /* Start two rows above and two columns left of the block. */
2714  src -= ((2 * stride) + 2);
2715 
 /* Nine input rows in total. */
2716  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2717  src += (5 * stride);
2718  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2719  LD_SB4(src, stride, src5, src6, src7, src8);
2720  XORI_B4_128_SB(src5, src6, src7, src8);
2721 
 /* Interleave each row with the row above it; the _r and _l sets come
  * from the two halves of the vectors (ILVR / ILVL). */
2722  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2723  src32_r, src43_r);
2724  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2725  src76_r, src87_r);
2726  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2727  src32_l, src43_l);
2728  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2729  src76_l, src87_l);
 /* Output rows 0 and 1: vertical pass, tap gathering, horizontal pass. */
2730  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2731  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2732  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2733  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2734  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2735  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2736  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2737  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2738  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2739  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2740  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2741  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2742 
 /* Output rows 2 and 3.  shf_vec0/1/3/4 are reused as scratch, while the
  * centre pairs go to shf_vec6/shf_vec7 so shf_vec2/shf_vec5 stay intact
  * for the averaging step below. */
2743  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2744  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2745  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2746  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2747  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2748  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2749  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2750  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2751  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2752  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2753  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2754  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2755 
 /* Two filter passes -> rounding shift by 10, then saturate. */
2756  SRARI_W2_SW(hz_res0, hz_res1, 10);
2757  SAT_SW2_SW(hz_res0, hz_res1, 7);
2758  SRARI_W2_SW(hz_res2, hz_res3, 10);
2759  SAT_SW2_SW(hz_res2, hz_res3, 7);
2760 
 /* Vertical-only values: the centre pairs rounded by 5. */
2761  dst0 = __msa_srari_h(shf_vec2, 5);
2762  dst1 = __msa_srari_h(shf_vec5, 5);
2763  dst2 = __msa_srari_h(shf_vec6, 5);
2764  dst3 = __msa_srari_h(shf_vec7, 5);
2765 
2766  SAT_SH2_SH(dst0, dst1, 7);
2767  SAT_SH2_SH(dst2, dst3, 7);
 /* Presumably places the even halfword of each pair into a 32-bit lane
  * alongside a zero halfword, so it can be averaged lane-for-lane with
  * hz_res* -- TODO confirm against the ILVEV_H2_SH definition. */
2768  ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2769  ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
2770 
 /* Average the 2-D result with the vertical-only value. */
2771  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2772  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2773  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2774  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2775 
2776  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2777  out = PCKEV_XORI128_UB(dst0, dst2);
2778  ST_W4(out, 0, 1, 2, 3, dst, stride);
2779 }
2780 
/*
 * Two-dimensional luma interpolation for a narrow block, written as four
 * words with one ST_W4 call.  The vertical 6-tap pass uses byte dot
 * products (AVC_DOT_SH3_SH), the horizontal pass runs on the 16-bit
 * intermediates in 32-bit lanes, and the result is averaged with the
 * rounded vertical-only value taken from the ODD halfword of each centre
 * pair (__msa_ilvod_h against zeros), i.e. one column further right than
 * the even-halfword variant of this routine.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 */
2782  ptrdiff_t stride)
2783 {
 /* Each constant packs two signed byte taps into one halfword:
  * 0xfb01 -> (0x01, 0xfb = -5), 0x1414 -> (20, 20), 0x01fb -> (0xfb = -5, 0x01). */
2784  const int16_t filt_const0 = 0xfb01;
2785  const int16_t filt_const1 = 0x1414;
2786  const int16_t filt_const2 = 0x1fb;
2787  v16u8 out;
2788  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2789  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2790  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2791  v16i8 src76_l, src87_l, filt0, filt1, filt2;
2792  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2793  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2794  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /* Halfword shuffle indices pairing the horizontal taps:
  * mask0 -> columns (n, n+5), mask1 -> (n+1, n+4), mask2 -> (n+2, n+3). */
2795  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2796  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2797  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2798  v8i16 minus5h = __msa_ldi_h(-5);
2799  v8i16 plus20h = __msa_ldi_h(20);
2800  v8i16 zeros = { 0 };
2801 
2802  filt0 = (v16i8) __msa_fill_h(filt_const0);
2803  filt1 = (v16i8) __msa_fill_h(filt_const1);
2804  filt2 = (v16i8) __msa_fill_h(filt_const2);
2805 
 /* Start two rows above and two columns left of the block. */
2806  src -= ((2 * stride) + 2);
2807 
 /* Nine input rows in total. */
2808  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2809  src += (5 * stride);
2810  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2811  LD_SB4(src, stride, src5, src6, src7, src8);
2812  XORI_B4_128_SB(src5, src6, src7, src8);
2813 
 /* Interleave each row with the row above it; the _r and _l sets come
  * from the two halves of the vectors (ILVR / ILVL). */
2814  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2815  src32_r, src43_r);
2816  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2817  src76_r, src87_r);
2818  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2819  src32_l, src43_l);
2820  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2821  src76_l, src87_l);
2822 
 /* Output rows 0 and 1: vertical pass, tap gathering, horizontal pass. */
2823  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2824  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2825  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2826  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2827  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2828  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2829  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2830  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2831  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2832  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2833  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2834  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2835 
 /* Output rows 2 and 3.  shf_vec0/1/3/4 are reused as scratch, while the
  * centre pairs go to shf_vec6/shf_vec7 so shf_vec2/shf_vec5 stay intact
  * for the averaging step below. */
2836  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2837  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2838  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2839  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2840  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2841  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2842  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2843  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2844  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2845  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2846  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2847  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2848 
 /* Two filter passes -> rounding shift by 10, then saturate. */
2849  SRARI_W2_SW(hz_res0, hz_res1, 10);
2850  SAT_SW2_SW(hz_res0, hz_res1, 7);
2851  SRARI_W2_SW(hz_res2, hz_res3, 10);
2852  SAT_SW2_SW(hz_res2, hz_res3, 7);
2853 
 /* Vertical-only values: the centre pairs rounded by 5. */
2854  dst0 = __msa_srari_h(shf_vec2, 5);
2855  dst1 = __msa_srari_h(shf_vec5, 5);
2856  dst2 = __msa_srari_h(shf_vec6, 5);
2857  dst3 = __msa_srari_h(shf_vec7, 5);
2858 
2859  SAT_SH2_SH(dst0, dst1, 7);
2860  SAT_SH2_SH(dst2, dst3, 7);
2861 
 /* Presumably places the odd halfword of each pair into a 32-bit lane
  * alongside a zero halfword, so it can be averaged lane-for-lane with
  * hz_res* -- TODO confirm against the MSA ilvod.h description. */
2862  dst0 = __msa_ilvod_h(zeros, dst0);
2863  dst1 = __msa_ilvod_h(zeros, dst1);
2864  dst2 = __msa_ilvod_h(zeros, dst2);
2865  dst3 = __msa_ilvod_h(zeros, dst3);
2866 
 /* Average the 2-D result with the vertical-only value. */
2867  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2868  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2869  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2870  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2871 
2872  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2873  out = PCKEV_XORI128_UB(dst0, dst2);
2874  ST_W4(out, 0, 1, 2, 3, dst, stride);
2875 }
2876 
/*
 * Two-dimensional luma interpolation with the horizontal pass first
 * (AVC_HORZ_FILTER_SH) and the vertical pass second (AVC_DOT_SW3_SW on the
 * 16-bit horizontal results).  The block is processed as two 8-column
 * halves; each half runs four iterations of four rows.
 *
 * NOTE(review): the opening line of this definition (return type, function
 * name and leading parameters) is absent from this listing; only the trailing
 * `ptrdiff_t stride)` survives.  The body reads from `src` and writes to
 * `dst`, both addressed with `stride`.
 *
 * NOTE(review): no explicit rounding shift or saturation appears in this
 * body; presumably AVC_DOT_SW3_SW performs them -- confirm against the
 * macro definition.
 */
2878  ptrdiff_t stride)
2879 {
 /* Each constant packs two signed halfword taps into one word:
  * 0xfffb0001 -> (0x0001, 0xfffb = -5), 0x00140014 -> (20, 20),
  * 0x0001fffb -> (0xfffb = -5, 0x0001). */
2880  const int32_t filt_const0 = 0xfffb0001;
2881  const int32_t filt_const1 = 0x140014;
2882  const int32_t filt_const2 = 0x1fffb;
 /* Origin of the filter window: two rows up, two columns left. */
2883  const uint8_t *src_tmp = src - (2 * stride) - 2;
2884  uint8_t *dst_tmp = dst;
2885  uint32_t multiple8_cnt, loop_cnt;
2886  v16u8 out0, out1;
2887  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2888  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2889  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2890  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2891  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2892  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2893  v8i16 hz_out87_l, filt0, filt1, filt2;
2894  v4i32 tmp0, tmp1;
2895 
2896  filt0 = (v8i16) __msa_fill_w(filt_const0);
2897  filt1 = (v8i16) __msa_fill_w(filt_const1);
2898  filt2 = (v8i16) __msa_fill_w(filt_const2);
2899 
 /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
2900  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2901 
 /* Two passes, one per 8-column half. */
2902  for (multiple8_cnt = 2; multiple8_cnt--;) {
2903  src = src_tmp;
2904  dst = dst_tmp;
2905 
 /* Prime the vertical window with five horizontally filtered rows. */
2906  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2907  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2908  src += (5 * stride);
2909 
2910  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2911  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2912  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2913  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2914  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2915 
 /* Four output rows per iteration. */
2916  for (loop_cnt = 4; loop_cnt--;) {
2917  LD_SB4(src, stride, src0, src1, src2, src3);
2918  XORI_B4_128_SB(src0, src1, src2, src3);
2919  src += (4 * stride);
2920 
2921  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2922  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2923  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2924  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2925 
 /* Interleave each filtered row with the one above it, both halves. */
2926  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2927  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2928  hz_out43_r);
2929  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2930  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2931  hz_out43_l);
2932  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2933  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2934  hz_out87_r);
2935  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2936  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
2937  hz_out87_l);
2938 
 /* Vertical pass: one _r/_l pair of dot products per output row,
  * narrowed back to halfwords with pckev. */
2939  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2940  filt1, filt2);
2941  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2942  filt1, filt2);
2943  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2944  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2945  filt1, filt2);
2946  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2947  filt1, filt2);
2948  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2949  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2950  filt1, filt2);
2951  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2952  filt1, filt2);
2953  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2954  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2955  filt1, filt2);
2956  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2957  filt1, filt2);
2958  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2959 
2960  out0 = PCKEV_XORI128_UB(dst0, dst1);
2961  out1 = PCKEV_XORI128_UB(dst2, dst3);
2962  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
2963  dst += (4 * stride);
2964 
 /* Slide the window down four rows: the last five filtered rows
  * become the first five of the next iteration. */
2965  hz_out0 = hz_out4;
2966  hz_out1 = hz_out5;
2967  hz_out2 = hz_out6;
2968  hz_out3 = hz_out7;
2969  hz_out4 = hz_out8;
2970  }
2971 
 /* Move on to the next 8-column half. */
2972  src_tmp += 8;
2973  dst_tmp += 8;
2974  }
2975 }
2976 
2978  ptrdiff_t stride)
2979 {
2980  const int32_t filt_const0 = 0xfffb0001;
2981  const int32_t filt_const1 = 0x140014;
2982  const int32_t filt_const2 = 0x1fffb;
2983  v16u8 out0, out1;
2984  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2985  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2986  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2987  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2988  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2989  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2990  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2991  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2992  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2993  v4i32 tmp0, tmp1;
2994 
2995  filt0 = (v8i16) __msa_fill_w(filt_const0);
2996  filt1 = (v8i16) __msa_fill_w(filt_const1);
2997  filt2 = (v8i16) __msa_fill_w(filt_const2);
2998 
2999  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3000 
3001  src -= ((2 * stride) + 2);
3002  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3003  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3004  src += (5 * stride);
3005 
3006  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3007  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3008  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3009  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3010  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3011 
3012  LD_SB4(src, stride, src0, src1, src2, src3);
3013  XORI_B4_128_SB(src0, src1, src2, src3);
3014  src += (4 * stride);
3015  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3016  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3017  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3018  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3019  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3020  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3021  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3023  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3024  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3025  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3027 
3028  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3029  filt2);
3030  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3031  filt2);
3032  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3033  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3034  filt2);
3035  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3036  filt2);
3037  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3038  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3039  filt2);
3040  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3041  filt2);
3042  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3043  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3044  filt2);
3045  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3046  filt2);
3047  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3048  out0 = PCKEV_XORI128_UB(dst0, dst1);
3049  out1 = PCKEV_XORI128_UB(dst2, dst3);
3050  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
3051  dst += (4 * stride);
3052 
3053  LD_SB4(src, stride, src0, src1, src2, src3);
3054  XORI_B4_128_SB(src0, src1, src2, src3);
3055  hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3056  hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3057  hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3058  hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3059  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3060  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3061  hz_out1211_r);
3062  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3063  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3064  hz_out1211_l);
3065  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3066  filt2);
3067  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3068  filt2);
3069  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3070  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3071  filt2);
3072  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3073  filt2);
3074  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3075  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3076  filt2);
3077  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3078  filt2);
3079  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3080  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3081  filt2);
3082  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3083  filt2);
3084  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3085  out0 = PCKEV_XORI128_UB(dst0, dst1);
3086  out1 = PCKEV_XORI128_UB(dst2, dst3);
3087  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
3088 }
3089 
3091  ptrdiff_t stride)
3092 {
 /*
  * Four-row routine: loads nine source rows, runs AVC_HORZ_FILTER_SH over
  * them, combines the per-row results with AVC_DOT_SW3_SW and hands the
  * packed result to ST_W4 (presumably one 32-bit word per dst row).
  * The existing contents of dst are never read here.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
 /* The three constants pack the values 1, -5, 20, 20, -5, 1 as 16-bit halves. */
3093  const int32_t filt_const0 = 0xfffb0001; /* low half 1, high half -5 */
3094  const int32_t filt_const1 = 0x140014; /* both halves 20 */
3095  const int32_t filt_const2 = 0x1fffb; /* low half -5, high half 1 */
3096  v16u8 res;
3097  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3098  v16i8 mask0, mask1, mask2;
3099  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3100  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3101  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3102  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3103  v4i32 tmp0, tmp1;
3104 
 /* Offset 48 skips the three 16-byte "8 width cases" rows of luma_mask_arr,
  * selecting its "4 width cases" shuffle masks. */
3105  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3106 
 /* Replicate each packed 32-bit constant across a whole vector. */
3107  filt0 = (v8i16) __msa_fill_w(filt_const0);
3108  filt1 = (v8i16) __msa_fill_w(filt_const1);
3109  filt2 = (v8i16) __msa_fill_w(filt_const2);
3110 
 /* Start two rows above and two bytes before the position passed in. */
3111  src -= ((2 * stride) + 2);
3112 
 /* Nine rows in total: five here, four more after advancing src. */
3113  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3114  src += (5 * stride);
3115  LD_SB4(src, stride, src5, src6, src7, src8);
3116 
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3117  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3118  XORI_B4_128_SB(src5, src6, src7, src8);
 /* Each call is given a pair of rows; the ninth row (src8) is passed twice. */
3119  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3120  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3121  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3122  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3123  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
 /* Derive the odd-numbered hz_out vectors from the even-numbered ones
  * (presumably the upper 64 bits hold the second row of each pair). */
3124  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3125  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
 /* Pair up consecutive hz_out vectors: (1,0), (2,1), ... (8,7). */
3126  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3127  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3128  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3129  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3130 
 /* Each AVC_DOT_SW3_SW result draws on three row pairs that together cover
  * six consecutive hz_out rows: 0-5, 1-6, 2-7 and 3-8. */
3131  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3132  filt2);
3133  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3134  filt2);
3135  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3136  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3137  filt2);
3138  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3139  filt2);
3140  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
 /* Pack both halves to bytes (the macro name suggests it also undoes the
  * earlier xor with 128) and store to four rows of dst. */
3141  res = PCKEV_XORI128_UB(dst0, dst1);
3142  ST_W4(res, 0, 1, 2, 3, dst, stride);
3143 }
3144 
3146  ptrdiff_t stride)
3147 {
 /*
  * Processes 16 rows as four loop iterations of four rows each. For every
  * row, two vectors are loaded (LD_SB2 with a step of 16), shuffled with the
  * luma_mask_arr masks and accumulated with the byte coefficients -5 and 20.
  * The packed result is averaged with the source data shifted by 2 bytes
  * (SLDI_B4_SB), then averaged with the rows already in dst, and stored
  * back to dst.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3148  uint32_t loop_cnt;
3149  v16u8 dst0, dst1, dst2, dst3;
3150  v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3151  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3152  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3153  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3154  v16i8 minus5b = __msa_ldi_b(-5);
3155  v16i8 plus20b = __msa_ldi_b(20);
3156 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3157  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* masks 3-5 are masks 0-2 with 8 added to every index; below they are
  * applied to the (first, second) vector pair of each row. */
3158  mask3 = mask0 + 8;
3159  mask4 = mask1 + 8;
3160  mask5 = mask2 + 8;
 /* Start two bytes before the position passed in. */
3161  src -= 2;
3162 
3163  for (loop_cnt = 4; loop_cnt--;) {
 /* Four source rows per iteration, two vectors per row. */
3164  LD_SB2(src, 16, src0, src1);
3165  src += stride;
3166  LD_SB2(src, 16, src2, src3);
3167  src += stride;
3168  LD_SB2(src, 16, src4, src5);
3169  src += stride;
3170  LD_SB2(src, 16, src6, src7);
3171  src += stride;
3172 
 /* Current contents of the four destination rows, used for the final
  * averaging step. */
3173  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3174  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0 and 1 of this iteration -> res0..res3. */
3175  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3176  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3177  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3178  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3179  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3180  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3181  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3182  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3183  minus5b, res0, res1, res2, res3);
3184  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3185  plus20b, res0, res1, res2, res3);
 /* Rows 2 and 3 of this iteration -> res4..res7 (vec0..vec11 are reused). */
3186  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3187  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3188  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3189  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3190  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3191  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3192  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3193  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3194  minus5b, res4, res5, res6, res7);
3195  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3196  plus20b, res4, res5, res6, res7);
 /* Overwrite src0/2/4/6 with the row data shifted by 2 bytes; since src was
  * moved back by 2 above, this presumably lines up with the position passed
  * in. The byte offset (2) is the only literal that differs from the
  * otherwise identical routine that follows in this file. */
3197  SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
3198  src0, src2, src4, src6);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3199  SRARI_H4_SH(res0, res1, res2, res3, 5);
3200  SRARI_H4_SH(res4, res5, res6, res7, 5);
3201  SAT_SH4_SH(res0, res1, res2, res3, 7);
3202  SAT_SH4_SH(res4, res5, res6, res7, 7);
3203  PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3204  PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
 /* Average the filtered bytes with the shifted source bytes (signed). */
3205  out0 = __msa_aver_s_b(out0, src0);
3206  out1 = __msa_aver_s_b(out1, src2);
3207  out2 = __msa_aver_s_b(out2, src4);
3208  out3 = __msa_aver_s_b(out3, src6);
 /* Second xor with 128, then average with the existing dst rows and store. */
3209  XORI_B4_128_SB(out0, out1, out2, out3);
3210  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3211  AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3212  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3213  dst += (4 * stride);
3214  }
3215 }
3216 
3218  ptrdiff_t stride)
3219 {
 /*
  * Processes 16 rows as four loop iterations of four rows each. For every
  * row, two vectors are loaded (LD_SB2 with a step of 16), shuffled with the
  * luma_mask_arr masks and accumulated with the byte coefficients -5 and 20.
  * The packed result is averaged with the source data shifted by 3 bytes
  * (SLDI_B4_SB), then averaged with the rows already in dst, and stored
  * back to dst.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3220  uint32_t loop_cnt;
3221  v16u8 dst0, dst1, dst2, dst3;
3222  v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3223  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3224  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3225  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3226  v16i8 minus5b = __msa_ldi_b(-5);
3227  v16i8 plus20b = __msa_ldi_b(20);
3228 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3229  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* masks 3-5 are masks 0-2 with 8 added to every index; below they are
  * applied to the (first, second) vector pair of each row. */
3230  mask3 = mask0 + 8;
3231  mask4 = mask1 + 8;
3232  mask5 = mask2 + 8;
 /* Start two bytes before the position passed in. */
3233  src -= 2;
3234 
3235  for (loop_cnt = 4; loop_cnt--;) {
 /* Four source rows per iteration, two vectors per row. */
3236  LD_SB2(src, 16, src0, src1);
3237  src += stride;
3238  LD_SB2(src, 16, src2, src3);
3239  src += stride;
3240  LD_SB2(src, 16, src4, src5);
3241  src += stride;
3242  LD_SB2(src, 16, src6, src7);
3243  src += stride;
3244 
 /* Current contents of the four destination rows, used for the final
  * averaging step. */
3245  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3246  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0 and 1 of this iteration -> res0..res3. */
3247  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3248  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3249  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3250  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3251  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3252  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3253  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3254  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3255  minus5b, res0, res1, res2, res3);
3256  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3257  plus20b, res0, res1, res2, res3);
 /* Rows 2 and 3 of this iteration -> res4..res7 (vec0..vec11 are reused). */
3258  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3259  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3260  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3261  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3262  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3263  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3264  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3265  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3266  minus5b, res4, res5, res6, res7);
3267  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3268  plus20b, res4, res5, res6, res7);
 /* Overwrite src0/2/4/6 with the row data shifted by 3 bytes; since src was
  * moved back by 2 above, this presumably lines up with one byte past the
  * position passed in. The byte offset (3) is the only literal that differs
  * from the otherwise identical routine that precedes this one in the file. */
3269  SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
3270  src0, src2, src4, src6);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3271  SRARI_H4_SH(res0, res1, res2, res3, 5);
3272  SRARI_H4_SH(res4, res5, res6, res7, 5);
3273  SAT_SH4_SH(res0, res1, res2, res3, 7);
3274  SAT_SH4_SH(res4, res5, res6, res7, 7);
3275  PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3276  PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
 /* Average the filtered bytes with the shifted source bytes (signed). */
3277  out0 = __msa_aver_s_b(out0, src0);
3278  out1 = __msa_aver_s_b(out1, src2);
3279  out2 = __msa_aver_s_b(out2, src4);
3280  out3 = __msa_aver_s_b(out3, src6);
 /* Second xor with 128, then average with the existing dst rows and store. */
3281  XORI_B4_128_SB(out0, out1, out2, out3);
3282  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3283  AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3284  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3285  dst += (4 * stride);
3286  }
3287 }
3288 
3290  ptrdiff_t stride)
3291 {
 /*
  * Eight-row routine without a loop: loads eight rows starting at src - 2,
  * shuffles them with the luma_mask_arr masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with the source
  * data shifted by 2 bytes, then with eight 64-bit values read from dst,
  * and written back through ST_D8.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3292  uint64_t tp0, tp1, tp2, tp3;
3293  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3294  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3295  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3296  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3297  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3298  v16i8 minus5b = __msa_ldi_b(-5);
3299  v16i8 plus20b = __msa_ldi_b(20);
3300 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3301  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Eight rows, each starting two bytes before the position passed in. */
3302  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3303  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0-3 -> res0..res3: mask0 pairs are summed, mask1 pairs are
  * accumulated with -5, mask2 pairs with 20. */
3304  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3305  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3306  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3307  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3308  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3309  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3310  res0, res1, res2, res3);
3311  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3312  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3313  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3314  res0, res1, res2, res3);
 /* Rows 4-7 -> res4..res7, same sequence with the vec temporaries reused. */
3315  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3316  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3317  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3318  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3319  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3320  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3321  res4, res5, res6, res7);
3322  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3323  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3324  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3325  res4, res5, res6, res7);
 /* Shift every row by 2 bytes (undoing the "src - 2" load offset,
  * presumably), then pack two rows into each of src0, src1, src4, src5.
  * The byte offset (2) is the only literal that differs from the otherwise
  * identical routine that follows in this file. */
3326  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
3327  src0, src1, src2, src3);
3328  SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
3329  src4, src5, src6, src7);
3330  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3331  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3332  SRARI_H4_SH(res0, res1, res2, res3, 5);
3333  SRARI_H4_SH(res4, res5, res6, res7, 5);
3334  SAT_SH4_SH(res0, res1, res2, res3, 7);
3335  SAT_SH4_SH(res4, res5, res6, res7, 7);
3336  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3337  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
 /* Average the filtered bytes with the shifted source bytes (signed). */
3338  tmp0 = __msa_aver_s_b(tmp0, src0);
3339  tmp1 = __msa_aver_s_b(tmp1, src1);
3340  tmp2 = __msa_aver_s_b(tmp2, src4);
3341  tmp3 = __msa_aver_s_b(tmp3, src5);
3342  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
 /* Read eight 64-bit values from dst (rows 0-3, then rows 4-7), two per
  * vector, average them with the result and store. */
3343  LD4(dst, stride, tp0, tp1, tp2, tp3);
3344  INSERT_D2_UB(tp0, tp1, dst0);
3345  INSERT_D2_UB(tp2, tp3, dst1);
3346  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3347  INSERT_D2_UB(tp0, tp1, dst2);
3348  INSERT_D2_UB(tp2, tp3, dst3);
3349  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3350  AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3351  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3352 }
3353 
3355  ptrdiff_t stride)
3356 {
 /*
  * Eight-row routine without a loop: loads eight rows starting at src - 2,
  * shuffles them with the luma_mask_arr masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with the source
  * data shifted by 3 bytes, then with eight 64-bit values read from dst,
  * and written back through ST_D8.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3357  uint64_t tp0, tp1, tp2, tp3;
3358  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3359  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3360  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3361  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3362  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3363  v16i8 minus5b = __msa_ldi_b(-5);
3364  v16i8 plus20b = __msa_ldi_b(20);
3365 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3366  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Eight rows, each starting two bytes before the position passed in. */
3367  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3368  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0-3 -> res0..res3: mask0 pairs are summed, mask1 pairs are
  * accumulated with -5, mask2 pairs with 20. */
3369  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3370  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3371  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3372  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3373  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3374  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3375  res0, res1, res2, res3);
3376  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3377  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3378  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3379  res0, res1, res2, res3);
 /* Rows 4-7 -> res4..res7, same sequence with the vec temporaries reused. */
3380  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3381  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3382  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3383  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3384  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3385  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3386  res4, res5, res6, res7);
3387  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3388  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3389  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3390  res4, res5, res6, res7);
 /* Shift every row by 3 bytes (presumably one byte past the position passed
  * in, given the "src - 2" load offset), then pack two rows into each of
  * src0, src1, src4, src5. The byte offset (3) is the only literal that
  * differs from the otherwise identical routine that precedes this one. */
3391  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
3392  src0, src1, src2, src3);
3393  SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
3394  src4, src5, src6, src7);
3395  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3396  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3397  SRARI_H4_SH(res0, res1, res2, res3, 5);
3398  SRARI_H4_SH(res4, res5, res6, res7, 5);
3399  SAT_SH4_SH(res0, res1, res2, res3, 7);
3400  SAT_SH4_SH(res4, res5, res6, res7, 7);
3401  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3402  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
 /* Average the filtered bytes with the shifted source bytes (signed). */
3403  tmp0 = __msa_aver_s_b(tmp0, src0);
3404  tmp1 = __msa_aver_s_b(tmp1, src1);
3405  tmp2 = __msa_aver_s_b(tmp2, src4);
3406  tmp3 = __msa_aver_s_b(tmp3, src5);
3407  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
 /* Read eight 64-bit values from dst (rows 0-3, then rows 4-7), two per
  * vector, average them with the result and store. */
3408  LD4(dst, stride, tp0, tp1, tp2, tp3);
3409  INSERT_D2_UB(tp0, tp1, dst0);
3410  INSERT_D2_UB(tp2, tp3, dst1);
3411  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3412  INSERT_D2_UB(tp0, tp1, dst2);
3413  INSERT_D2_UB(tp2, tp3, dst3);
3414  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3415  AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3416  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3417 }
3418 
3420  ptrdiff_t stride)
3421 {
 /*
  * Four-row routine: loads four rows starting at src - 2, shuffles two rows
  * at a time with the "4 width cases" masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with the source
  * data shifted by 2 bytes, then with four 32-bit values read from dst, and
  * written back through ST_W4.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3422  uint32_t tp0, tp1, tp2, tp3;
3423  v16u8 dst0 = { 0 };
3424  v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3425  v16i8 mask0, mask1, mask2;
3426  v8i16 out0, out1;
3427  v16i8 minus5b = __msa_ldi_b(-5);
3428  v16i8 plus20b = __msa_ldi_b(20);
3429 
 /* Offset 48 skips the three 16-byte "8 width cases" rows of luma_mask_arr. */
3430  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
 /* Four rows, each starting two bytes before the position passed in. */
3431  LD_SB4(src - 2, stride, src0, src1, src2, src3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3432  XORI_B4_128_SB(src0, src1, src2, src3);
 /* Rows (0,1) -> out0 and rows (2,3) -> out1: mask0 pairs are summed,
  * mask1 pairs accumulated with -5, mask2 pairs with 20. */
3433  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3434  HADD_SB2_SH(vec0, vec1, out0, out1);
3435  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3436  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3437  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3438  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3439  SRARI_H2_SH(out0, out1, 5);
3440  SAT_SH2_SH(out0, out1, 7);
3441  res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
 /* Shift every row by 2 bytes (undoing the "src - 2" load offset,
  * presumably). The byte offset (2) is the only literal that differs from
  * the otherwise identical routine that follows in this file. */
3442  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
3443  src0, src1, src2, src3);
 /* Gather one 32-bit word from each of the four shifted rows into src0. */
3444  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3445  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3446  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
 /* Average with the gathered source bytes (signed), xor with 128 again. */
3447  res = __msa_aver_s_b(res, src0);
3448  res = (v16i8) __msa_xori_b((v16u8) res, 128);
 /* Read one 32-bit value from each of four dst rows, average, store. */
3449  LW4(dst, stride, tp0, tp1, tp2, tp3);
3450  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3451  dst0 = __msa_aver_u_b((v16u8) res, dst0);
3452  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3453 }
3454 
3456  ptrdiff_t stride)
3457 {
 /*
  * Four-row routine: loads four rows starting at src - 2, shuffles two rows
  * at a time with the "4 width cases" masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with the source
  * data shifted by 3 bytes, then with four 32-bit values read from dst, and
  * written back through ST_W4.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3458  uint32_t tp0, tp1, tp2, tp3;
3459  v16u8 dst0 = { 0 };
3460  v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3461  v16i8 mask0, mask1, mask2;
3462  v8i16 out0, out1;
3463  v16i8 minus5b = __msa_ldi_b(-5);
3464  v16i8 plus20b = __msa_ldi_b(20);
3465 
 /* Offset 48 skips the three 16-byte "8 width cases" rows of luma_mask_arr. */
3466  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
 /* Four rows, each starting two bytes before the position passed in. */
3467  LD_SB4(src - 2, stride, src0, src1, src2, src3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3468  XORI_B4_128_SB(src0, src1, src2, src3);
 /* Rows (0,1) -> out0 and rows (2,3) -> out1: mask0 pairs are summed,
  * mask1 pairs accumulated with -5, mask2 pairs with 20. */
3469  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3470  HADD_SB2_SH(vec0, vec1, out0, out1);
3471  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3472  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3473  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3474  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3475  SRARI_H2_SH(out0, out1, 5);
3476  SAT_SH2_SH(out0, out1, 7);
3477  res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
 /* Shift every row by 3 bytes (presumably one byte past the position passed
  * in, given the "src - 2" load offset). The byte offset (3) is the only
  * literal that differs from the otherwise identical routine that precedes
  * this one in the file. */
3478  SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
3479  src0, src1, src2, src3);
 /* Gather one 32-bit word from each of the four shifted rows into src0. */
3480  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3481  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3482  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
 /* Average with the gathered source bytes (signed), xor with 128 again. */
3483  res = __msa_aver_s_b(res, src0);
3484  res = (v16i8) __msa_xori_b((v16u8) res, 128);
 /* Read one 32-bit value from each of four dst rows, average, store. */
3485  LW4(dst, stride, tp0, tp1, tp2, tp3);
3486  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3487  dst0 = __msa_aver_u_b((v16u8) res, dst0);
3488  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3489 }
3490 
3492  ptrdiff_t stride)
3493 {
 /*
  * Processes 16 rows as four loop iterations of four rows each. For every
  * row, two vectors are loaded (LD_SB2 with a step of 8), shuffled with the
  * luma_mask_arr masks and accumulated with the byte coefficients -5 and 20.
  * The packed result is averaged with the rows already in dst and stored
  * back; unlike the routines that also call __msa_aver_s_b, no averaging
  * with the unfiltered source takes place here.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3494  uint32_t loop_cnt;
3495  v16u8 dst0, dst1, dst2, dst3;
3496  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3497  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3498  v16i8 vec11;
3499  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3500  v16i8 minus5b = __msa_ldi_b(-5);
3501  v16i8 plus20b = __msa_ldi_b(20);
3502 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3503  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
 /* Start two bytes before the position passed in. */
3504  src -= 2;
3505 
3506  for (loop_cnt = 4; loop_cnt--;) {
 /* Four source rows per iteration; each row is loaded as two vectors
  * (step 8), each processed with the same 8-width masks. */
3507  LD_SB2(src, 8, src0, src1);
3508  src += stride;
3509  LD_SB2(src, 8, src2, src3);
3510  src += stride;
3511  LD_SB2(src, 8, src4, src5);
3512  src += stride;
3513  LD_SB2(src, 8, src6, src7);
3514  src += stride;
3515 
 /* Current contents of the four destination rows, used for averaging. */
3516  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3517  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0 and 1 of this iteration -> res0..res3. */
3518  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
3519  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3520  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
3521  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3522  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
3523  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
3524  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3525  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3526  minus5b, res0, res1, res2, res3);
3527  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3528  plus20b, res0, res1, res2, res3);
 /* Rows 2 and 3 of this iteration -> res4..res7 (vec0..vec11 are reused). */
3529  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3530  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3531  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3532  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3533  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3534  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3535  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3536  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3537  minus5b, res4, res5, res6, res7);
3538  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3539  plus20b, res4, res5, res6, res7);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3540  SRARI_H4_SH(res0, res1, res2, res3, 5);
3541  SRARI_H4_SH(res4, res5, res6, res7, 5);
3542  SAT_SH4_SH(res0, res1, res2, res3, 7);
3543  SAT_SH4_SH(res4, res5, res6, res7, 7);
 /* Pack to bytes (vec0..vec3 now hold one output row each), xor with 128
  * again, average with the existing dst rows and store. */
3544  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
3545  vec2, vec3);
3546  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
3547  AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
3548  AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
3549  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3550  dst += (4 * stride);
3551  }
3552 }
3553 
3555  ptrdiff_t stride)
3556 {
 /*
  * Eight-row routine without a loop: loads eight rows starting at src - 2,
  * shuffles them with the luma_mask_arr masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with eight 64-bit
  * values read from dst and written back through ST_D8; no averaging with
  * the unfiltered source takes place here.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3557  uint64_t tp0, tp1, tp2, tp3;
 /* out2/out3/out6/out7 receive the existing dst data; the others hold the
  * filtered result. */
3558  v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
3559  v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
3560  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3561  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3562  v16i8 vec11;
3563  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3564  v16i8 minus5b = __msa_ldi_b(-5);
3565  v16i8 plus20b = __msa_ldi_b(20);
3566 
 /* First three rows of luma_mask_arr: the "8 width cases" masks. */
3567  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3568 
 /* Eight rows, each starting two bytes before the position passed in. */
3569  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3570  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
 /* Rows 0-3 -> res0..res3: mask0 pairs are summed, mask1 pairs are
  * accumulated with -5, mask2 pairs with 20. */
3571  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3572  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3573  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3574  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3575  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3576  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3577  res0, res1, res2, res3);
3578  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3579  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3580  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3581  res0, res1, res2, res3);
 /* Rows 4-7 -> res4..res7, same sequence with the vec temporaries reused. */
3582  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3583  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3584  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3585  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3586  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3587  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3588  res4, res5, res6, res7);
3589  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3590  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3591  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3592  res4, res5, res6, res7);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3593  SRARI_H4_SH(res0, res1, res2, res3, 5);
3594  SRARI_H4_SH(res4, res5, res6, res7, 5);
3595  SAT_SH4_SH(res0, res1, res2, res3, 7);
3596  SAT_SH4_SH(res4, res5, res6, res7, 7);
 /* Pack two result rows per vector (the macro name suggests it also undoes
  * the earlier xor with 128). */
3597  out0 = PCKEV_XORI128_UB(res0, res1);
3598  out1 = PCKEV_XORI128_UB(res2, res3);
3599  out4 = PCKEV_XORI128_UB(res4, res5);
3600  out5 = PCKEV_XORI128_UB(res6, res7);
 /* Read eight 64-bit values from dst (rows 0-3, then rows 4-7), two per
  * vector, average them with the result and store. */
3601  LD4(dst, stride, tp0, tp1, tp2, tp3);
3602  INSERT_D2_UB(tp0, tp1, out2);
3603  INSERT_D2_UB(tp2, tp3, out3);
3604  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3605  INSERT_D2_UB(tp0, tp1, out6);
3606  INSERT_D2_UB(tp2, tp3, out7);
3607  AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
3608  AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
3609  ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3610 }
3611 
3613  ptrdiff_t stride)
3614 {
 /*
  * Four-row routine: loads four rows starting at src - 2, shuffles two rows
  * at a time with the "4 width cases" masks and accumulates with the byte
  * coefficients -5 and 20. The packed result is averaged with four 32-bit
  * values read from dst and written back through ST_W4; no averaging with
  * the unfiltered source takes place here.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3615  uint32_t tp0, tp1, tp2, tp3;
3616  v16u8 res, dst0 = { 0 };
3617  v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
3618  v16i8 mask0, mask1, mask2;
3619  v8i16 res0, res1;
3620  v16i8 minus5b = __msa_ldi_b(-5);
3621  v16i8 plus20b = __msa_ldi_b(20);
3622 
 /* Offset 48 skips the three 16-byte "8 width cases" rows of luma_mask_arr. */
3623  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
 /* Four rows, each starting two bytes before the position passed in. */
3624  LD_SB4(src - 2, stride, src0, src1, src2, src3);
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3625  XORI_B4_128_SB(src0, src1, src2, src3);
 /* Rows (0,1) -> res0 and rows (2,3) -> res1: mask0 pairs are summed,
  * mask1 pairs accumulated with -5, mask2 pairs with 20. */
3626  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3627  HADD_SB2_SH(vec0, vec1, res0, res1);
3628  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3629  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3630  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3631  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3632  SRARI_H2_SH(res0, res1, 5);
3633  SAT_SH2_SH(res0, res1, 7);
3634  res = PCKEV_XORI128_UB(res0, res1);
 /* Read one 32-bit value from each of four dst rows, average, store. */
3635  LW4(dst, stride, tp0, tp1, tp2, tp3);
3636  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3637  res = __msa_aver_u_b(res, dst0);
3638  ST_W4(res, 0, 1, 2, 3, dst, stride);
3639 }
3640 
3642  ptrdiff_t stride)
3643 {
 /*
  * Column-direction routine over 16 rows, four rows per loop iteration.
  * Five rows are loaded up front starting two rows above src; every
  * iteration loads four more, interleaves adjacent rows byte-wise and feeds
  * three interleaved pairs at a time to AVC_DOT_SH3_SH. The packed result is
  * averaged with source rows src2..src5, then with the rows already in dst,
  * and stored back to dst.
  *
  * NOTE(review): the first line of this signature (return type, name and
  * leading parameters) is missing from this listing; `dst` and `src` are
  * used below as the store and load pointers - confirm against the full file.
  */
3644  int32_t loop_cnt;
 /* The three constants pack the values 1, -5, 20, 20, -5, 1 as byte pairs.
  * NOTE(review): 0xfb01 does not fit in int16_t, so this initialisation
  * relies on implementation-defined conversion. */
3645  int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5) */
3646  int16_t filt_const1 = 0x1414; /* both bytes 20 */
3647  int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5), high byte 1 */
3648  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3649  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3650  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3651  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3652  v16i8 src65_l, src87_l, filt0, filt1, filt2;
3653  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3654 
 /* Replicate each packed 16-bit constant across a whole vector. */
3655  filt0 = (v16i8) __msa_fill_h(filt_const0);
3656  filt1 = (v16i8) __msa_fill_h(filt_const1);
3657  filt2 = (v16i8) __msa_fill_h(filt_const2);
3658 
 /* Start two rows above the position passed in, so src2 below is the row
  * at the incoming src. */
3659  src -= (stride * 2);
3660 
3661  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3662  src += (5 * stride);
3663 
 /* Presumably flips bit 7 of every byte so the samples can be handled as
  * signed values - TODO confirm against the XORI_* macro definitions. */
3664  XORI_B5_128_SB(src0, src1, src2, src3, src4);
 /* Interleave adjacent rows; the _r and _l sets come from the ILVR and ILVL
  * macros respectively (presumably right and left vector halves). */
3665  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3666  src32_r, src43_r);
3667  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3668  src32_l, src43_l);
3669 
3670  for (loop_cnt = 4; loop_cnt--;) {
 /* Four new rows extend the window to nine rows (src0..src8). */
3671  LD_SB4(src, stride, src5, src6, src7, src8);
3672  src += (4 * stride);
3673 
3674  XORI_B4_128_SB(src5, src6, src7, src8);
3675  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3676  src65_r, src76_r, src87_r);
3677  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3678  src65_l, src76_l, src87_l);
 /* Output row k uses the pairs (k+1,k), (k+3,k+2), (k+5,k+4), i.e. six
  * consecutive rows of the window. */
3679  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3680  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3681  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3682  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3683  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3684  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3685  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3686  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 /* Presumably a rounding right shift by 5 followed by saturation to the
  * signed 8-bit range - confirm against the macro definitions. */
3687  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3688  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3689  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3690  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3691  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3692  out3_r, res0, res1, res2, res3);
 /* Average each filtered row with the source row at the same window
  * position as the output row (src2..src5), signed. */
3693  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3694  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3695  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3696  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
 /* Xor with 128 again, average with the existing dst rows and store. */
3697  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3698  XORI_B4_128_UB(res0, res1, res2, res3);
3699  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3700  AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3701  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3702  dst += (4 * stride);
3703 
 /* Slide the window down four rows: the last four interleaved pairs and the
  * last three source rows become the first ones of the next iteration. */
3704  src10_r = src54_r;
3705  src32_r = src76_r;
3706  src21_r = src65_r;
3707  src43_r = src87_r;
3708  src10_l = src54_l;
3709  src32_l = src76_l;
3710  src21_l = src65_l;
3711  src43_l = src87_l;
3712  src2 = src6;
3713  src3 = src7;
3714  src4 = src8;
3715  }
3716 }
3717 
/*
 * 16-byte-wide vertical filter whose output is averaged first with a source
 * row and then with the pixels already stored in dst.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * The body uses `dst` and `src`, so they are presumably declared there --
 * confirm against the upstream file.
 *
 * The three filter constants hold the signed byte pairs (1, -5), (20, 20)
 * and (-5, 1), low byte first. Each loop iteration reads four new source
 * rows and stores four 16-byte rows; four iterations write 16 rows.
 */
3719  ptrdiff_t stride)
3720 {
3721  int32_t loop_cnt;
3722  int16_t filt_const0 = 0xfb01;
3723  int16_t filt_const1 = 0x1414;
3724  int16_t filt_const2 = 0x1fb;
3725  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3726  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3727  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3728  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3729  v16i8 src65_l, src87_l, filt0, filt1, filt2;
3730  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3731 
3732  filt0 = (v16i8) __msa_fill_h(filt_const0);
3733  filt1 = (v16i8) __msa_fill_h(filt_const1);
3734  filt2 = (v16i8) __msa_fill_h(filt_const2);
3735 
 /* Step back two rows so that src0..src4 are rows -2..+2 of the input. */
3736  src -= (stride * 2);
3737 
3738  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3739  src += (5 * stride);
3740 
 /* XOR with 128: presumably re-biases unsigned pixels into the signed range
  * used by the signed arithmetic below (macro is defined outside this view). */
3741  XORI_B5_128_SB(src0, src1, src2, src3, src4);
 /* Interleave vertically adjacent rows; _r / _l are the two halves of the
  * 16-byte row. */
3742  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3743  src32_r, src43_r);
3744  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3745  src32_l, src43_l);
3746 
3747  for (loop_cnt = 4; loop_cnt--;) {
3748  LD_SB4(src, stride, src5, src6, src7, src8);
3749  src += (4 * stride);
3750 
3751  XORI_B4_128_SB(src5, src6, src7, src8);
3752  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3753  src65_r, src76_r, src87_r);
3754  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3755  src65_l, src76_l, src87_l);
 /* Each output row combines three interleaved row pairs, i.e. six
  * consecutive source rows, with filt0..filt2. */
3756  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3757  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3758  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3759  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3760  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3761  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3762  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3763  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
 /* Rounding shift right by 5, then saturate (immediate 7). */
3764  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3765  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3766  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3767  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3768  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3769  out3_r, res0, res1, res2, res3);
 /* Average with source rows src3..src6: for output row k this is input
  * row k + 1 (src2 is input row 0). Both operands are still biased. */
3770  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3771  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3772  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3773  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3774  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
 /* Remove the 128 bias, then average with the existing dst pixels. */
3775  XORI_B4_128_UB(res0, res1, res2, res3);
3776  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3777  AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3778  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3779  dst += (4 * stride);
3780 
 /* Slide the window down four rows; src3/src4 are kept because the next
  * iteration averages with them and interleaves src5 against src4. */
3781  src10_r = src54_r;
3782  src32_r = src76_r;
3783  src21_r = src65_r;
3784  src43_r = src87_r;
3785  src10_l = src54_l;
3786  src32_l = src76_l;
3787  src21_l = src65_l;
3788  src43_l = src87_l;
3789  src3 = src7;
3790  src4 = src8;
3791  }
3792 }
3793 
/*
 * 8-byte-wide, 8-row vertical filter, fully unrolled (no loop). The filtered
 * rows are averaged with source rows and then with the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 *
 * Naming quirk visible below: no src5/src6 exist; src7 is the row loaded
 * immediately after src4, so src7..src14 are input rows +3..+10.
 */
3795  ptrdiff_t stride)
3796 {
3797  uint64_t tp0, tp1, tp2, tp3;
3798  const int16_t filt_const0 = 0xfb01;
3799  const int16_t filt_const1 = 0x1414;
3800  const int16_t filt_const2 = 0x1fb;
3801  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3802  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3803  v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3804  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3805  v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3806  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3807 
 /* Byte pairs (1, -5), (20, 20), (-5, 1), replicated across the vector. */
3808  filt0 = (v16i8) __msa_fill_h(filt_const0);
3809  filt1 = (v16i8) __msa_fill_h(filt_const1);
3810  filt2 = (v16i8) __msa_fill_h(filt_const2);
3811 
 /* src0..src4 become input rows -2..+2. */
3812  src -= (stride * 2);
3813 
3814  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3815  src += (5 * stride);
3816 
3817  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3818  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3819  src32_r, src43_r);
3820  LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3821  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3822  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3823  src87_r, src98_r, src109_r);
 /* Output rows 0..3. */
3824  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3825  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3826  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3827  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 /* Two source rows per vector for the later average: src2/src3 and
  * src4/src7, i.e. input rows 0..3. */
3828  PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
 /* src10_r..src43_r are reused here for the pairs built from src10..src14. */
3829  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3830  src21_r, src32_r, src43_r);
 /* Output rows 4..7. */
3831  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3832  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3833  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3834  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
 /* src8/src9 and src10/src11, i.e. input rows 4..7. */
3835  PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
3836  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3837  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3838  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3839  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3840 
 /* Gather the eight existing 8-byte dst rows, two rows per vector. */
3841  LD4(dst, stride, tp0, tp1, tp2, tp3);
3842  INSERT_D2_UB(tp0, tp1, dst0);
3843  INSERT_D2_UB(tp2, tp3, dst1);
3844  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3845  INSERT_D2_UB(tp0, tp1, dst2);
3846  INSERT_D2_UB(tp2, tp3, dst3);
3847 
3848  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3849  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 /* Signed average with the (still 128-biased) source rows. */
3850  out0 = __msa_aver_s_b(out0, tmp0);
3851  out1 = __msa_aver_s_b(out1, tmp1);
3852  out2 = __msa_aver_s_b(out2, tmp2);
3853  out3 = __msa_aver_s_b(out3, tmp3);
 /* Remove the bias, average with dst, store both doublewords of each vector. */
3854  XORI_B4_128_SB(out0, out1, out2, out3);
3855  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3856  dst2, dst3);
3857  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3858 }
3859 
/*
 * 8-byte-wide, 8-row vertical filter, fully unrolled. Identical in shape to
 * the preceding 8-wide routine except for which source rows are averaged in:
 * here they are input rows +1..+8 (one row below each output row).
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 *
 * No src5/src6 exist; src7 is the row loaded immediately after src4.
 */
3861  ptrdiff_t stride)
3862 {
3863  uint64_t tp0, tp1, tp2, tp3;
3864  const int16_t filt_const0 = 0xfb01;
3865  const int16_t filt_const1 = 0x1414;
3866  const int16_t filt_const2 = 0x1fb;
3867  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3868  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3869  v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3870  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3871  v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3872  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3873 
 /* Byte pairs (1, -5), (20, 20), (-5, 1), replicated across the vector. */
3874  filt0 = (v16i8) __msa_fill_h(filt_const0);
3875  filt1 = (v16i8) __msa_fill_h(filt_const1);
3876  filt2 = (v16i8) __msa_fill_h(filt_const2);
3877 
 /* src0..src4 become input rows -2..+2. */
3878  src -= (stride * 2);
3879 
3880  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3881  src += (5 * stride);
3882 
3883  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3884  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3885  src32_r, src43_r);
3886  LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3887  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3888  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3889  src87_r, src98_r, src109_r);
 /* Output rows 0..3. */
3890  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3891  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3892  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3893  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
 /* Rows to average with: src3/src4 and src7/src8, i.e. input rows 1..4. */
3894  PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
 /* src10_r..src43_r are reused here for the pairs built from src10..src14. */
3895  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3896  src21_r, src32_r, src43_r);
 /* Output rows 4..7. */
3897  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3898  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3899  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3900  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
 /* src9/src10 and src11/src12, i.e. input rows 5..8. */
3901  PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
3902  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3903  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3904  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3905  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3906 
 /* Gather the eight existing 8-byte dst rows, two rows per vector. */
3907  LD4(dst, stride, tp0, tp1, tp2, tp3);
3908  INSERT_D2_UB(tp0, tp1, dst0);
3909  INSERT_D2_UB(tp2, tp3, dst1);
3910  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3911  INSERT_D2_UB(tp0, tp1, dst2);
3912  INSERT_D2_UB(tp2, tp3, dst3);
3913 
3914  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3915  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
 /* Signed average with the (still 128-biased) source rows. */
3916  out0 = __msa_aver_s_b(out0, tmp0);
3917  out1 = __msa_aver_s_b(out1, tmp1);
3918  out2 = __msa_aver_s_b(out2, tmp2);
3919  out3 = __msa_aver_s_b(out3, tmp3);
 /* Remove the bias, average with dst, store both doublewords of each vector. */
3920  XORI_B4_128_SB(out0, out1, out2, out3);
3921  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3922  dst2, dst3);
3923  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
3924 }
3925 
/*
 * 4-byte-wide, 4-row vertical filter. Two output rows are computed per
 * vector (out10, out32); the result is averaged with source rows src2..src5
 * (input rows 0..3) and then with the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 */
3927  ptrdiff_t stride)
3928 {
3929  uint32_t tp0, tp1, tp2, tp3;
3930  int16_t filt_const0 = 0xfb01;
3931  int16_t filt_const1 = 0x1414;
3932  int16_t filt_const2 = 0x1fb;
3933  v16u8 res, dst0 = { 0 };
3934  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3935  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3936  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3937  v8i16 out10, out32;
3938 
 /* Byte pairs (1, -5), (20, 20), (-5, 1), replicated across the vector. */
3939  filt0 = (v16i8) __msa_fill_h(filt_const0);
3940  filt1 = (v16i8) __msa_fill_h(filt_const1);
3941  filt2 = (v16i8) __msa_fill_h(filt_const2);
3942 
 /* src0..src4 become input rows -2..+2. */
3943  src -= (stride * 2);
3944  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3945  src += (5 * stride);
3946 
3947  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3948  src32_r, src43_r);
 /* Combine two interleaved row pairs per vector. Only these combined vectors
  * are XORed with 128; src0..src8 themselves stay unbiased. */
3949  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3950  XORI_B2_128_SB(src2110, src4332);
3951  LD_SB4(src, stride, src5, src6, src7, src8);
3952  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3953  src76_r, src87_r);
3954  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
3955  XORI_B2_128_SB(src6554, src8776);
 /* src32_r / src54_r are no longer needed as interleaves and are reused to
  * gather word 0 of src2, src3, src4 and src5 into a single vector. */
3956  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3957  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3958  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3959  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3960  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3961  SRARI_H2_SH(out10, out32, 5);
3962  SAT_SH2_SH(out10, out32, 7);
3963  LW4(dst, stride, tp0, tp1, tp2, tp3);
3964  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
 /* res is unsigned after PCKEV_XORI128_UB, so unsigned averages are used
  * against the unbiased source rows and then against dst. */
3965  res = PCKEV_XORI128_UB(out10, out32);
3966  res = __msa_aver_u_b(res, (v16u8) src32_r);
3967  dst0 = __msa_aver_u_b(res, dst0);
3968  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
3969 }
3970 
/*
 * 4-byte-wide, 4-row vertical filter. Same shape as the preceding 4-wide
 * routine, but the filtered result is averaged with source rows src3..src6
 * (input rows +1..+4) before being averaged with the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 */
3972  ptrdiff_t stride)
3973 {
3974  uint32_t tp0, tp1, tp2, tp3;
3975  int16_t filt_const0 = 0xfb01;
3976  int16_t filt_const1 = 0x1414;
3977  int16_t filt_const2 = 0x1fb;
3978  v16u8 res, dst0 = { 0 };
3979  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3980  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3981  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3982  v8i16 out10, out32;
3983 
 /* Byte pairs (1, -5), (20, 20), (-5, 1), replicated across the vector. */
3984  filt0 = (v16i8) __msa_fill_h(filt_const0);
3985  filt1 = (v16i8) __msa_fill_h(filt_const1);
3986  filt2 = (v16i8) __msa_fill_h(filt_const2);
3987 
 /* src0..src4 become input rows -2..+2. */
3988  src -= (stride * 2);
3989 
3990  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3991  src += (5 * stride);
3992 
3993  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3994  src32_r, src43_r);
 /* Combine two interleaved row pairs per vector. Only these combined vectors
  * are XORed with 128; src0..src8 themselves stay unbiased. */
3995  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3996  XORI_B2_128_SB(src2110, src4332);
3997  LD_SB4(src, stride, src5, src6, src7, src8);
3998  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3999  src76_r, src87_r);
4000  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4001  XORI_B2_128_SB(src6554, src8776);
4002  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4003  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
4004  SRARI_H2_SH(out10, out32, 5);
4005  SAT_SH2_SH(out10, out32, 7);
4006  LW4(dst, stride, tp0, tp1, tp2, tp3);
4007  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4008  res = PCKEV_XORI128_UB(out10, out32);
 /* src32_r / src54_r are reused to gather word 0 of src3, src4, src5 and
  * src6 into a single vector for the unsigned average below. */
4009  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4010  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4011  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
4012  res = __msa_aver_u_b(res, (v16u8) src32_r);
4013  dst0 = __msa_aver_u_b(res, dst0);
4014  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4015 }
4016 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2), dst, stride)`.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4018  ptrdiff_t stride)
4019 {
4021  src - (stride * 2),
4022  dst, stride);
4023 }
4024 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2) + sizeof(uint8_t), dst, stride)`, i.e. the visible
 * pointer argument starts two rows up and one byte to the right of src.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4026  ptrdiff_t stride)
4027 {
4029  src - (stride * 2) +
4030  sizeof(uint8_t),
4031  dst, stride);
4032 }
4033 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_16x16_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride` (two rows up) as the second -- presumably the inputs of
 * the horizontal and vertical filters respectively; the helper is defined
 * outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4035  ptrdiff_t stride)
4036 {
4037  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4038  src - (stride * 2),
4039  dst, stride);
4040 }
4041 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_16x16_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride + 1` (two rows up, one byte right) as the second --
 * presumably the inputs of the horizontal and vertical filters respectively;
 * the helper is defined outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4043  ptrdiff_t stride)
4044 {
4045  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4046  src - (stride * 2) +
4047  sizeof(uint8_t),
4048  dst, stride);
4049 }
4050 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2), dst, stride)`.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4052  ptrdiff_t stride)
4053 {
4055  src - (stride * 2),
4056  dst, stride);
4057 }
4058 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2) + sizeof(uint8_t), dst, stride)`, i.e. the visible
 * pointer argument starts two rows up and one byte to the right of src.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4060  ptrdiff_t stride)
4061 {
4063  src - (stride * 2) +
4064  sizeof(uint8_t), dst, stride);
4065 }
4066 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_8x8_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride` (two rows up) as the second -- presumably the inputs of
 * the horizontal and vertical filters respectively; the helper is defined
 * outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4068  ptrdiff_t stride)
4069 {
4070  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4071  src - (stride * 2),
4072  dst, stride);
4073 }
4074 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_8x8_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride + 1` (two rows up, one byte right) as the second --
 * presumably the inputs of the horizontal and vertical filters respectively;
 * the helper is defined outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4076  ptrdiff_t stride)
4077 {
4078  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4079  src - (stride * 2) +
4080  sizeof(uint8_t), dst, stride);
4081 }
4082 
4083 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2), dst, stride)`.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4085  ptrdiff_t stride)
4086 {
4088  src - (stride * 2),
4089  dst, stride);
4090 }
4091 
/*
 * Thin wrapper whose body is a single call ending in
 * `src - (stride * 2) + sizeof(uint8_t), dst, stride)`, i.e. the visible
 * pointer argument starts two rows up and one byte to the right of src.
 *
 * NOTE(review): two lines are absent from this listing -- the first line of
 * the signature (return type, name, leading parameters) and the line that
 * opens the call (callee name and first argument). Restore both from the
 * upstream file before compiling.
 */
4093  ptrdiff_t stride)
4094 {
4096  src - (stride * 2) +
4097  sizeof(uint8_t), dst, stride);
4098 }
4099 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_4x4_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride` (two rows up) as the second -- presumably the inputs of
 * the horizontal and vertical filters respectively; the helper is defined
 * outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4101  ptrdiff_t stride)
4102 {
4103  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4104  src - (stride * 2),
4105  dst, stride);
4106 }
4107 
/*
 * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_4x4_msa(). It passes
 * `src + stride - 2` (one row down, two bytes left) as the first pointer and
 * `src - 2 * stride + 1` (two rows up, one byte right) as the second --
 * presumably the inputs of the horizontal and vertical filters respectively;
 * the helper is defined outside this view.
 *
 * NOTE(review): the first line of the signature (return type, name, leading
 * parameters) is absent from this listing.
 */
4109  ptrdiff_t stride)
4110 {
4111  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4112  src - (stride * 2) +
4113  sizeof(uint8_t), dst, stride);
4114 }
4115 
/*
 * 16x16 horizontal-then-vertical filter, averaged with a horizontally
 * filtered row and then with the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 *
 * Structure: the outer loop runs twice, once per 8-byte-wide column half
 * (src_tmp/dst_tmp advance by 8). The inner loop runs four times and stores
 * four 8-byte rows per iteration (two ST_D2 of two rows each).
 * The 32-bit filter constants hold the halfword pairs (1, -5), (20, 20)
 * and (-5, 1), low halfword first.
 */
4117  ptrdiff_t stride)
4118 {
4119  uint64_t tp0, tp1, tp2, tp3;
4120  uint8_t *dst_tmp = dst;
 /* Two rows up and two bytes left of the addressed sample. */
4121  const uint8_t *src_tmp = src - (2 * stride) - 2;
4122  uint32_t multiple8_cnt, loop_cnt;
4123  const int32_t filt_const0 = 0xfffb0001;
4124  const int32_t filt_const1 = 0x140014;
4125  const int32_t filt_const2 = 0x1fffb;
4126  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4127  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4128  v16i8 mask2;
4129  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4130  v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4131  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4132  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4133  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4134  v8i16 hz_out87_l, filt0, filt1, filt2;
4135  v4i32 tmp0_w, tmp1_w;
4136 
4137  filt0 = (v8i16) __msa_fill_w(filt_const0);
4138  filt1 = (v8i16) __msa_fill_w(filt_const1);
4139  filt2 = (v8i16) __msa_fill_w(filt_const2);
4140 
 /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
4141  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4142 
4143  for (multiple8_cnt = 2; multiple8_cnt--;) {
4144  dst = dst_tmp;
4145  src = src_tmp;
4146 
4147  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4148  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4149  src += (5 * stride);
4150 
 /* Horizontally filter the five priming rows (input rows -2..+2). */
4151  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4152  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4153  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4154  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4155  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4156 
4157  for (loop_cnt = 4; loop_cnt--;) {
4158  LD_SB2(src, stride, src5, src6);
4159  src += (2 * stride);
4160 
4161  XORI_B2_128_SB(src5, src6);
4162  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4163  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
 /* Interleave vertically adjacent horizontal results for the vertical
  * pass; _r / _l are the two halves of the 8-halfword vector. */
4164  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4165  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4166  hz_out43_r);
4167  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4168  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4169  hz_out43_l);
4170  ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4171  hz_out65_r);
4172  ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4173  hz_out65_l);
 /* Vertical pass in 32-bit precision, packed back to halfwords.
  * NOTE(review): no explicit shift follows, so AVC_DOT_SW3_SW presumably
  * rounds/saturates internally -- its definition is outside this view. */
4174  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4175  filt1, filt2);
4176  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4177  filt1, filt2);
4178  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4179  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4180  filt1, filt2);
4181  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4182  filt1, filt2);
4183  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4184 
 /* Horizontal-only results for the same two output rows (hz_out2 and
  * hz_out3), rounded by 5 and saturated. */
4185  tmp1 = __msa_srari_h(hz_out2, 5);
4186  tmp3 = __msa_srari_h(hz_out3, 5);
4187  SAT_SH2_SH(tmp1, tmp3, 7);
4188 
4189  tmp0 = __msa_aver_s_h(tmp0, tmp1);
4190  tmp1 = __msa_aver_s_h(tmp2, tmp3);
4191 
4192  LD2(dst, stride, tp0, tp1);
4193  INSERT_D2_UB(tp0, tp1, dst0);
4194 
 /* Pack to bytes, remove the 128 bias, average with dst, store 2 rows. */
4195  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4196  dst0 = __msa_aver_u_b(out0, dst0);
4197  ST_D2(dst0, 0, 1, dst, stride);
4198  dst += (2 * stride);
4199 
 /* Second pair of output rows in this iteration. */
4200  LD_SB2(src, stride, src7, src8);
4201  src += (2 * stride);
4202 
4203  XORI_B2_128_SB(src7, src8);
4204  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4205  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4206  ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4207  hz_out87_r);
4208  ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4209  hz_out87_l);
4210  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4211  filt1, filt2);
4212  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4213  filt1, filt2);
4214  tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4215  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4216  filt1, filt2);
4217  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4218  filt1, filt2);
4219  tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4220 
 /* Horizontal-only results for these two output rows (hz_out4, hz_out5). */
4221  tmp5 = __msa_srari_h(hz_out4, 5);
4222  tmp7 = __msa_srari_h(hz_out5, 5);
4223  SAT_SH2_SH(tmp5, tmp7, 7);
4224 
4225  tmp2 = __msa_aver_s_h(tmp4, tmp5);
4226  tmp3 = __msa_aver_s_h(tmp6, tmp7);
4227 
4228  LD2(dst, stride, tp2, tp3);
4229  INSERT_D2_UB(tp2, tp3, dst1);
4230 
4231  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4232  dst1 = __msa_aver_u_b(out1, dst1);
4233  ST_D2(dst1, 0, 1, dst, stride);
4234  dst += (2 * stride);
4235 
 /* Slide the five-row window of horizontal results down four rows. */
4236  hz_out0 = hz_out4;
4237  hz_out1 = hz_out5;
4238  hz_out2 = hz_out6;
4239  hz_out3 = hz_out7;
4240  hz_out4 = hz_out8;
4241  }
4242 
 /* Move on to the right-hand 8 columns. */
4243  src_tmp += 8;
4244  dst_tmp += 8;
4245  }
4246 }
4247 
/*
 * 16x16 horizontal-then-vertical filter. Same structure as the preceding
 * 16x16 routine, but each output row is averaged with the horizontal-only
 * result of the row below it (hz_out3..hz_out6 instead of hz_out2..hz_out5)
 * before being averaged with the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 *
 * Structure: the outer loop runs twice, once per 8-byte-wide column half;
 * the inner loop runs four times and stores four 8-byte rows per iteration.
 * The 32-bit filter constants hold the halfword pairs (1, -5), (20, 20)
 * and (-5, 1), low halfword first.
 */
4249  ptrdiff_t stride)
4250 {
4251  uint64_t tp0, tp1, tp2, tp3;
4252  uint8_t *dst_tmp = dst;
 /* Two rows up and two bytes left of the addressed sample. */
4253  const uint8_t *src_tmp = src - (2 * stride) - 2;
4254  uint32_t multiple8_cnt, loop_cnt;
4255  const int32_t filt_const0 = 0xfffb0001;
4256  const int32_t filt_const1 = 0x140014;
4257  const int32_t filt_const2 = 0x1fffb;
4258  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4259  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4260  v16i8 mask2;
4261  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4262  v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4263  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4264  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4265  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4266  v8i16 hz_out87_l, filt0, filt1, filt2;
4267  v4i32 tmp0_w, tmp1_w;
4268 
4269  filt0 = (v8i16) __msa_fill_w(filt_const0);
4270  filt1 = (v8i16) __msa_fill_w(filt_const1);
4271  filt2 = (v8i16) __msa_fill_w(filt_const2);
4272 
 /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
4273  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4274 
4275  for (multiple8_cnt = 2; multiple8_cnt--;) {
4276  dst = dst_tmp;
4277  src = src_tmp;
4278 
4279  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4280  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4281  src += (5 * stride);
4282 
 /* Horizontally filter the five priming rows (input rows -2..+2). */
4283  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4284  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4285  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4286  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4287  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4288 
4289  for (loop_cnt = 4; loop_cnt--;) {
4290  LD_SB2(src, stride, src5, src6);
4291  src += (2 * stride);
4292 
4293  XORI_B2_128_SB(src5, src6);
4294  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4295  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
 /* Interleave vertically adjacent horizontal results for the vertical
  * pass; _r / _l are the two halves of the 8-halfword vector. */
4296  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4297  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4298  hz_out43_r);
4299  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4300  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4301  hz_out43_l);
4302  ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4303  ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4304 
 /* Vertical pass in 32-bit precision, packed back to halfwords.
  * NOTE(review): no explicit shift follows, so AVC_DOT_SW3_SW presumably
  * rounds/saturates internally -- its definition is outside this view. */
4305  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4306  filt1, filt2);
4307  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4308  filt1, filt2);
4309  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4310  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4311  filt1, filt2);
4312  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4313  filt1, filt2);
4314  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4315 
 /* Horizontal-only results one row below each output row (hz_out3 and
  * hz_out4), rounded by 5 and saturated. */
4316  tmp1 = __msa_srari_h(hz_out3, 5);
4317  tmp3 = __msa_srari_h(hz_out4, 5);
4318  SAT_SH2_SH(tmp1, tmp3, 7);
4319 
4320  tmp0 = __msa_aver_s_h(tmp0, tmp1);
4321  tmp1 = __msa_aver_s_h(tmp2, tmp3);
4322 
4323  LD2(dst, stride, tp0, tp1);
4324  INSERT_D2_UB(tp0, tp1, dst0);
 /* Pack to bytes, remove the 128 bias, average with dst, store 2 rows. */
4325  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4326  dst0 = __msa_aver_u_b(out0, dst0);
4327  ST_D2(dst0, 0, 1, dst, stride);
4328  dst += (2 * stride);
4329 
 /* Second pair of output rows in this iteration. */
4330  LD_SB2(src, stride, src7, src8);
4331  src += (2 * stride);
4332 
4333  XORI_B2_128_SB(src7, src8);
4334  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4335  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4336  ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4337  hz_out87_r);
4338  ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4339  hz_out87_l);
4340  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4341  filt1, filt2);
4342  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4343  filt1, filt2);
4344  tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4345  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4346  filt1, filt2);
4347  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4348  filt1, filt2);
4349  tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4350 
 /* Horizontal-only results one row below (hz_out5, hz_out6). */
4351  tmp5 = __msa_srari_h(hz_out5, 5);
4352  tmp7 = __msa_srari_h(hz_out6, 5);
4353  SAT_SH2_SH(tmp5, tmp7, 7);
4354 
4355  tmp2 = __msa_aver_s_h(tmp4, tmp5);
4356  tmp3 = __msa_aver_s_h(tmp6, tmp7);
4357 
4358  LD2(dst, stride, tp2, tp3);
4359  INSERT_D2_UB(tp2, tp3, dst1);
4360  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4361  dst1 = __msa_aver_u_b(out1, dst1);
4362  ST_D2(dst1, 0, 1, dst, stride);
4363  dst += (2 * stride);
4364 
 /* Slide the five-row window of horizontal results down four rows. */
4365  hz_out0 = hz_out4;
4366  hz_out1 = hz_out5;
4367  hz_out2 = hz_out6;
4368  hz_out3 = hz_out7;
4369  hz_out4 = hz_out8;
4370  }
4371 
 /* Move on to the right-hand 8 columns. */
4372  src_tmp += 8;
4373  dst_tmp += 8;
4374  }
4375 }
4376 
/*
 * 8x8 horizontal-then-vertical filter, fully unrolled into two halves of
 * four output rows each. Every output row is averaged with the
 * horizontal-only result of the same row (hz_out2..hz_out9) and then with
 * the existing dst pixels.
 *
 * NOTE(review): the first line of this definition (return type, function
 * name and the parameters preceding `stride`) is absent from this listing.
 * `dst` and `src` are used below and are presumably declared there --
 * confirm against the upstream file.
 *
 * Thirteen source rows are read in total (5 + 4 + 4). The 32-bit filter
 * constants hold the halfword pairs (1, -5), (20, 20) and (-5, 1).
 */
4378  ptrdiff_t stride)
4379 {
4380  const int32_t filt_const0 = 0xfffb0001;
4381  const int32_t filt_const1 = 0x140014;
4382  const int32_t filt_const2 = 0x1fffb;
4383  uint64_t tp0, tp1, tp2, tp3;
4384  v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4385  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4386  v16i8 src11, src12, mask0, mask1, mask2;
4387  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4388  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4389  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4390  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4391  v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4392  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4393  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4394  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4395  v4i32 tmp0_w, tmp1_w;
4396 
 /* First three rows of luma_mask_arr: the "8 width cases" shuffle masks. */
4397  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4398 
4399  filt0 = (v8i16) __msa_fill_w(filt_const0);
4400  filt1 = (v8i16) __msa_fill_w(filt_const1);
4401  filt2 = (v8i16) __msa_fill_w(filt_const2);
4402 
 /* Two rows up and two bytes left of the addressed sample. */
4403  src -= ((2 * stride) + 2);
4404 
4405  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4406  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4407  src += (5 * stride);
4408 
4409  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4410  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4411  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4412  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4413  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4414 
4415  LD_SB4(src, stride, src5, src6, src7, src8);
4416  src += (4 * stride);
4417  XORI_B4_128_SB(src5, src6, src7, src8);
4418 
4419  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4420  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4421  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4422  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4423 
 /* Interleave vertically adjacent horizontal results for the vertical pass. */
4424  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4425  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4426  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4428  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4429  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4430  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4432 
 /* Vertical pass for output rows 0..3 (tmp0..tmp3).
  * NOTE(review): no explicit shift follows, so AVC_DOT_SW3_SW presumably
  * rounds/saturates internally -- its definition is outside this view. */
4433  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4434  filt2);
4435  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4436  filt2);
4437  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4438  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4439  filt2);
4440  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4441  filt2);
4442  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4443  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4444  filt2);
4445  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4446  filt2);
4447  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4448  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4449  filt2);
4450  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4451  filt2);
4452  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4453 
 /* hz_out2..hz_out5 are rounded in place; the interleaved copies needed by
  * the second half were already taken above. */
4454  SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4455  SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4456 
4457  LD4(dst, stride, tp0, tp1, tp2, tp3);
4458  INSERT_D2_UB(tp0, tp1, dst0);
4459  INSERT_D2_UB(tp2, tp3, dst1);
4460 
4461  tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4462  tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4463  tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4464  tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4465 
 /* Pack to bytes, remove the 128 bias, average with dst, store 4 rows. */
4466  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4467  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4468  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4469  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4470  dst += (4 * stride);
4471 
 /* Second half: output rows 4..7. */
4472  LD_SB4(src, stride, src9, src10, src11, src12);
4473  XORI_B4_128_SB(src9, src10, src11, src12);
4474  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4475  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4476  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4477  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
 /* Naming quirk: hz_out89_* / hz_out910_* hold the (9, 8) and (10, 9) pairs,
  * interleaved in the same operand order as the other pairs. */
4478  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4479  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4480  hz_out1211_r);
4481  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4482  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4483  hz_out1211_l);
4484  tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4485  filt2);
4486  tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4487  filt2);
4488  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4489  tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4490  filt2);
4491  tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4492  filt2);
4493  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4494  tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4495  filt2);
4496  tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4497  filt2);
4498  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4499  tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4500  filt2);
4501  tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4502  filt2);
4503  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4504 
 /* Horizontal-only results for output rows 4..7, rounded and saturated. */
4505  SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4506  SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4507 
4508  LD4(dst, stride, tp0, tp1, tp2, tp3);
4509  INSERT_D2_UB(tp0, tp1, dst0);
4510  INSERT_D2_UB(tp2, tp3, dst1);
4511 
4512  tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4513  tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4514  tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4515  tmp3 = __msa_aver_s_h(tmp3, hz_out9);
4516 
4517  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4518  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4519  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4520  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4521 }
4522 
4524  ptrdiff_t stride)
4525 {
      /*
       * 8-byte-wide, 8-row block: a horizontal 6-tap pass is run on 13 source
       * rows, a vertical pass is run over those intermediate rows, the 2-D
       * result is averaged with the rounded horizontal-only result of one
       * row, and that is finally averaged with the bytes already in dst.
       *
       * NOTE(review): the first line of this signature (return type, name,
       * dst/src parameters) is not visible in this listing; the position in
       * the file suggests ff_avg_h264_qpel8_mc23_msa - confirm.
       */
      /* Each 32-bit constant packs two 16-bit taps (low, high halfword):
       * 0xfffb0001 = (1, -5), 0x140014 = (20, 20), 0x1fffb = (-5, 1). */
4526  const int32_t filt_const0 = 0xfffb0001;
4527  const int32_t filt_const1 = 0x140014;
4528  const int32_t filt_const2 = 0x1fffb;
4529  uint64_t tp0, tp1, tp2, tp3;
4530  v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4531  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4532  v16i8 src11, src12, mask0, mask1, mask2;
4533  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4534  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4535  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4536  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4537  v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4538  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4539  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4540  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4541  v4i32 tmp0_w, tmp1_w;
4542 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
4543  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4544 
      /* Replicate each tap pair into every 32-bit lane. */
4545  filt0 = (v8i16) __msa_fill_w(filt_const0);
4546  filt1 = (v8i16) __msa_fill_w(filt_const1);
4547  filt2 = (v8i16) __msa_fill_w(filt_const2);
4548 
      /* Step back two rows and two bytes so the filter window has its
       * leading context; "row 0" below is therefore two rows above the
       * position src had on entry. */
4549  src -= ((2 * stride) + 2);
4550 
      /* Rows 0..4. XORI_*_128 presumably flips the sign bit so the bytes can
       * be treated as signed - confirm against the macro definition. */
4551  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4552  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4553  src += (5 * stride);
4554 
      /* hz_outN: horizontal filter output of row N (one row per call). */
4555  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4556  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4557  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4558  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4559  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4560 
      /* Rows 5..8. */
4561  LD_SB4(src, stride, src5, src6, src7, src8);
4562  src += (4 * stride);
4563  XORI_B4_128_SB(src5, src6, src7, src8);
4564 
4565  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4566  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4567  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4568  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4569 
      /* Interleave vertically adjacent rows (hz_outAB_{r,l} pairs rows B and
       * A, right/left halves) so each 32-bit lane holds two taps' inputs. */
4570  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4571  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4572  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4574  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4575  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4576  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4578 
      /* Vertical pass for output rows 0..3: three row pairs (six rows) per
       * output row; tmpK uses the window starting at row K. The even
       * halfwords of the 32-bit results are packed back to 16 bits.
       * NOTE(review): AVC_DOT_SW3_SW is defined outside this view;
       * presumably it also rounds and saturates - confirm. */
4579  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4580  filt2);
4581  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4582  filt2);
4583  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4584  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4585  filt2);
4586  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4587  filt2);
4588  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4589  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4590  filt2);
4591  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4592  filt2);
4593  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4594  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4595  filt2);
4596  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4597  filt2);
4598  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4599 
      /* Round (shift by 5) and saturate the horizontal-only results of rows
       * 3..6 in place. This must stay after the interleaves above, which
       * consume the unrounded values. */
4600  SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4601  SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
4602 
      /* Current destination rows 0..3, 8 bytes each, two rows per vector. */
4603  LD4(dst, stride, tp0, tp1, tp2, tp3);
4604  INSERT_D2_UB(tp0, tp1, dst0);
4605  INSERT_D2_UB(tp2, tp3, dst1);
4606 
      /* Rounded signed average of the 2-D result with the horizontal result
       * of rows 3..6, i.e. one row below the window centre of each tmpK. */
4607  tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4608  tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4609  tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4610  tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4611 
      /* Pack to bytes (undoing the 128 bias, per the macro name), average
       * with the existing dst bytes and store four rows. */
4612  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4613  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4614  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4615  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4616  dst += (4 * stride);
4617 
      /* Second half: rows 9..12 feed output rows 4..7; the row pairs already
       * interleaved above (54, 65, 76, 87) are reused. */
4618  LD_SB4(src, stride, src9, src10, src11, src12);
4619  XORI_B4_128_SB(src9, src10, src11, src12);
4620  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4621  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4622  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4623  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /* Note the naming: hz_out89_* pairs rows 8 and 9 (same operand order
       * as the other pairs), hz_out910_* pairs rows 9 and 10. */
4624  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4625  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4626  hz_out1211_r);
4627  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4628  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4629  hz_out1211_l);
4630  tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4631  filt2);
4632  tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4633  filt2);
4634  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4635  tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4636  filt2);
4637  tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4638  filt2);
4639  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4640  tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4641  filt2);
4642  tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4643  filt2);
4644  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4645  tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4646  filt2);
4647  tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4648  filt2);
4649  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4650 
      /* Horizontal-only results of rows 7..10, rounded and saturated in
       * place, are the blend partners for output rows 4..7. */
4651  SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4652  SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4653 
4654  LD4(dst, stride, tp0, tp1, tp2, tp3);
4655  INSERT_D2_UB(tp0, tp1, dst0);
4656  INSERT_D2_UB(tp2, tp3, dst1);
4657 
4658  tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4659  tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4660  tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4661  tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4662 
4663  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4664  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4665  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4666  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
4667 }
4668 
4670  ptrdiff_t stride)
4671 {
      /*
       * 4-byte-wide, 4-row block: horizontal 6-tap pass on nine source rows,
       * vertical pass over those rows, average of the 2-D result with the
       * rounded horizontal-only result, then average with the bytes already
       * in dst.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel4_mc21_msa - confirm.
       */
4672  uint32_t tp0, tp1, tp2, tp3;
      /* Two 16-bit taps per constant (low, high halfword):
       * (1, -5), (20, 20), (-5, 1). */
4673  const int32_t filt_const0 = 0xfffb0001;
4674  const int32_t filt_const1 = 0x140014;
4675  const int32_t filt_const2 = 0x1fffb;
4676  v16u8 res, out = { 0 };
4677  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4678  v16i8 mask0, mask1, mask2;
4679  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4680  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4681  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4682  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4683  v4i32 tmp0, tmp1;
4684 
      /* Offset 48 selects the "4 width cases" masks, whose indices >= 16
       * address a second source vector: two rows are filtered per call. */
4685  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4686 
4687  filt0 = (v8i16) __msa_fill_w(filt_const0);
4688  filt1 = (v8i16) __msa_fill_w(filt_const1);
4689  filt2 = (v8i16) __msa_fill_w(filt_const2);
4690 
      /* Two rows up, two bytes left: context for the 6-tap windows. */
4691  src -= ((2 * stride) + 2);
4692 
      /* Rows 0..8 (nine rows in total). */
4693  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4694  src += (5 * stride);
4695  LD_SB4(src, stride, src5, src6, src7, src8);
4696 
4697  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4698  XORI_B4_128_SB(src5, src6, src7, src8);
4699 
      /* Each even-numbered hz_out holds two rows: row N in the low half and
       * row N+1 in the high half (hz_out8 gets row 8 in both halves). */
4700  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4701  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4702  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4703  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4704  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Extract the odd rows (high doublewords) into their own vectors. */
4705  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4706  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4707 
      /* Interleave vertically adjacent rows (right halves only: 4 pixels). */
4708  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4709  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4710  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4711  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4712 
      /* Vertical pass; dst0 packs output rows 0 and 1, dst1 rows 2 and 3.
       * NOTE(review): AVC_DOT_SW3_SW is defined outside this view;
       * presumably it also rounds and saturates - confirm. */
4713  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4714  filt2);
4715  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4716  filt2);
4717  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4718  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4719  filt2);
4720  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4721  filt2);
4722  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4723 
      /* hz_out2 still holds rows 2|3 and hz_out4 rows 4|5; round and
       * saturate them in place (after the interleaves consumed the raw
       * values) to serve as the horizontal-only blend partners. */
4724  SRARI_H2_SH(hz_out2, hz_out4, 5);
4725  SAT_SH2_SH(hz_out2, hz_out4, 7);
4726 
4727  dst0 = __msa_aver_s_h(dst0, hz_out2);
4728  dst1 = __msa_aver_s_h(dst1, hz_out4);
      /* Four destination rows of 4 bytes gathered into one vector. */
4729  LW4(dst, stride, tp0, tp1, tp2, tp3);
4730  INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4731  res = PCKEV_XORI128_UB(dst0, dst1);
      /* Rounded unsigned average with the existing destination bytes. */
4732  res = __msa_aver_u_b(res, out);
4733  ST_W4(res, 0, 1, 2, 3, dst, stride);
4734 }
4735 
4737  ptrdiff_t stride)
4738 {
      /*
       * 4-byte-wide, 4-row block: same 2-D filtering as the preceding 4-wide
       * routine, but the horizontal-only blend partner is taken from rows
       * 3..6 instead of rows 2..5 (one row further down).
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel4_mc23_msa - confirm.
       */
      /* Two 16-bit taps per constant (low, high halfword):
       * (1, -5), (20, 20), (-5, 1). */
4739  const int32_t filt_const0 = 0xfffb0001;
4740  const int32_t filt_const1 = 0x140014;
4741  const int32_t filt_const2 = 0x1fffb;
4742  uint32_t tp0, tp1, tp2, tp3;
4743  v16u8 res, out = { 0 };
4744  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4745  v16i8 mask0, mask1, mask2;
4746  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4747  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4748  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4749  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4750  v4i32 tmp0, tmp1;
4751 
      /* "4 width cases" masks: two source rows are filtered per call. */
4752  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4753 
4754  filt0 = (v8i16) __msa_fill_w(filt_const0);
4755  filt1 = (v8i16) __msa_fill_w(filt_const1);
4756  filt2 = (v8i16) __msa_fill_w(filt_const2);
4757 
      /* Two rows up, two bytes left: context for the 6-tap windows. */
4758  src -= ((2 * stride) + 2);
4759 
      /* Rows 0..8. */
4760  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4761  src += (5 * stride);
4762  LD_SB4(src, stride, src5, src6, src7, src8);
4763 
4764  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4765  XORI_B4_128_SB(src5, src6, src7, src8);
4766 
      /* Even hz_out: row N in the low half, row N+1 in the high half. */
4767  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4768  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4769  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4770  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4771  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Odd rows pulled out of the high doublewords. */
4772  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4773  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4774 
4775  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4776  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4777  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4778  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4779 
      /* Vertical pass; dst0 packs output rows 0 and 1, dst1 rows 2 and 3.
       * NOTE(review): AVC_DOT_SW3_SW is defined outside this view;
       * presumably it also rounds and saturates - confirm. */
4780  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4781  filt2);
4782  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4783  filt2);
4784  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4785  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4786  filt2);
4787  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4788  filt2);
4789  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4790 
      /* Build the blend partners from the low halves: hz_out0 = rows 3|4,
       * hz_out1 = rows 5|6 (hz_out0/hz_out1 are free to be overwritten here
       * because the interleaves above have already consumed them). */
4791  PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4792  SRARI_H2_SH(hz_out0, hz_out1, 5);
4793  SAT_SH2_SH(hz_out0, hz_out1, 7);
4794 
4795  dst0 = __msa_aver_s_h(dst0, hz_out0);
4796  dst1 = __msa_aver_s_h(dst1, hz_out1);
      /* Four destination rows of 4 bytes gathered into one vector. */
4797  LW4(dst, stride, tp0, tp1, tp2, tp3);
4798  INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4799  res = PCKEV_XORI128_UB(dst0, dst1);
      /* Rounded unsigned average with the existing destination bytes. */
4800  res = __msa_aver_u_b(res, out);
4801  ST_W4(res, 0, 1, 2, 3, dst, stride);
4802 }
4803 
4805  ptrdiff_t stride)
4806 {
      /*
       * 16-byte-wide block, vertical-only 6-tap filter: 16 output rows are
       * produced four at a time, each averaged with the bytes already in dst
       * before being stored.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel16_mc02_msa - confirm.
       */
4807  int32_t loop_cnt;
      /* Two 8-bit taps per constant (low, high byte): 0xfb01 = (1, -5),
       * 0x1414 = (20, 20), 0x1fb = (-5, 1). 0xfb01 does not fit in int16_t,
       * so the initializer relies on the implementation-defined conversion
       * to a negative value with the same bit pattern. */
4808  int16_t filt_const0 = 0xfb01;
4809  int16_t filt_const1 = 0x1414;
4810  int16_t filt_const2 = 0x1fb;
4811  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4812  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4813  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4814  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4815  v16i8 src65_l, src87_l, filt0, filt1, filt2;
4816  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4817 
      /* Replicate each tap pair into every 16-bit lane. */
4818  filt0 = (v16i8) __msa_fill_h(filt_const0);
4819  filt1 = (v16i8) __msa_fill_h(filt_const1);
4820  filt2 = (v16i8) __msa_fill_h(filt_const2);
      /* Start two rows above the block (no horizontal offset here). */
4821  src -= (stride * 2);
4822 
      /* Prime the window with rows 0..4. */
4823  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4824  src += (5 * stride);
4825 
4826  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* Byte-interleave vertically adjacent rows (right and left halves). */
4827  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4828  src32_r, src43_r);
4829  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
4830  src32_l, src43_l);
4831 
      /* Four iterations of four rows each. */
4832  for (loop_cnt = 4; loop_cnt--;) {
4833  LD_SB4(src, stride, src5, src6, src7, src8);
4834  src += (4 * stride);
4835 
4836  XORI_B4_128_SB(src5, src6, src7, src8);
4837  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4838  src65_r, src76_r, src87_r);
4839  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4840  src65_l, src76_l, src87_l);
      /* Three row pairs (six rows) per output row, per half.
       * NOTE(review): AVC_DOT_SH3_SH is defined outside this view. */
4841  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4842  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4843  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4844  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4845  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4846  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4847  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4848  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
      /* Round (shift by 5) and saturate the filter sums. */
4849  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4850  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4851  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
4852  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      /* Existing destination rows, needed for the final average. */
4853  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4854  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4855  out3_r, res0, res1, res2, res3);
      /* Undo the 128 bias applied to the inputs (per the macro name). */
4856  XORI_B4_128_UB(res0, res1, res2, res3);
4857  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
4858  AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
4859  ST_UB4(res0, res1, res2, res3, dst, stride);
4860  dst += (4 * stride);
4861 
      /* Slide the window down four rows: the newest four row pairs become
       * the oldest four for the next iteration, and row 8 becomes row 4. */
4862  src10_r = src54_r;
4863  src32_r = src76_r;
4864  src21_r = src65_r;
4865  src43_r = src87_r;
4866  src10_l = src54_l;
4867  src32_l = src76_l;
4868  src21_l = src65_l;
4869  src43_l = src87_l;
4870  src4 = src8;
4871  }
4872 }
4873 
4875  ptrdiff_t stride)
4876 {
      /*
       * 8-byte-wide, 8-row block, vertical-only 6-tap filter: all eight rows
       * are computed without a loop, averaged with the bytes already in dst
       * and stored.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel8_mc02_msa - confirm.
       */
4877  uint64_t tp0, tp1, tp2, tp3;
      /* Two 8-bit taps per constant (low, high byte):
       * (1, -5), (20, 20), (-5, 1). */
4878  const int16_t filt_const0 = 0xfb01;
4879  const int16_t filt_const1 = 0x1414;
4880  const int16_t filt_const2 = 0x1fb;
4881  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4882  v16u8 out0, out1, out2, out3;
4883  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4884  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4885  v16i8 filt0, filt1, filt2;
4886  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4887 
4888  filt0 = (v16i8) __msa_fill_h(filt_const0);
4889  filt1 = (v16i8) __msa_fill_h(filt_const1);
4890  filt2 = (v16i8) __msa_fill_h(filt_const2);
4891 
      /* Start two rows above the block. */
4892  src -= (stride * 2);
4893 
      /* First five rows of the window. */
4894  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4895  src += (5 * stride);
4896 
4897  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4898  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4899  src32_r, src43_r);
4900 
      /* The next four rows land in src7..src10 (there is no src5/src6 in
       * this function); src76_r therefore pairs src4 with src7, i.e. two
       * consecutive rows despite the name. */
4901  LD_SB4(src, stride, src7, src8, src9, src10);
4902  src += (4 * stride);
4903  XORI_B4_128_SB(src7, src8, src9, src10);
4904  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4905  src87_r, src98_r, src109_r);
      /* Output rows 0..3.
       * NOTE(review): AVC_DOT_SH3_SH is defined outside this view. */
4906  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4907  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4908  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4909  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
4910 
      /* src0..src3 and src10_r..src43_r are reused for the last four rows of
       * the window; the new src10_r pairs src10 (previous row) with the new
       * src0. */
4911  LD_SB4(src, stride, src0, src1, src2, src3);
4912  XORI_B4_128_SB(src0, src1, src2, src3);
4913  ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4914  src21_r, src32_r, src43_r);
      /* Output rows 4..7. */
4915  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4916  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4917  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4918  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
4919 
      /* Existing destination: eight rows of 8 bytes, two rows per vector. */
4920  LD4(dst, stride, tp0, tp1, tp2, tp3);
4921  INSERT_D2_UB(tp0, tp1, dst0);
4922  INSERT_D2_UB(tp2, tp3, dst1);
4923  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4924  INSERT_D2_UB(tp0, tp1, dst2);
4925  INSERT_D2_UB(tp2, tp3, dst3);
4926 
      /* Round (shift by 5), saturate, pack to bytes, then average with the
       * existing destination and store all eight rows. */
4927  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4928  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
4929  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4930  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
4931  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
4932  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
4933  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
4934  out3 = PCKEV_XORI128_UB(out6_r, out7_r);
4935  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4936  dst2, dst3);
4937  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
4938 }
4939 
4941  ptrdiff_t stride)
4942 {
      /*
       * 4-byte-wide, 4-row block, vertical-only 6-tap filter, averaged with
       * the bytes already in dst. Two interleaved row pairs are packed per
       * vector so two output rows are computed per filter call.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel4_mc02_msa - confirm.
       */
4943  uint32_t tp0, tp1, tp2, tp3;
      /* Two 8-bit taps per constant (low, high byte): (1, -5), (20, 20),
       * (-5, 1). 0xfb01 does not fit in int16_t; the initializer relies on
       * the implementation-defined conversion keeping the bit pattern. */
4944  int16_t filt_const0 = 0xfb01;
4945  int16_t filt_const1 = 0x1414;
4946  int16_t filt_const2 = 0x1fb;
4947  v16u8 res, dst0 = { 0 };
4948  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4949  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4950  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4951  v8i16 out10, out32;
4952 
4953  filt0 = (v16i8) __msa_fill_h(filt_const0);
4954  filt1 = (v16i8) __msa_fill_h(filt_const1);
4955  filt2 = (v16i8) __msa_fill_h(filt_const2);
4956 
      /* Start two rows above the block. */
4957  src -= (stride * 2);
4958  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4959  src += (5 * stride);
4960 
      /* Interleave adjacent rows, then place two consecutive row pairs in one
       * vector (e.g. src2110 = pair 1|0 in the low half, pair 2|1 in the
       * high half). The sign-bit flip is applied after packing, on the
       * combined vectors, rather than on each loaded row. */
4961  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4962  src32_r, src43_r);
4963  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
4964  XORI_B2_128_SB(src2110, src4332);
      /* Remaining four rows of the nine-row window. */
4965  LD_SB4(src, stride, src5, src6, src7, src8);
4966  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4967  src76_r, src87_r);
4968  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4969  XORI_B2_128_SB(src6554, src8776);
      /* out10 = output rows 0 and 1, out32 = output rows 2 and 3.
       * NOTE(review): AVC_DOT_SH3_SH is defined outside this view. */
4970  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4971  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
      /* Round (shift by 5) and saturate. */
4972  SRARI_H2_SH(out10, out32, 5);
4973  SAT_SH2_SH(out10, out32, 7);
      /* Four destination rows of 4 bytes gathered into one vector. */
4974  LW4(dst, stride, tp0, tp1, tp2, tp3);
4975  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4976  res = PCKEV_XORI128_UB(out10, out32);
      /* Rounded unsigned average with the existing destination bytes. */
4977  dst0 = __msa_aver_u_b(res, dst0);
4978  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
4979 }
4980 
4982  ptrdiff_t stride)
4983 {
      /*
       * 16-byte-wide, 16-row block, one output row per loop iteration: a
       * vertical 6-tap pass over six source rows is followed by a horizontal
       * 6-tap pass over its 16-bit results; the 2-D value is averaged with
       * the rounded vertical-only value and then with the bytes already in
       * dst.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel16_mc12_msa - confirm.
       */
4984  uint32_t row;
4985  v16u8 out, dst0;
4986  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4987  v16i8 src11;
4988  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4989  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4990  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4991  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /* Halfword shuffle indices pairing the taps of the horizontal filter:
       * mask0 = outer pair (x, x+5), mask1 = (x+1, x+4), mask2 = centre pair
       * (x+2, x+3), for four consecutive x. */
4992  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4993  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4994  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
4995  v8i16 minus5h = __msa_ldi_h(-5);
4996  v8i16 plus20h = __msa_ldi_h(20);
4997 
      /* Same shuffles shifted by four columns, for the next four pixels. */
4998  mask3 = mask0 + 4;
4999  mask4 = mask1 + 4;
5000  mask5 = mask2 + 4;
5001 
      /* Two rows up, two bytes left: context for both filter passes. */
5002  src -= ((2 * stride) + 2);
5003 
      /* Two overlapping 16-byte column groups per row: src0..src4 start at
       * src, src7..src11 start at src + 8. */
5004  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5005  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5006  src += (5 * stride);
5007  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5008  XORI_B5_128_SB(src7, src8, src9, src10, src11);
5009 
      /* 16 iterations, one output row each. */
5010  for (row = 16; row--;) {
      /* Newest row of the window for both column groups. */
5011  LD_SB2(src, 8, src5, src6);
5012  src += stride;
5013  XORI_B2_128_SB(src5, src6);
5014  dst0 = LD_UB(dst);
5015 
      /* Vertical pass: taps (1, -5, 20, 20, -5, 1) over the six rows, giving
       * 16 halfword sums per column group (vt_res0/1 and vt_res2/3). */
5016  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5017  vt_res0, vt_res1);
5018  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5019  vt_res2, vt_res3);
      /* Gather the tap pairs for the horizontal pass.
       * NOTE(review): presumably indices 0-7 select from the first vector of
       * each pair and 8-15 from the second - confirm in VSHF_H3_SH. */
5020  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5021  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5022  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5023  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5024  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5025  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5026  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5027  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Horizontal pass in 32 bits: sum of the outer pair (weight 1), then
       * accumulate -5 * second pair and +20 * centre pair. */
5028  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5029  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5030  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5031  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5032  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5033  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5034  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5035  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5036  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5037  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values (single pass, shift by 5) taken from the
       * centre-pair shuffles. */
5038  tmp0 = __msa_srari_h(shf_vec2, 5);
5039  tmp1 = __msa_srari_h(shf_vec5, 5);
5040  tmp2 = __msa_srari_h(shf_vec8, 5);
5041  tmp3 = __msa_srari_h(shf_vec11, 5);
5042  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Even elements of the centre-pair shuffles = the left sample of each
       * centre pair (column x+2); tmp0 covers pixels 0..7, tmp1 8..15. */
5043  PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
      /* 2-D results narrowed to 16 bits in the same pixel order. */
5044  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
      /* Rounded signed average of 2-D and vertical-only values. */
5045  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5046  tmp1 = __msa_aver_s_h(tmp3, tmp1);
5047  out = PCKEV_XORI128_UB(tmp0, tmp1);
      /* Rounded unsigned average with the existing destination row. */
5048  out = __msa_aver_u_b(out, dst0);
5049  ST_UB(out, dst);
5050  dst += stride;
5051 
      /* Slide both six-row windows down by one row. */
5052  src0 = src1;
5053  src1 = src2;
5054  src2 = src3;
5055  src3 = src4;
5056  src4 = src5;
5057  src7 = src8;
5058  src8 = src9;
5059  src9 = src10;
5060  src10 = src11;
5061  src11 = src6;
5062  }
5063 }
5064 
5066  ptrdiff_t stride)
5067 {
      /*
       * 16-byte-wide, 16-row block, one output row per loop iteration.
       * Identical in structure to the preceding 16-wide routine, except that
       * the vertical-only blend partner is taken from the odd elements of
       * the centre-pair shuffles (column x+3) instead of the even ones.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel16_mc32_msa - confirm.
       */
5068  uint32_t row;
5069  v16u8 out, dst0;
5070  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5071  v16i8 src11;
5072  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5073  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5074  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5075  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /* Halfword shuffle indices pairing the horizontal taps: outer pair
       * (x, x+5), second pair (x+1, x+4), centre pair (x+2, x+3). */
5076  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5077  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5078  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5079  v8i16 minus5h = __msa_ldi_h(-5);
5080  v8i16 plus20h = __msa_ldi_h(20);
5081 
      /* Same shuffles shifted by four columns, for the next four pixels. */
5082  mask3 = mask0 + 4;
5083  mask4 = mask1 + 4;
5084  mask5 = mask2 + 4;
5085 
      /* Two rows up, two bytes left: context for both filter passes. */
5086  src -= ((2 * stride) + 2);
5087 
      /* Two overlapping 16-byte column groups: at src and at src + 8. */
5088  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5089  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5090  src += (5 * stride);
5091  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5092  XORI_B5_128_SB(src7, src8, src9, src10, src11);
5093 
      /* 16 iterations, one output row each. */
5094  for (row = 16; row--;) {
5095  LD_SB2(src, 8, src5, src6);
5096  src += stride;
5097  XORI_B2_128_SB(src5, src6);
5098  dst0 = LD_UB(dst);
5099 
      /* Vertical pass with taps (1, -5, 20, 20, -5, 1) over six rows. */
5100  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5101  vt_res0, vt_res1);
5102  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5103  vt_res2, vt_res3);
      /* Gather tap pairs for the horizontal pass.
       * NOTE(review): presumably indices 0-7 select from the first vector of
       * each pair and 8-15 from the second - confirm in VSHF_H3_SH. */
5104  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5105  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5106  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5107  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5108  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5109  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5110  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5111  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Horizontal pass in 32 bits: outer pair sum, then -5 and +20 terms. */
5112  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5113  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5114  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5115  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5116  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5117  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5118  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5119  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5120  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5121  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values (single pass, shift by 5). */
5122  tmp0 = __msa_srari_h(shf_vec2, 5);
5123  tmp1 = __msa_srari_h(shf_vec5, 5);
5124  tmp2 = __msa_srari_h(shf_vec8, 5);
5125  tmp3 = __msa_srari_h(shf_vec11, 5);
5126  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Odd elements of the centre-pair shuffles = the right sample of each
       * centre pair (column x+3); tmp0 covers pixels 0..7, tmp1 8..15. */
5127  tmp0 = __msa_pckod_h(tmp2, tmp0);
5128  tmp1 = __msa_pckod_h(tmp3, tmp1);
      /* 2-D results narrowed to 16 bits in the same pixel order. */
5129  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
      /* Rounded signed average of 2-D and vertical-only values. */
5130  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5131  tmp1 = __msa_aver_s_h(tmp3, tmp1);
5132  out = PCKEV_XORI128_UB(tmp0, tmp1);
      /* Rounded unsigned average with the existing destination row. */
5133  out = __msa_aver_u_b(out, dst0);
5134  ST_UB(out, dst);
5135  dst += stride;
5136 
      /* Slide both six-row windows down by one row. */
5137  src0 = src1;
5138  src1 = src2;
5139  src2 = src3;
5140  src3 = src4;
5141  src4 = src5;
5142  src7 = src8;
5143  src8 = src9;
5144  src9 = src10;
5145  src10 = src11;
5146  src11 = src6;
5147  }
5148 }
5149 
5151  ptrdiff_t stride)
5152 {
      /*
       * 8-byte-wide, 8-row block, two output rows per loop iteration: a
       * vertical 6-tap pass is followed by a horizontal 6-tap pass over its
       * 16-bit results; the 2-D value is averaged with the rounded
       * vertical-only value and then with the bytes already in dst.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel8_mc12_msa - confirm.
       */
5153  uint32_t row;
5154  uint64_t tp0, tp1;
5155  v16u8 out, dst0 = { 0 };
5156  v16i8 src0, src1, src2, src3, src4, src5, src6;
5157  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5158  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5159  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5160  v8i16 mask3, mask4, mask5;
5161  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /* Halfword shuffle indices pairing the horizontal taps: outer pair
       * (x, x+5), second pair (x+1, x+4), centre pair (x+2, x+3). */
5162  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5163  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5164  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5165  v8i16 minus5h = __msa_ldi_h(-5);
5166  v8i16 plus20h = __msa_ldi_h(20);
5167 
      /* Same shuffles shifted by four columns, for pixels 4..7. */
5168  mask3 = mask0 + 4;
5169  mask4 = mask1 + 4;
5170  mask5 = mask2 + 4;
5171 
      /* Two rows up, two bytes left: context for both filter passes. */
5172  src -= ((2 * stride) + 2);
5173 
      /* Prime the window with five rows. */
5174  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5175  src += (5 * stride);
5176  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5177 
      /* Four iterations, two output rows each. */
5178  for (row = 4; row--;) {
5179  LD_SB2(src, stride, src5, src6);
5180  src += (2 * stride);
5181  XORI_B2_128_SB(src5, src6);
5182 
      /* Vertical pass with taps (1, -5, 20, 20, -5, 1): vt_res0/1 use rows
       * src0..src5 (first output row), vt_res2/3 use src1..src6 (second). */
5183  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5184  vt_res0, vt_res1);
5185  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5186  vt_res2, vt_res3);
      /* Gather tap pairs for the horizontal pass.
       * NOTE(review): presumably indices 0-7 select from the first vector of
       * each pair and 8-15 from the second - confirm in VSHF_H3_SH. */
5187  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5188  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5189  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5190  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5191  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5192  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5193  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5194  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Horizontal pass in 32 bits: outer pair sum, then -5 and +20 terms. */
5195  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5196  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5197  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5198  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5199  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5200  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5201  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5202  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5203  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5204  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values (single pass, shift by 5). */
5205  tmp0 = __msa_srari_h(shf_vec2, 5);
5206  tmp1 = __msa_srari_h(shf_vec5, 5);
5207  tmp2 = __msa_srari_h(shf_vec8, 5);
5208  tmp3 = __msa_srari_h(shf_vec11, 5);
      /* Two existing destination rows of 8 bytes in one vector. */
5209  LD2(dst, stride, tp0, tp1);
5210  INSERT_D2_UB(tp0, tp1, dst0);
5211  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Even elements = left sample of each centre pair (column x+2);
       * tmp0 becomes the first row's 8 pixels, tmp1 the second row's. */
5212  PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
      /* 2-D results narrowed to 16 bits in the same row/pixel order. */
5213  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
      /* Rounded signed average of 2-D and vertical-only values. */
5214  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5215  tmp1 = __msa_aver_s_h(tmp3, tmp1);
5216  out = PCKEV_XORI128_UB(tmp0, tmp1);
      /* Rounded unsigned average with the existing destination rows. */
5217  out = __msa_aver_u_b(out, dst0);
5218  ST_D2(out, 0, 1, dst, stride);
5219  dst += (2 * stride);
5220 
      /* Slide the window down by two rows. */
5221  src0 = src2;
5222  src1 = src3;
5223  src2 = src4;
5224  src3 = src5;
5225  src4 = src6;
5226  }
5227 }
5228 
5230  ptrdiff_t stride)
5231 {
      /*
       * 8-byte-wide, 8-row block, two output rows per loop iteration.
       * Identical in structure to the preceding 8-wide routine, except that
       * the vertical-only blend partner is taken from the odd elements of
       * the centre-pair shuffles (column x+3) instead of the even ones.
       *
       * NOTE(review): the first line of this signature is not visible in
       * this listing; the position in the file suggests
       * ff_avg_h264_qpel8_mc32_msa - confirm.
       */
5232  uint32_t row;
5233  uint64_t tp0, tp1;
5234  v16u8 out, dst0 = { 0 };
5235  v16i8 src0, src1, src2, src3, src4, src5, src6;
5236  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5237  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5238  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5239  v8i16 mask3, mask4, mask5;
5240  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /* Halfword shuffle indices pairing the horizontal taps: outer pair
       * (x, x+5), second pair (x+1, x+4), centre pair (x+2, x+3). */
5241  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5242  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5243  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5244  v8i16 minus5h = __msa_ldi_h(-5);
5245  v8i16 plus20h = __msa_ldi_h(20);
5246 
      /* Same shuffles shifted by four columns, for pixels 4..7. */
5247  mask3 = mask0 + 4;
5248  mask4 = mask1 + 4;
5249  mask5 = mask2 + 4;
5250 
      /* Two rows up, two bytes left: context for both filter passes. */
5251  src -= ((2 * stride) + 2);
5252 
      /* Prime the window with five rows. */
5253  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5254  src += (5 * stride);
5255  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5256 
      /* Four iterations, two output rows each. */
5257  for (row = 4; row--;) {
5258  LD_SB2(src, stride, src5, src6);
5259  src += (2 * stride);
5260  XORI_B2_128_SB(src5, src6);
5261 
      /* Vertical pass with taps (1, -5, 20, 20, -5, 1): vt_res0/1 use rows
       * src0..src5 (first output row), vt_res2/3 use src1..src6 (second). */
5262  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5263  vt_res0, vt_res1);
5264  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5265  vt_res2, vt_res3);
      /* Gather tap pairs for the horizontal pass.
       * NOTE(review): presumably indices 0-7 select from the first vector of
       * each pair and 8-15 from the second - confirm in VSHF_H3_SH. */
5266  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5267  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5268  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5269  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5270  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5271  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5272  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5273  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Horizontal pass in 32 bits: outer pair sum, then -5 and +20 terms. */
5274  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5275  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5276  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5277  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5278  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5279  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5280  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5281  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5282  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5283  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values (single pass, shift by 5). */
5284  tmp0 = __msa_srari_h(shf_vec2, 5);
5285  tmp1 = __msa_srari_h(shf_vec5, 5);
5286  tmp2 = __msa_srari_h(shf_vec8, 5);
5287  tmp3 = __msa_srari_h(shf_vec11, 5);
      /* Two existing destination rows of 8 bytes in one vector. */
5288  LD2(dst, stride, tp0, tp1);
5289  INSERT_D2_UB(tp0, tp1, dst0);
5290  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Odd elements = right sample of each centre pair (column x+3);
       * tmp0 becomes the first row's 8 pixels, tmp1 the second row's. */
5291  tmp0 = __msa_pckod_h(tmp2, tmp0);
5292  tmp1 = __msa_pckod_h(tmp3, tmp1);
      /* 2-D results narrowed to 16 bits in the same row/pixel order. */
5293  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
      /* Rounded signed average of 2-D and vertical-only values. */
5294  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5295  tmp1 = __msa_aver_s_h(tmp3, tmp1);
5296  out = PCKEV_XORI128_UB(tmp0, tmp1);
      /* Rounded unsigned average with the existing destination rows. */
5297  out = __msa_aver_u_b(out, dst0);
5298  ST_D2(out, 0, 1, dst, stride);
5299  dst += (2 * stride);
5300 
      /* Slide the window down by two rows. */
5301  src0 = src2;
5302  src1 = src3;
5303  src2 = src4;
5304  src3 = src5;
5305  src4 = src6;
5306  }
5307 }
5308 
5310  ptrdiff_t stride)
5311 {
      /*
       * NOTE(review): the first line of this signature (return type, function
       * name and the dst/src parameters) is missing from this listing; only
       * "ptrdiff_t stride)" survives.  The body reads src and reads/writes dst.
       *
       * Summary of what the body does:
       *  1. vertical 6-tap pass over nine source rows (AVC_DOT_SH3_SH),
       *  2. horizontal 6-tap pass over those 16-bit results (hadd + DPADD with
       *     -5 / 20), rounded by 10 bits,
       *  3. the vertical-only results at the even halfwords of the mask2
       *     shuffle, rounded by 5 bits,
       *  4. signed rounding average of (2) and (3), then an unsigned rounding
       *     average with the bytes already in dst, stored as four 32-bit rows.
       */
5312  uint32_t tp0, tp1, tp2, tp3;
      /*
       * Each 16-bit constant holds two signed 8-bit taps (low byte, high byte):
       * 0xfb01 = (1, -5), 0x1414 = (20, 20), 0x01fb = (-5, 1).
       */
5313  const int16_t filt_const0 = 0xfb01;
5314  const int16_t filt_const1 = 0x1414;
5315  const int16_t filt_const2 = 0x1fb;
5316  v16u8 out, dstv = { 0 };
5317  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5318  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5319  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5320  v16i8 src76_l, src87_l, filt0, filt1, filt2;
5321  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5322  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5323  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing elements (k, k+5), (k+1, k+4) and
       * (k+2, k+3) for k = 0..3: the outer, middle and inner tap pairs of a
       * six-element window.
       */
5324  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5325  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5326  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5327  v8i16 minus5h = __msa_ldi_h(-5);
5328  v8i16 plus20h = __msa_ldi_h(20);
5329  v8i16 zeros = { 0 };
5330 
5331  filt0 = (v16i8) __msa_fill_h(filt_const0);
5332  filt1 = (v16i8) __msa_fill_h(filt_const1);
5333  filt2 = (v16i8) __msa_fill_h(filt_const2);
5334 
      /* Start two rows above and two bytes to the left of src. */
5335  src -= ((2 * stride) + 2);
5336 
      /*
       * Nine consecutive rows are loaded (five, then four more).
       * XORI_B*_128_SB presumably flips the sign bit of every byte so the
       * samples can be handled as signed values - confirm against the macro.
       */
5337  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5338  src += (5 * stride);
5339  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5340  LD_SB4(src, stride, src5, src6, src7, src8);
5341  XORI_B4_128_SB(src5, src6, src7, src8);
5342 
      /*
       * Interleave every row with the following row; the _r and _l results
       * are kept separately (presumably the right and left halves of the
       * byte interleave).
       */
5343  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5344  src32_r, src43_r);
5345  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5346  src76_r, src87_r);
5347  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5348  src32_l, src43_l);
5349  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5350  src76_l, src87_l);
      /* Vertical pass: vt_res0/1 use rows 0..5, vt_res2/3 use rows 1..6. */
5351  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5352  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5353  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5354  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5355  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5356  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5357  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5358  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
      /*
       * Horizontal pass on the vertical results: hadd sums the outer pair,
       * DPADD accumulates -5 * middle pair and 20 * inner pair into 32 bits.
       */
5359  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5360  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5361  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5362  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5363 
      /*
       * Same two passes for rows 2..7 and 3..8.  The mask2 shuffles go to
       * shf_vec6/7 so that shf_vec2/5 from the first half stay available.
       */
5364  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5365  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5366  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5367  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5368  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5369  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5370  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5371  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5372  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5373  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5374  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5375  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5376 
      /*
       * Rounding shift of the two-pass sums by 10 bits, then saturation
       * (SAT_*(..., 7) - presumably to the signed 8-bit range, confirm).
       */
5377  SRARI_W2_SW(hz_res0, hz_res1, 10);
5378  SAT_SW2_SW(hz_res0, hz_res1, 7);
5379  SRARI_W2_SW(hz_res2, hz_res3, 10);
5380  SAT_SW2_SW(hz_res2, hz_res3, 7);
5381 
      /*
       * shf_vec2/5/6/7 hold vertical results at element pairs (2,3), (3,4),
       * (4,5), (5,6); a rounding shift by 5 bits scales the single-pass values.
       */
5382  dst0 = __msa_srari_h(shf_vec2, 5);
5383  dst1 = __msa_srari_h(shf_vec5, 5);
5384  dst2 = __msa_srari_h(shf_vec6, 5);
5385  dst3 = __msa_srari_h(shf_vec7, 5);
5386 
5387  SAT_SH2_SH(dst0, dst1, 7);
5388  SAT_SH2_SH(dst2, dst3, 7);
      /*
       * Keep the even halfword of each pair (elements 2, 3, 4, 5), combined
       * with zeros so each value occupies one 32-bit lane - presumably in the
       * low half; confirm ILVEV_H2_SH operand order.
       */
5389  ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5390  ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
5391 
      /* Signed rounding average of the two-pass and vertical-only values. */
5392  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5393  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5394  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5395  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5396 
      /*
       * Fetch the four 32-bit rows currently in dst, pack the results to bytes
       * (PCKEV_XORI128_UB presumably also undoes the 128 bias), average with
       * the existing dst bytes and store four 32-bit rows back.
       */
5397  LW4(dst, stride, tp0, tp1, tp2, tp3);
5398  INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5399  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5400  out = PCKEV_XORI128_UB(dst0, dst2);
5401  out = __msa_aver_u_b(out, dstv);
5402  ST_W4(out, 0, 1, 2, 3, dst, stride);
5403 }
5404 
5406  ptrdiff_t stride)
5407 {
      /*
       * NOTE(review): the first line of this signature (return type, function
       * name and the dst/src parameters) is missing from this listing; only
       * "ptrdiff_t stride)" survives.  The body reads src and reads/writes dst.
       *
       * Summary of what the body does:
       *  1. vertical 6-tap pass over nine source rows (AVC_DOT_SH3_SH),
       *  2. horizontal 6-tap pass over those 16-bit results (hadd + DPADD with
       *     -5 / 20), rounded by 10 bits,
       *  3. the vertical-only results at the odd halfwords of the mask2
       *     shuffle (__msa_ilvod_h), rounded by 5 bits,
       *  4. signed rounding average of (2) and (3), then an unsigned rounding
       *     average with the bytes already in dst, stored as four 32-bit rows.
       */
5408  uint32_t tp0, tp1, tp2, tp3;
      /*
       * Each 16-bit constant holds two signed 8-bit taps (low byte, high byte):
       * 0xfb01 = (1, -5), 0x1414 = (20, 20), 0x01fb = (-5, 1).
       */
5409  const int16_t filt_const0 = 0xfb01;
5410  const int16_t filt_const1 = 0x1414;
5411  const int16_t filt_const2 = 0x1fb;
5412  v16u8 out, dstv = { 0 };
5413  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5414  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5415  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5416  v16i8 src76_l, src87_l, filt0, filt1, filt2;
5417  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5418  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5419  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing elements (k, k+5), (k+1, k+4) and
       * (k+2, k+3) for k = 0..3: the outer, middle and inner tap pairs of a
       * six-element window.
       */
5420  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5421  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5422  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5423  v8i16 minus5h = __msa_ldi_h(-5);
5424  v8i16 plus20h = __msa_ldi_h(20);
5425  v8i16 zeros = { 0 };
5426 
5427  filt0 = (v16i8) __msa_fill_h(filt_const0);
5428  filt1 = (v16i8) __msa_fill_h(filt_const1);
5429  filt2 = (v16i8) __msa_fill_h(filt_const2);
5430 
      /* Start two rows above and two bytes to the left of src. */
5431  src -= ((2 * stride) + 2);
5432 
      /*
       * Nine consecutive rows are loaded (five, then four more).
       * XORI_B*_128_SB presumably flips the sign bit of every byte so the
       * samples can be handled as signed values - confirm against the macro.
       */
5433  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5434  src += (5 * stride);
5435  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5436  LD_SB4(src, stride, src5, src6, src7, src8);
5437  XORI_B4_128_SB(src5, src6, src7, src8);
5438 
      /*
       * Interleave every row with the following row; the _r and _l results
       * are kept separately (presumably the right and left halves of the
       * byte interleave).
       */
5439  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5440  src32_r, src43_r);
5441  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5442  src76_r, src87_r);
5443  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5444  src32_l, src43_l);
5445  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5446  src76_l, src87_l);
      /* Vertical pass: vt_res0/1 use rows 0..5, vt_res2/3 use rows 1..6. */
5447  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5448  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5449  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5450  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5451  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5452  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5453  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5454  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
      /*
       * Horizontal pass on the vertical results: hadd sums the outer pair,
       * DPADD accumulates -5 * middle pair and 20 * inner pair into 32 bits.
       */
5455  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5456  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5457  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5458  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5459 
      /*
       * Same two passes for rows 2..7 and 3..8.  The mask2 shuffles go to
       * shf_vec6/7 so that shf_vec2/5 from the first half stay available.
       */
5460  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5461  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5462  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5463  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5464  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5465  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5466  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5467  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5468  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5469  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5470  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5471  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5472 
      /*
       * Rounding shift of the two-pass sums by 10 bits, then saturation
       * (SAT_*(..., 7) - presumably to the signed 8-bit range, confirm).
       */
5473  SRARI_W2_SW(hz_res0, hz_res1, 10);
5474  SAT_SW2_SW(hz_res0, hz_res1, 7);
5475  SRARI_W2_SW(hz_res2, hz_res3, 10);
5476  SAT_SW2_SW(hz_res2, hz_res3, 7);
5477 
      /*
       * shf_vec2/5/6/7 hold vertical results at element pairs (2,3), (3,4),
       * (4,5), (5,6); a rounding shift by 5 bits scales the single-pass values.
       */
5478  dst0 = __msa_srari_h(shf_vec2, 5);
5479  dst1 = __msa_srari_h(shf_vec5, 5);
5480  dst2 = __msa_srari_h(shf_vec6, 5);
5481  dst3 = __msa_srari_h(shf_vec7, 5);
5482 
5483  SAT_SH2_SH(dst0, dst1, 7);
5484  SAT_SH2_SH(dst2, dst3, 7);
5485 
      /*
       * Keep the odd halfword of each pair (elements 3, 4, 5, 6), interleaved
       * with the odd halfwords of zeros so each value sits in one 32-bit lane.
       */
5486  dst0 = __msa_ilvod_h(zeros, dst0);
5487  dst1 = __msa_ilvod_h(zeros, dst1);
5488  dst2 = __msa_ilvod_h(zeros, dst2);
5489  dst3 = __msa_ilvod_h(zeros, dst3);
5490 
      /* Signed rounding average of the two-pass and vertical-only values. */
5491  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5492  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5493  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5494  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5495 
      /*
       * Fetch the four 32-bit rows currently in dst, pack the results to bytes
       * (PCKEV_XORI128_UB presumably also undoes the 128 bias), average with
       * the existing dst bytes and store four 32-bit rows back.
       */
5496  LW4(dst, stride, tp0, tp1, tp2, tp3);
5497  INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5498  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5499  out = PCKEV_XORI128_UB(dst0, dst2);
5500  out = __msa_aver_u_b(out, dstv);
5501  ST_W4(out, 0, 1, 2, 3, dst, stride);
5502 }
5503 
5505  ptrdiff_t stride)
5506 {
      /*
       * NOTE(review): the first line of this signature (return type, function
       * name and the dst/src parameters) is missing from this listing; only
       * "ptrdiff_t stride)" survives.  The body reads src and reads/writes dst.
       *
       * Filters horizontally first (AVC_HORZ_FILTER_SH), then vertically over
       * the 16-bit row results (AVC_DOT_SW3_SW), and averages the packed bytes
       * with what is already in dst.  The work is split into two passes that
       * are 8 bytes apart, each producing 4 x 4 = 16 output rows.
       */
      /*
       * Each 32-bit constant holds two signed 16-bit taps (low, high halfword):
       * 0xfffb0001 = (1, -5), 0x00140014 = (20, 20), 0x0001fffb = (-5, 1).
       */
5507  const int32_t filt_const0 = 0xfffb0001;
5508  const int32_t filt_const1 = 0x140014;
5509  const int32_t filt_const2 = 0x1fffb;
      /* Source window starts two rows above and two bytes left of src. */
5510  const uint8_t *src_tmp = src - (2 * stride) - 2;
5511  uint8_t *dst_tmp = dst;
5512  uint64_t tp0, tp1, tp2, tp3;
5513  uint32_t multiple8_cnt, loop_cnt;
5514  v16u8 dst0, dst1, out0, out1;
5515  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5516  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5517  v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5518  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5519  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5520  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5521  v8i16 hz_out87_l, filt0, filt1, filt2;
5522  v4i32 tmp0, tmp1;
5523 
5524  filt0 = (v8i16) __msa_fill_w(filt_const0);
5525  filt1 = (v8i16) __msa_fill_w(filt_const1);
5526  filt2 = (v8i16) __msa_fill_w(filt_const2);
5527 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
5528  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5529 
      /* Two passes, one per 8-byte-wide half (src_tmp/dst_tmp advance by 8). */
5530  for (multiple8_cnt = 2; multiple8_cnt--;) {
5531  src = src_tmp;
5532  dst = dst_tmp;
5533 
      /*
       * Prime the vertical window with the first five horizontally filtered
       * rows.  XORI_B5_128_SB presumably flips the sign bit of every byte -
       * confirm against the macro.
       */
5534  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5535  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5536  src += (5 * stride);
5537 
5538  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5539  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5540  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5541  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5542  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5543 
      /* Four iterations, each consuming four new rows and writing four. */
5544  for (loop_cnt = 4; loop_cnt--;) {
5545  LD_SB4(src, stride, src0, src1, src2, src3);
5546  XORI_B4_128_SB(src0, src1, src2, src3);
5547  src += (4 * stride);
5548 
5549  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5550  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5551  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5552  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
      /*
       * Interleave each filtered row with the next one so that a single
       * dot product covers two vertical taps; _r and _l results are kept
       * separately.
       */
5553  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5554  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5555  hz_out43_r);
5556  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5557  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5558  hz_out43_l);
5559  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5560  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5561  hz_out87_r);
5562  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5563  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
5564  hz_out87_l);
5565 
      /*
       * Vertical pass: res0..res3 use window rows 0..5, 1..6, 2..7 and 3..8.
       * The even halfwords of the two 32-bit results are packed into one
       * vector per output row.
       */
5566  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
5567  filt1, filt2);
5568  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
5569  filt1, filt2);
5570  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5571  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
5572  filt1, filt2);
5573  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
5574  filt1, filt2);
5575  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5576  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
5577  filt1, filt2);
5578  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
5579  filt1, filt2);
5580  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5581  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
5582  filt1, filt2);
5583  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
5584  filt1, filt2);
5585  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5586 
      /*
       * Fetch four 64-bit rows from dst, pack the results to bytes
       * (PCKEV_XORI128_UB presumably also undoes the 128 bias), average
       * with the existing dst bytes and store four 64-bit rows back.
       */
5587  LD4(dst, stride, tp0, tp1, tp2, tp3);
5588  INSERT_D2_UB(tp0, tp1, dst0);
5589  INSERT_D2_UB(tp2, tp3, dst1);
5590  out0 = PCKEV_XORI128_UB(res0, res1);
5591  out1 = PCKEV_XORI128_UB(res2, res3);
5592  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
5593  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
5594  dst += (4 * stride);
5595 
      /* Slide the window: the last five rows seed the next iteration. */
5596  hz_out0 = hz_out4;
5597  hz_out1 = hz_out5;
5598  hz_out2 = hz_out6;
5599  hz_out3 = hz_out7;
5600  hz_out4 = hz_out8;
5601  }
5602 
5603  src_tmp += 8;
5604  dst_tmp += 8;
5605  }
5606 }
5607 
5609  ptrdiff_t stride)
5610 {
      /*
       * NOTE(review): the first line of this signature (return type, function
       * name and the dst/src parameters) is missing from this listing; only
       * "ptrdiff_t stride)" survives.  The body reads src and reads/writes dst.
       *
       * Filters horizontally first (AVC_HORZ_FILTER_SH), then vertically over
       * the 16-bit row results (AVC_DOT_SW3_SW), and averages the packed bytes
       * with what is already in dst.  Thirteen source rows are consumed and
       * eight 64-bit rows are written, in two unrolled batches of four.
       */
      /*
       * Each 32-bit constant holds two signed 16-bit taps (low, high halfword):
       * 0xfffb0001 = (1, -5), 0x00140014 = (20, 20), 0x0001fffb = (-5, 1).
       */
5611  const int32_t filt_const0 = 0xfffb0001;
5612  const int32_t filt_const1 = 0x140014;
5613  const int32_t filt_const2 = 0x1fffb;
5614  uint64_t tp0, tp1, tp2, tp3;
5615  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5616  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5617  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5618  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
5619  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5620  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5621  v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5622  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5623  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5624  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
5625  v4i32 tmp0, tmp1;
5626 
5627  filt0 = (v8i16) __msa_fill_w(filt_const0);
5628  filt1 = (v8i16) __msa_fill_w(filt_const1);
5629  filt2 = (v8i16) __msa_fill_w(filt_const2);
5630 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
5631  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5632 
      /*
       * Start two rows above and two bytes left of src and load rows 0..4.
       * XORI_B5_128_SB presumably flips the sign bit of every byte - confirm
       * against the macro.
       */
5633  src -= ((2 * stride) + 2);
5634  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5635  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5636  src += (5 * stride);
5637 
5638  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5639  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5640  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5641  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5642  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5643 
      /* Rows 5..8; the src registers are reused for each new group of rows. */
5644  LD_SB4(src, stride, src0, src1, src2, src3);
5645  XORI_B4_128_SB(src0, src1, src2, src3);
5646  src += (4 * stride);
5647  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5648  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5649  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5650  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
      /*
       * Interleave each filtered row with the next one so that a single dot
       * product covers two vertical taps; _r and _l results are kept
       * separately.
       */
5651  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5652  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5653  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5655  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5656  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5657  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
5659 
      /* First batch: res0..res3 use rows 0..5, 1..6, 2..7 and 3..8. */
5660  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5661  filt2);
5662  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5663  filt2);
5664  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5665  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5666  filt2);
5667  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5668  filt2);
5669  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5670  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5671  filt2);
5672  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5673  filt2);
5674  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5675  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5676  filt2);
5677  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5678  filt2);
5679  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /*
       * Fetch four 64-bit rows from dst, pack the results to bytes
       * (PCKEV_XORI128_UB presumably also undoes the 128 bias) and average.
       * dst0/dst1 are reused to hold the averaged output that is stored.
       */
5680  LD4(dst, stride, tp0, tp1, tp2, tp3);
5681  INSERT_D2_UB(tp0, tp1, dst0);
5682  INSERT_D2_UB(tp2, tp3, dst1);
5683  out0 = PCKEV_XORI128_UB(res0, res1);
5684  out1 = PCKEV_XORI128_UB(res2, res3);
5685  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5686  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
5687  dst += (4 * stride);
5688 
      /* Rows 9..12; src is not advanced afterwards because nothing follows. */
5689  LD_SB4(src, stride, src0, src1, src2, src3);
5690  XORI_B4_128_SB(src0, src1, src2, src3);
5691  hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5692  hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5693  hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5694  hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
      /*
       * hz_out89_* and hz_out910_* take their operands in the same
       * (later row, earlier row) order as the other interleaves, despite the
       * digit order in their names.
       */
5695  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5696  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5697  hz_out1211_r);
5698  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5699  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
5700  hz_out1211_l);
      /* Second batch: res0..res3 use rows 4..9, 5..10, 6..11 and 7..12. */
5701  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5702  filt2);
5703  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5704  filt2);
5705  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5706  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5707  filt2);
5708  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5709  filt2);
5710  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5711  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5712  filt2);
5713  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5714  filt2);
5715  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5716  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5717  filt2);
5718  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5719  filt2);
5720  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /* Average with the next four dst rows and store them. */
5721  LD4(dst, stride, tp0, tp1, tp2, tp3);
5722  INSERT_D2_UB(tp0, tp1, dst0);
5723  INSERT_D2_UB(tp2, tp3, dst1);
5724  out0 = PCKEV_XORI128_UB(res0, res1);
5725  out1 = PCKEV_XORI128_UB(res2, res3);
5726  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5727  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
5728 }
5729 
5731  ptrdiff_t stride)
5732 {
      /*
       * NOTE(review): the first line of this signature (return type, function
       * name and the dst/src parameters) is missing from this listing; only
       * "ptrdiff_t stride)" survives.  The body reads src and reads/writes dst.
       *
       * Filters horizontally first (AVC_HORZ_FILTER_SH), then vertically over
       * the 16-bit row results (AVC_DOT_SW3_SW), and averages the packed bytes
       * with what is already in dst.  Nine source rows are consumed and four
       * 32-bit rows are written.
       */
      /*
       * Each 32-bit constant holds two signed 16-bit taps (low, high halfword):
       * 0xfffb0001 = (1, -5), 0x00140014 = (20, 20), 0x0001fffb = (-5, 1).
       */
5733  const int32_t filt_const0 = 0xfffb0001;
5734  const int32_t filt_const1 = 0x140014;
5735  const int32_t filt_const2 = 0x1fffb;
5736  uint32_t tp0, tp1, tp2, tp3;
5737  v16u8 res, dst0 = { 0 };
5738  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5739  v16i8 mask0, mask1, mask2;
5740  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5741  v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
5742  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5743  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
5744  v4i32 tmp0, tmp1;
5745 
      /*
       * Offset 48 selects the "4 width cases" rows of luma_mask_arr, whose
       * second halves use indices 16 and above - presumably addressing the
       * second vector passed to the shuffle, so one call filters two rows.
       */
5746  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
5747 
5748  filt0 = (v8i16) __msa_fill_w(filt_const0);
5749  filt1 = (v8i16) __msa_fill_w(filt_const1);
5750  filt2 = (v8i16) __msa_fill_w(filt_const2);
5751 
      /* Start two rows above and two bytes to the left of src. */
5752  src -= ((2 * stride) + 2);
5753 
      /* Nine consecutive rows are loaded (five, then four more). */
5754  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5755  src += (5 * stride);
5756  LD_SB4(src, stride, src5, src6, src7, src8);
5757 
      /*
       * XORI_B*_128_SB presumably flips the sign bit of every byte so the
       * samples can be handled as signed values - confirm against the macro.
       */
5758  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5759  XORI_B4_128_SB(src5, src6, src7, src8);
      /*
       * Rows are filtered in pairs (0,1), (2,3), (4,5), (6,7); row 8 is paired
       * with itself.
       */
5760  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
5761  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
5762  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
5763  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
5764  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /*
       * Derive hz_out1/3/5/7 from the odd doubleword of hz_out0/2/4/6 -
       * presumably the second row of each filtered pair (confirm PCKOD_D2_SH).
       */
5765  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
5766  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
      /* Only the ILVR (right-half) interleaves are formed in this routine. */
5767  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5768  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5769  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5770  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5771 
      /*
       * Vertical pass: rows 0..5 and 1..6 are packed into res0, rows 2..7 and
       * 3..8 into res1.
       */
5772  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5773  filt2);
5774  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5775  filt2);
5776  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5777  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5778  filt2);
5779  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5780  filt2);
5781  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /*
       * Fetch the four 32-bit rows currently in dst, pack the results to bytes
       * (PCKEV_XORI128_UB presumably also undoes the 128 bias), average with
       * the existing dst bytes and store four 32-bit rows back.
       */
5782  LW4(dst, stride, tp0, tp1, tp2, tp3);
5783  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
5784  res = PCKEV_XORI128_UB(res0, res1);
5785  res = __msa_aver_u_b(res, dst0);
5786  ST_W4(res, 0, 1, 2, 3, dst, stride);
5787 }
void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)