FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
h264qpel_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h264dsp_mips.h"
23 
/*
 * Byte-shuffle index table: six rows of 16 indices, aligned to 0x40 bytes.
 *
 * The functions in this file load three consecutive rows at a time with
 * LD_SB3(..., 16, mask0, mask1, mask2): offset 0 for the 8-wide paths and
 * offset 48 for the 4-wide paths.
 *
 * Within each group of three rows the index pairs are
 *   row 0: (i,     i + 5)  - outer taps, summed unscaled by the filter macro
 *   row 1: (i + 1, i + 4)  - multiplied by -5 in AVC_HORZ_FILTER_SH
 *   row 2: (i + 2, i + 3)  - multiplied by 20 in AVC_HORZ_FILTER_SH
 *
 * NOTE(review): in the 4-width rows the upper eight entries use indices
 * 16 and above, which presumably select bytes from the second source vector
 * handed to __msa_vshf_b - confirm against the MSA instruction reference.
 */
24 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
25  /* 8 width cases */
26  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
29 
30  /* 4 width cases */
31  0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32  1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33  2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
34 };
35 
/*
 * Accumulate a 6-input weighted sum from six byte vectors into two
 * output vectors (out1, out2).
 *
 * The inputs are combined in mirrored pairs:
 *   (vec0, vec5) are interleaved and pair-summed with weight 1,
 *   (vec1, vec4) are interleaved and accumulated with weight -5,
 *   (vec2, vec3) are interleaved and accumulated with weight 20.
 *
 * This is a brace-enclosed statement block, not an expression: results are
 * written to out1 and out2, which must be assignable lvalues. No rounding
 * or saturation is applied here.
 *
 * NOTE(review): ILVRL_B2_SB, HADD_SB2_SH and DPADD_SB2_SH are defined outside
 * this file; out1/out2 presumably hold the right- and left-half results as
 * 16-bit lanes - confirm against those macro definitions.
 */
36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
37  out1, out2) \
38 { \
39  v16i8 tmp0_m, tmp1_m; \
40  v16i8 minus5b_m = __msa_ldi_b(-5); \
41  v16i8 plus20b_m = __msa_ldi_b(20); \
42  \
43  ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
44  HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
45  ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
46  DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
47  ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
48  DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
49 }
50 
/*
 * Horizontal filter step, written as a statement expression that yields a
 * v8i16 value.
 *
 * in0 and in1 are shuffled three times with __msa_vshf_b, once per mask:
 *   mask0 result: adjacent bytes are pair-summed (__msa_hadd_s_h),
 *   mask1 result: accumulated as a byte dot product with -5,
 *   mask2 result: accumulated as a byte dot product with 20.
 *
 * The returned sum is neither rounded nor saturated; every caller in this
 * file follows it with a rounding shift by 5 and a saturate step.
 *
 * The masks are expected to come from luma_mask_arr (that is how every
 * caller in this file obtains them).
 */
51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
52 ( { \
53  v8i16 out0_m; \
54  v16i8 tmp0_m; \
55  v16i8 minus5b = __msa_ldi_b(-5); \
56  v16i8 plus20b = __msa_ldi_b(20); \
57  \
58  tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
59  out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
60  \
61  tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
62  out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
63  \
64  tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
65  out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
66  \
67  out0_m; \
68 } )
69 
/*
 * Three-term signed byte dot product, written as a statement expression
 * that yields a v8i16 value:
 *
 *   result = dotp(in0, coeff0) + dotp(in1, coeff1) + dotp(in2, coeff2)
 *
 * All six arguments are cast to v16i8 before use. The result is neither
 * rounded nor saturated; callers in this file apply a rounding shift by 5
 * and a saturate step afterwards.
 *
 * In this file the inputs are byte-interleaved pairs of rows and the
 * coefficients are vectors filled with a 16-bit constant that packs two
 * signed 8-bit taps.
 */
70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
71 ( { \
72  v8i16 out0_m; \
73  \
74  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
75  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
76  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
77  \
78  out0_m; \
79 } )
80 
/*
 * Three-term signed halfword dot product, written as a statement expression
 * that yields a v4i32 value:
 *
 *   sum    = dotp(in0, coeff0) + dotp(in1, coeff1) + dotp(in2, coeff2)
 *   result = saturate(rounding_shift_right(sum, 10))
 *
 * All six arguments are cast to v8i16 before use. Unlike AVC_DOT_SH3_SH,
 * the rounding shift (by 10) and the saturate step are done inside the macro.
 *
 * NOTE(review): the saturate argument 7 presumably clamps each lane to the
 * signed 8-bit range - confirm against the __msa_sat_s_w documentation.
 */
81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
82 ( { \
83  v4i32 out0_m; \
84  \
85  out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
86  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
87  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
88  out0_m = __msa_srari_w(out0_m, 10); \
89  out0_m = __msa_sat_s_w(out0_m, 7); \
90  out0_m; \
91 } )
92 
/*
 * Produce one 4x4 block as the rounded average of a horizontally filtered
 * block and a vertically filtered block.
 *
 * src_x   rows fed to the horizontal filter; 4 rows are loaded, stride apart
 * src_y   rows fed to the vertical filter; 9 rows are loaded, stride apart
 * dst     destination; written by ST4x4_UB, stride apart
 * stride  byte distance between rows, shared by src_x, src_y and dst
 *
 * Each filter result is rounded with a shift by 5 and saturated before the
 * two are added and halved with rounding. Any offset of src_x / src_y
 * relative to the block origin is the caller's responsibility; this
 * function uses the pointers as given.
 */
93 static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
94  uint8_t *dst, int32_t stride)
95 {
    /*
     * Each 16-bit constant packs two signed 8-bit taps: 0xfb is -5, 0x14 is
     * 20 and 0x01 is 1 when read as signed bytes. Which tap lands in which
     * byte of a lane depends on the target's byte order.
     */
96  const int16_t filt_const0 = 0xfb01;
97  const int16_t filt_const1 = 0x1414;
98  const int16_t filt_const2 = 0x1fb;
99  v16u8 out;
100  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102  v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103  v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104  v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
105 
106  filt0 = (v16i8) __msa_fill_h(filt_const0);
107  filt1 = (v16i8) __msa_fill_h(filt_const1);
108  filt2 = (v16i8) __msa_fill_h(filt_const2);
109 
    /* Offset 48 selects the "4 width cases" rows of luma_mask_arr. */
110  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
111 
    /* First five rows for the vertical filter. */
112  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
113  src_y += (5 * stride);
114 
    /*
     * Copy word 0 of the following row into word 1 of each row, so that one
     * vector carries the first four bytes of two consecutive rows.
     */
115  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
119 
    /* NOTE(review): presumably XORs each byte with 128 to move the unsigned
     * samples into the signed range - macro is defined elsewhere. */
120  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
121 
    /* Horizontal filter: two rows are processed per AVC_HORZ_FILTER_SH call. */
122  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
123  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
124  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
125  hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
126 
    /* Round (shift right by 5 with rounding) and saturate the horizontal result. */
127  SRARI_H2_SH(hz_out0, hz_out1, 5);
128  SAT_SH2_SH(hz_out0, hz_out1, 7);
129 
    /* Remaining four rows for the vertical filter (nine rows in total). */
130  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
131 
132  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
136 
137  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
    /* Interleave row pairs so each dot product sees two vertically adjacent
     * samples side by side. */
138  ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139  ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
140  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
141  filt2);
142  vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
143  filt2);
144  SRARI_H2_SH(vt_out0, vt_out1, 5);
145  SAT_SH2_SH(vt_out0, vt_out1, 7);
146 
    /* Average the two filtered blocks: (hz + vt) shifted right by 1 with rounding. */
147  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
149 
150  SAT_SH2_SH(out0, out1, 7);
    /* NOTE(review): presumably packs to bytes and undoes the earlier XOR with
     * 128 - macro is defined elsewhere. */
151  out = PCKEV_XORI128_UB(out0, out1);
152  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
153 }
154 
/*
 * Produce one 8x8 block as the rounded average of a horizontally filtered
 * block and a vertically filtered block.
 *
 * src_x   rows fed to the horizontal filter; 8 rows are loaded, stride apart
 * src_y   rows fed to the vertical filter; 13 rows are loaded, stride apart
 * dst     destination; written by two ST8x4_UB stores, stride apart
 * stride  byte distance between rows, shared by src_x, src_y and dst
 *
 * The work is unrolled into two passes of four output rows each. The
 * interleaved row pairs built in the first pass (src_vt54_r .. src_vt87_r)
 * are reused by the second pass.
 */
155 static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
156  uint8_t *dst, int32_t stride)
157 {
    /*
     * Each 16-bit constant packs two signed 8-bit taps: 0xfb is -5, 0x14 is
     * 20 and 0x01 is 1 when read as signed bytes. Which tap lands in which
     * byte of a lane depends on the target's byte order.
     */
158  const int16_t filt_const0 = 0xfb01;
159  const int16_t filt_const1 = 0x1414;
160  const int16_t filt_const2 = 0x1fb;
161  v16u8 out0, out1;
162  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164  v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167  v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169  v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
170 
171  filt0 = (v16i8) __msa_fill_h(filt_const0);
172  filt1 = (v16i8) __msa_fill_h(filt_const1);
173  filt2 = (v16i8) __msa_fill_h(filt_const2);
174 
    /* Offset 0 selects the "8 width cases" rows of luma_mask_arr. */
175  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* First five rows for the vertical filter. */
176  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
177  src_y += (5 * stride);
178 
    /* NOTE(review): presumably XORs each byte with 128 to move the unsigned
     * samples into the signed range - macro is defined elsewhere. */
179  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
180 
    /* ---- Pass 1: output rows 0..3 ---- */
181  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
182  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
183  src_x += (4 * stride);
184 
    /* One row per call: the same vector is passed as both shuffle sources. */
185  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
186  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
187  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
188  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
189 
    /* Round (shift right by 5 with rounding) and saturate the horizontal result. */
190  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
192 
193  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
194  src_y += (4 * stride);
195  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
196 
    /* Interleave every pair of consecutive rows for the vertical dot products. */
197  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198  src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200  src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
201  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
202  filt2);
203  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
204  filt2);
205  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
206  filt2);
207  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
208  filt2);
209  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
211 
    /* Average the two filtered blocks: (hz + vt) shifted right by 1 with rounding. */
212  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
216 
    /* Horizontal source rows for pass 2 are loaded before pass 1 is stored. */
217  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
218  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
219 
220  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
221  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
222  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
223  ST8x4_UB(out0, out1, dst, stride);
224  dst += (4 * stride);
225 
    /* ---- Pass 2: output rows 4..7 ---- */
226  LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
227  XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
228 
229  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
230  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
231  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
232  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
233 
234  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
236 
    /* Only the four new row pairs are interleaved; the rest come from pass 1. */
237  ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238  src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
239  src_vt1211_r);
240  vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
241  filt2);
242  vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
243  filt2);
244  vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
245  filt2);
246  vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
247  filt1, filt2);
248  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
250 
251  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
255 
256  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
257  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
258  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
259  ST8x4_UB(out0, out1, dst, stride);
    /* NOTE(review): dst is not read again after this increment, so it has no
     * visible effect. */
260  dst += (4 * stride);
261 }
262 
/*
 * Produce one 16x16 block as the rounded average of a horizontally filtered
 * block and a vertically filtered block.
 *
 * src_x   rows fed to the horizontal filter, stride apart
 * src_y   rows fed to the vertical filter, stride apart
 * dst     destination, stride apart
 * stride  byte distance between rows, shared by src_x, src_y and dst
 *
 * The block is handled as two 8-column halves (outer loop, pointers advance
 * by 8 bytes), each processed in four steps of four rows (inner loop). Per
 * half, 16 rows are loaded from src_x and 21 rows (5 + 4 * 4) from src_y.
 */
263 static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
264  const uint8_t *src_y, uint8_t *dst,
265  int32_t stride)
266 {
    /*
     * Each 16-bit constant packs two signed 8-bit taps: 0xfb is -5, 0x14 is
     * 20 and 0x01 is 1 when read as signed bytes. Which tap lands in which
     * byte of a lane depends on the target's byte order.
     */
267  const int16_t filt_const0 = 0xfb01;
268  const int16_t filt_const1 = 0x1414;
269  const int16_t filt_const2 = 0x1fb;
    /* Column cursors; the parameters are re-seeded from these for each half. */
270  const uint8_t *src_x_tmp = src_x;
271  const uint8_t *src_y_tmp = src_y;
272  uint8_t *dst_tmp = dst;
273  uint32_t multiple8_cnt, loop_cnt;
274  v16u8 tmp0, tmp1;
275  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
276  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
277  v16i8 src_vt7, src_vt8;
278  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
279  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
280  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
281  v8i16 vt_out3, out0, out1, out2, out3;
282 
283  filt0 = (v16i8) __msa_fill_h(filt_const0);
284  filt1 = (v16i8) __msa_fill_h(filt_const1);
285  filt2 = (v16i8) __msa_fill_h(filt_const2);
286 
    /* Offset 0 selects the "8 width cases" rows of luma_mask_arr. */
287  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
288 
    /* Two iterations: left 8 columns, then right 8 columns. */
289  for (multiple8_cnt = 2; multiple8_cnt--;) {
290  src_x = src_x_tmp;
291  src_y = src_y_tmp;
292  dst = dst_tmp;
293 
    /* Prime the vertical filter with the first five rows of this half. */
294  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
295  src_y += (5 * stride);
296 
    /* NOTE(review): presumably XORs each byte with 128 to move the unsigned
     * samples into the signed range - macro is defined elsewhere. */
297  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
298 
    /* Four iterations of four output rows each. */
299  for (loop_cnt = 4; loop_cnt--;) {
300  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
301  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
302  src_x += (4 * stride);
303 
    /* One row per call: the same vector is passed as both shuffle sources. */
304  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
305  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
306  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
307  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
308  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
309  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
310 
311  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
312  src_y += (4 * stride);
313 
314  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
    /* Interleave every pair of consecutive rows for the vertical dot products. */
315  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
316  src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
317  src_vt43_r);
318  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
319  src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
320  src_vt87_r);
321  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
322  filt1, filt2);
323  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
324  filt1, filt2);
325  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
326  filt1, filt2);
327  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
328  filt1, filt2);
329  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
330  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
331 
    /* Average the two filtered blocks: (hz + vt) shifted right by 1 with rounding. */
332  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
333  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
334  out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
335  out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
336 
337  SAT_SH4_SH(out0, out1, out2, out3, 7);
338  tmp0 = PCKEV_XORI128_UB(out0, out1);
339  tmp1 = PCKEV_XORI128_UB(out2, out3);
340  ST8x4_UB(tmp0, tmp1, dst, stride);
341  dst += (4 * stride);
342 
    /* Slide the vertical window: the last five rows become the first five
     * of the next iteration, so only four new rows are loaded each time. */
343  src_vt0 = src_vt4;
344  src_vt1 = src_vt5;
345  src_vt2 = src_vt6;
346  src_vt3 = src_vt7;
347  src_vt4 = src_vt8;
348  }
349 
    /* Move all three cursors to the next 8-column half. */
350  src_x_tmp += 8;
351  src_y_tmp += 8;
352  dst_tmp += 8;
353  }
354 }
355 
357  const uint8_t *src_y,
358  uint8_t *dst,
359  int32_t stride)
360 {
361  uint32_t tp0, tp1, tp2, tp3;
362  const int16_t filt_const0 = 0xfb01;
363  const int16_t filt_const1 = 0x1414;
364  const int16_t filt_const2 = 0x1fb;
365  v16u8 res, dst0 = { 0 };
366  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
367  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
368  v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
369  v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
370  v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
371 
372  filt0 = (v16i8) __msa_fill_h(filt_const0);
373  filt1 = (v16i8) __msa_fill_h(filt_const1);
374  filt2 = (v16i8) __msa_fill_h(filt_const2);
375 
376  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
377 
378  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
379  src_y += (5 * stride);
380 
381  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
382  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
383  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
384  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
385 
386  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
387 
388  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
389  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
390  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
391  hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
392 
393  SRARI_H2_SH(hz_out0, hz_out1, 5);
394  SAT_SH2_SH(hz_out0, hz_out1, 7);
395 
396  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
397 
398  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
399  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
400  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
401  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
402 
403  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
404  ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
405  ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
406  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
407  filt2);
408  vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
409  filt2);
410  SRARI_H2_SH(vt_out0, vt_out1, 5);
411  SAT_SH2_SH(vt_out0, vt_out1, 7);
412  LW4(dst, stride, tp0, tp1, tp2, tp3);
413  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
414 
415  res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
416  res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
417 
418  SAT_SH2_SH(res0, res1, 7);
419  res = PCKEV_XORI128_UB(res0, res1);
420  dst0 = __msa_aver_u_b(res, dst0);
421 
422  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
423 }
424 
426  const uint8_t *src_y,
427  uint8_t *dst,
428  int32_t stride)
429 {
430  const int16_t filt_const0 = 0xfb01;
431  const int16_t filt_const1 = 0x1414;
432  const int16_t filt_const2 = 0x1fb;
433  uint64_t tp0, tp1, tp2, tp3;
434  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
435  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
436  v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
437  v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
438  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
439  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
440  v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
441  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
442  v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
443 
444  filt0 = (v16i8) __msa_fill_h(filt_const0);
445  filt1 = (v16i8) __msa_fill_h(filt_const1);
446  filt2 = (v16i8) __msa_fill_h(filt_const2);
447 
448  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
449  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
450  src_y += (5 * stride);
451 
452  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
453 
454  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
455  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
456  src_x += (4 * stride);
457 
458  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
459  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
460  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
461  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
462 
463  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
464  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
465 
466  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
467  src_y += (4 * stride);
468  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
469 
470  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
471  src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
472  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
473  src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
474  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
475  filt2);
476  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
477  filt2);
478  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
479  filt2);
480  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
481  filt2);
482  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
483  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
484 
485  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
486  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
487  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
488  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
489 
490  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
491  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
492 
493  LD4(dst, stride, tp0, tp1, tp2, tp3);
494  INSERT_D2_UB(tp0, tp1, dst0);
495  INSERT_D2_UB(tp2, tp3, dst1);
496 
497  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
498  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
499  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
500  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
501  ST8x4_UB(dst0, dst1, dst, stride);
502  dst += (4 * stride);
503 
504  LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
505  XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
506 
507  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
508  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
509  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
510  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
511 
512  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
513  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
514 
515  ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
516  src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
517  src_vt1211_r);
518  vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
519  filt2);
520  vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
521  filt2);
522  vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
523  filt2);
524  vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
525  filt1, filt2);
526  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
527  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
528 
529  tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
530  tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
531  tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
532  tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
533 
534  LD4(dst, stride, tp0, tp1, tp2, tp3);
535  INSERT_D2_UB(tp0, tp1, dst0);
536  INSERT_D2_UB(tp2, tp3, dst1);
537 
538  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
539  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
540  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
541  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
542  ST8x4_UB(dst0, dst1, dst, stride);
543  dst += (4 * stride);
544 }
545 
547  const uint8_t *src_y,
548  uint8_t *dst,
549  int32_t stride)
550 {
551  const int16_t filt_const0 = 0xfb01;
552  const int16_t filt_const1 = 0x1414;
553  const int16_t filt_const2 = 0x1fb;
554  const uint8_t *src_x_tmp = src_x;
555  const uint8_t *src_y_tmp = src_y;
556  uint8_t *dst_tmp = dst;
557  uint32_t multiple8_cnt, loop_cnt;
558  uint64_t tp0, tp1, tp2, tp3;
559  v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
560  v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
561  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
562  v16i8 src_vt7, src_vt8;
563  v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
564  v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
565  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
566  v8i16 vt_out3, out0, out1, out2, out3;
567 
568  filt0 = (v16i8) __msa_fill_h(filt_const0);
569  filt1 = (v16i8) __msa_fill_h(filt_const1);
570  filt2 = (v16i8) __msa_fill_h(filt_const2);
571 
572  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
573 
574  for (multiple8_cnt = 2; multiple8_cnt--;) {
575  src_x = src_x_tmp;
576  src_y = src_y_tmp;
577  dst = dst_tmp;
578 
579  LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
580  src_y += (5 * stride);
581 
582  XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
583 
584  for (loop_cnt = 4; loop_cnt--;) {
585  LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
586  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
587  src_x += (4 * stride);
588 
589  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
590  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
591  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
592  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
593  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
594  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
595 
596  LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
597  src_y += (4 * stride);
598 
599  XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
600  ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
601  src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
602  src_vt43_r);
603  ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
604  src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
605  src_vt87_r);
606  vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
607  filt1, filt2);
608  vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
609  filt1, filt2);
610  vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
611  filt1, filt2);
612  vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
613  filt1, filt2);
614  SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
615  SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
616 
617  out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
618  out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
619  out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
620  out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
621 
622  LD4(dst, stride, tp0, tp1, tp2, tp3);
623  INSERT_D2_UB(tp0, tp1, dst0);
624  INSERT_D2_UB(tp2, tp3, dst1);
625 
626  SAT_SH4_SH(out0, out1, out2, out3, 7);
627  tmp0 = PCKEV_XORI128_UB(out0, out1);
628  tmp1 = PCKEV_XORI128_UB(out2, out3);
629  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
630  ST8x4_UB(dst0, dst1, dst, stride);
631  dst += (4 * stride);
632 
633  src_vt0 = src_vt4;
634  src_vt1 = src_vt5;
635  src_vt2 = src_vt6;
636  src_vt3 = src_vt7;
637  src_vt4 = src_vt8;
638  }
639 
640  src_x_tmp += 8;
641  src_y_tmp += 8;
642  dst_tmp += 8;
643  }
644 }
645 
647  ptrdiff_t stride)
648 {
649  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
650  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
651 
652  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
653  src += (8 * stride);
654  LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
655 
656  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
657  dst += (8 * stride);
658  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
659 }
660 
662  ptrdiff_t stride)
663 {
664  uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
665 
666  LD4(src, stride, src0, src1, src2, src3);
667  src += 4 * stride;
668  LD4(src, stride, src4, src5, src6, src7);
669  SD4(src0, src1, src2, src3, dst, stride);
670  dst += 4 * stride;
671  SD4(src4, src5, src6, src7, dst, stride);
672 }
673 
675  ptrdiff_t stride)
676 {
677  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
678  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
679 
680  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
681  src += (8 * stride);
682  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
683 
684  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
685  dst2, dst3);
686  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
687  dst6, dst7);
688  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
689  dst += (8 * stride);
690 
691  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
692  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
693 
694  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
695  dst2, dst3);
696  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
697  dst6, dst7);
698  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
699 }
700 
702  ptrdiff_t stride)
703 {
704  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
705  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
706  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
707 
708  LD4(src, stride, tp0, tp1, tp2, tp3);
709  src += 4 * stride;
710  LD4(src, stride, tp4, tp5, tp6, tp7);
711  INSERT_D2_UB(tp0, tp1, src0);
712  INSERT_D2_UB(tp2, tp3, src1);
713  INSERT_D2_UB(tp4, tp5, src2);
714  INSERT_D2_UB(tp6, tp7, src3);
715 
716  LD4(dst, stride, tp0, tp1, tp2, tp3);
717  LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
718  INSERT_D2_UB(tp0, tp1, dst0);
719  INSERT_D2_UB(tp2, tp3, dst1);
720  INSERT_D2_UB(tp4, tp5, dst2);
721  INSERT_D2_UB(tp6, tp7, dst3);
722 
723  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
724  dst2, dst3);
725 
726  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
727 }
728 
730  ptrdiff_t stride)
731 {
732  uint32_t tp0, tp1, tp2, tp3;
733  v16u8 src0 = { 0 }, dst0 = { 0 };
734 
735  LW4(src, stride, tp0, tp1, tp2, tp3);
736  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
737  LW4(dst, stride, tp0, tp1, tp2, tp3);
738  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
739 
740  dst0 = __msa_aver_u_b(src0, dst0);
741 
742  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
743 }
744 
746  ptrdiff_t stride)
747 {
748  uint32_t loop_cnt;
749  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
750  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
751  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
752  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
753  v16i8 minus5b = __msa_ldi_b(-5);
754  v16i8 plus20b = __msa_ldi_b(20);
755 
756  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
757  mask3 = mask0 + 8;
758  mask4 = mask1 + 8;
759  mask5 = mask2 + 8;
760  src -= 2;
761 
762  for (loop_cnt = 4; loop_cnt--;) {
763  LD_SB2(src, 16, src0, src1);
764  src += stride;
765  LD_SB2(src, 16, src2, src3);
766  src += stride;
767  LD_SB2(src, 16, src4, src5);
768  src += stride;
769  LD_SB2(src, 16, src6, src7);
770  src += stride;
771 
772  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
773  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
774  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
775  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
776  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
777  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
778  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
779  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
780  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
781  minus5b, res0, res1, res2, res3);
782  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
783  plus20b, res0, res1, res2, res3);
784  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
785  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
786  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
787  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
788  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
789  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
790  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
791  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
792  minus5b, res4, res5, res6, res7);
793  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
794  plus20b, res4, res5, res6, res7);
795  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
796  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
797  SRARI_H4_SH(res0, res1, res2, res3, 5);
798  SRARI_H4_SH(res4, res5, res6, res7, 5);
799  SAT_SH4_SH(res0, res1, res2, res3, 7);
800  SAT_SH4_SH(res4, res5, res6, res7, 7);
801  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
802  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
803  dst0 = __msa_aver_s_b(dst0, src0);
804  dst1 = __msa_aver_s_b(dst1, src2);
805  dst2 = __msa_aver_s_b(dst2, src4);
806  dst3 = __msa_aver_s_b(dst3, src6);
807  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
808  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
809  dst += (4 * stride);
810  }
811 }
812 
814  ptrdiff_t stride)
815 {
    /*
     * NOTE(review): listing line 813 (the start of this signature) is absent
     * here; restore it from the upstream file.
     *
     * Horizontal filter over 16 rows (4 loop iterations x 4 rows per
     * iteration), two 16-byte loads per row starting at src - 2.  The masks
     * pair bytes (i, i+5), (i+1, i+4) and (i+2, i+3); combined with the
     * weights 1 (HADD_*), -5 and +20 (DPADD_*) this gives the taps
     * (1, -5, 20, 20, -5, 1), assuming the macros do what their names
     * suggest.  The rounded, saturated result is averaged with the source
     * row slid by 3 bytes from the load position before being stored.
     */
816  uint32_t loop_cnt;
817  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
818  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
819  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
820  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
821  v16i8 minus5b = __msa_ldi_b(-5);
822  v16i8 plus20b = __msa_ldi_b(20);
823 
824  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* Same byte pairings moved 8 bytes along, used with the (srcN, srcN+1)
     * vector pair -- presumably to produce the right half of each row. */
825  mask3 = mask0 + 8;
826  mask4 = mask1 + 8;
827  mask5 = mask2 + 8;
828  src -= 2;
829 
830  for (loop_cnt = 4; loop_cnt--;) {
831  LD_SB2(src, 16, src0, src1);
832  src += stride;
833  LD_SB2(src, 16, src2, src3);
834  src += stride;
835  LD_SB2(src, 16, src4, src5);
836  src += stride;
837  LD_SB2(src, 16, src6, src7);
838  src += stride;
839 
    /* Presumably flips the sign bit so pixels can be treated as signed
     * bytes; the matching XORI before the store undoes it. */
840  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0 and 1 of this group of four. */
841  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
842  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
843  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
844  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
845  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
846  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
847  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
848  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
849  minus5b, res0, res1, res2, res3);
850  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
851  plus20b, res0, res1, res2, res3);
    /* Rows 2 and 3 of this group of four. */
852  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
853  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
854  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
855  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
856  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
857  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
858  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
859  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
860  minus5b, res4, res5, res6, res7);
861  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
862  plus20b, res4, res5, res6, res7);
    /* Slide by 3 bytes: since src was moved back by 2, this presumably lines
     * the vectors up with the pixel one to the right of the original src. */
863  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
864  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
865  SRARI_H4_SH(res0, res1, res2, res3, 5);
866  SRARI_H4_SH(res4, res5, res6, res7, 5);
867  SAT_SH4_SH(res0, res1, res2, res3, 7);
868  SAT_SH4_SH(res4, res5, res6, res7, 7);
869  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
870  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
    /* Average the filtered rows with the slid source rows. */
871  dst0 = __msa_aver_s_b(dst0, src0);
872  dst1 = __msa_aver_s_b(dst1, src2);
873  dst2 = __msa_aver_s_b(dst2, src4);
874  dst3 = __msa_aver_s_b(dst3, src6);
875  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
876  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
877  dst += (4 * stride);
878  }
879 }
880 
882  ptrdiff_t stride)
883 {
    /*
     * NOTE(review): listing line 881 (the start of this signature) is absent
     * here; restore it from the upstream file.
     *
     * Horizontal filter over 8 rows loaded in one go from src - 2.  The
     * masks pair bytes (i, i+5), (i+1, i+4), (i+2, i+3); with the weights
     * 1 (HADD_*), -5 and +20 (DPADD_*) this gives the taps
     * (1, -5, 20, 20, -5, 1), assuming the macros do what their names
     * suggest.  The result is averaged with the source rows slid by 2 bytes
     * from the load position, then stored with ST8x8_UB.
     */
884  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
885  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
886  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
887  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
888  v16i8 minus5b = __msa_ldi_b(-5);
889  v16i8 plus20b = __msa_ldi_b(20);
890 
891  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
892  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
893  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0..3. */
894  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
895  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
896  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
897  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
898  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
899  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
900  res0, res1, res2, res3);
901  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
902  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
903  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
904  res0, res1, res2, res3);
    /* Rows 4..7. */
905  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
906  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
907  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
908  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
909  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
910  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
911  res4, res5, res6, res7);
912  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
913  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
914  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
915  res4, res5, res6, res7);
    /* Slide by 2 bytes: since the load started at src - 2, this presumably
     * lines each row up with the original src position. */
916  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
917  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
918  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
919  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
    /* Presumably packs the low 8 bytes of two rows into one vector so the
     * layout matches the packed filter output below. */
920  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
921  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
922  SRARI_H4_SH(res0, res1, res2, res3, 5);
923  SRARI_H4_SH(res4, res5, res6, res7, 5);
924  SAT_SH4_SH(res0, res1, res2, res3, 7);
925  SAT_SH4_SH(res4, res5, res6, res7, 7);
926  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
927  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* Average the filtered rows with the slid source rows. */
928  tmp0 = __msa_aver_s_b(tmp0, src0);
929  tmp1 = __msa_aver_s_b(tmp1, src1);
930  tmp2 = __msa_aver_s_b(tmp2, src4);
931  tmp3 = __msa_aver_s_b(tmp3, src5);
932  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
933  ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
934 }
935 
937  ptrdiff_t stride)
938 {
    /*
     * NOTE(review): listing line 936 (the start of this signature) is absent
     * here; restore it from the upstream file.
     *
     * Horizontal filter over 8 rows loaded in one go from src - 2, with the
     * taps (1, -5, 20, 20, -5, 1) built from the mask pairings and the
     * weights 1 (HADD_*), -5 and +20 (DPADD_*), assuming the macros do what
     * their names suggest.  The result is averaged with the source rows
     * slid by 3 bytes from the load position, then stored with ST8x8_UB.
     */
939  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
940  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
941  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
942  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
943  v16i8 minus5b = __msa_ldi_b(-5);
944  v16i8 plus20b = __msa_ldi_b(20);
945 
946  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
947  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
948  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0..3. */
949  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
950  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
951  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
952  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
953  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
954  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
955  res0, res1, res2, res3);
956  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
957  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
958  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
959  res0, res1, res2, res3);
    /* Rows 4..7. */
960  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
961  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
962  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
963  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
964  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
965  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
966  res4, res5, res6, res7);
967  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
968  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
969  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
970  res4, res5, res6, res7);
    /* Slide by 3 bytes: since the load started at src - 2, this presumably
     * lines each row up with the pixel one to the right of the original
     * src. */
971  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
972  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
973  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
974  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
    /* Presumably packs the low 8 bytes of two rows into one vector so the
     * layout matches the packed filter output below. */
975  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
976  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
977  SRARI_H4_SH(res0, res1, res2, res3, 5);
978  SRARI_H4_SH(res4, res5, res6, res7, 5);
979  SAT_SH4_SH(res0, res1, res2, res3, 7);
980  SAT_SH4_SH(res4, res5, res6, res7, 7);
981  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
982  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* Average the filtered rows with the slid source rows. */
983  tmp0 = __msa_aver_s_b(tmp0, src0);
984  tmp1 = __msa_aver_s_b(tmp1, src1);
985  tmp2 = __msa_aver_s_b(tmp2, src4);
986  tmp3 = __msa_aver_s_b(tmp3, src5);
987  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
988  ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
989 }
990 
992  ptrdiff_t stride)
993 {
    /*
     * NOTE(review): listing line 991 (the start of this signature) is absent
     * here; restore it from the upstream file.
     *
     * Horizontal filter over 4 rows loaded from src - 2, using the
     * "4 width cases" masks at luma_mask_arr[48].  Those masks contain
     * indices 16 and above, which presumably select bytes from the second
     * VSHF operand so that two rows share one vector.  Taps are
     * (1, -5, 20, 20, -5, 1) via the weights 1 (HADD_*), -5 and +20
     * (DPADD_*), assuming the macros do what their names suggest.  The
     * result is averaged with the source rows slid by 2 bytes.
     */
994  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
995  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
996  v8i16 res0, res1;
997  v16i8 minus5b = __msa_ldi_b(-5);
998  v16i8 plus20b = __msa_ldi_b(20);
999 
1000  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1001  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1002  XORI_B4_128_SB(src0, src1, src2, src3);
1003  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1004  HADD_SB2_SH(vec0, vec1, res0, res1);
1005  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1006  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1007  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1008  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1009  SRARI_H2_SH(res0, res1, 5);
1010  SAT_SH2_SH(res0, res1, 7);
1011  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    /* Slide by 2 bytes: since the load started at src - 2, this presumably
     * lines each row up with the original src position. */
1012  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
1013  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    /* Gather the first 4 bytes of each of the four slid rows into a single
     * vector (word insert, then doubleword insert). */
1014  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1015  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1016  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1017  res = __msa_aver_s_b(res, src0);
1018  res = (v16i8) __msa_xori_b((v16u8) res, 128);
1019  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
1020 }
1021 
1023  ptrdiff_t stride)
1024 {
    /*
     * NOTE(review): listing line 1022 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Horizontal filter over 4 rows loaded from src - 2, using the
     * "4 width cases" masks at luma_mask_arr[48].  Those masks contain
     * indices 16 and above, which presumably select bytes from the second
     * VSHF operand so that two rows share one vector.  Taps are
     * (1, -5, 20, 20, -5, 1) via the weights 1 (HADD_*), -5 and +20
     * (DPADD_*), assuming the macros do what their names suggest.  The
     * result is averaged with the source rows slid by 3 bytes.
     */
1025  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
1026  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1027  v8i16 res0, res1;
1028  v16i8 minus5b = __msa_ldi_b(-5);
1029  v16i8 plus20b = __msa_ldi_b(20);
1030 
1031  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1032  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1033  XORI_B4_128_SB(src0, src1, src2, src3);
1034  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1035  HADD_SB2_SH(vec0, vec1, res0, res1);
1036  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1037  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1038  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1039  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1040  SRARI_H2_SH(res0, res1, 5);
1041  SAT_SH2_SH(res0, res1, 7);
1042  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    /* Slide by 3 bytes: since the load started at src - 2, this presumably
     * lines each row up with the pixel one to the right of the original
     * src. */
1043  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
1044  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    /* Gather the first 4 bytes of each of the four slid rows into a single
     * vector (word insert, then doubleword insert). */
1045  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1046  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1047  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1048  res = __msa_aver_s_b(res, src0);
1049  res = (v16i8) __msa_xori_b((v16u8) res, 128);
1050  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
1051 }
1052 
1054  ptrdiff_t stride)
1055 {
    /*
     * NOTE(review): listing line 1053 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Horizontal filter over 16 rows (4 loop iterations x 4 rows per
     * iteration) with no averaging step: the filtered, rounded and
     * saturated result is stored directly.  Each row is read as two
     * 16-byte loads at offsets 0 and 8 from src - 2, and both are filtered
     * with the same three masks.  Taps are (1, -5, 20, 20, -5, 1) via the
     * weights 1 (HADD_*), -5 and +20 (DPADD_*), assuming the macros do what
     * their names suggest.
     */
1056  uint32_t loop_cnt;
1057  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1058  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1059  v16i8 vec11;
1060  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1061  v16i8 minus5b = __msa_ldi_b(-5);
1062  v16i8 plus20b = __msa_ldi_b(20);
1063 
1064  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1065  src -= 2;
1066 
1067  for (loop_cnt = 4; loop_cnt--;) {
1068  LD_SB2(src, 8, src0, src1);
1069  src += stride;
1070  LD_SB2(src, 8, src2, src3);
1071  src += stride;
1072  LD_SB2(src, 8, src4, src5);
1073  src += stride;
1074  LD_SB2(src, 8, src6, src7);
1075  src += stride;
1076 
1077  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0 and 1 of this group of four. */
1078  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1079  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1080  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1081  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1082  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1083  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1084  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1085  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1086  minus5b, res0, res1, res2, res3);
1087  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1088  plus20b, res0, res1, res2, res3);
    /* Rows 2 and 3 of this group of four. */
1089  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1090  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1091  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1092  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1093  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1094  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1095  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1096  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1097  minus5b, res4, res5, res6, res7);
1098  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1099  plus20b, res4, res5, res6, res7);
1100  SRARI_H4_SH(res0, res1, res2, res3, 5);
1101  SRARI_H4_SH(res4, res5, res6, res7, 5);
1102  SAT_SH4_SH(res0, res1, res2, res3, 7);
1103  SAT_SH4_SH(res4, res5, res6, res7, 7);
    /* vec0..vec3 are reused here as the four packed output rows. */
1104  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1105  vec2, vec3);
1106  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1107  ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
1108  dst += (4 * stride);
1109  }
1110 }
1111 
1113  ptrdiff_t stride)
1114 {
    /*
     * NOTE(review): listing line 1112 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Horizontal filter over 8 rows loaded in one go from src - 2, with no
     * averaging step.  Taps are (1, -5, 20, 20, -5, 1) via the mask
     * pairings and the weights 1 (HADD_*), -5 and +20 (DPADD_*), assuming
     * the macros do what their names suggest.  The rounded, saturated
     * result is packed by PCKEV_XORI128_UB and stored with ST8x8_UB.
     */
1115  v16u8 out0, out1, out2, out3;
1116  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1117  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1118  v16i8 vec11;
1119  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1120  v16i8 minus5b = __msa_ldi_b(-5);
1121  v16i8 plus20b = __msa_ldi_b(20);
1122 
1123  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1124  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1125  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* Rows 0..3. */
1126  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1127  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1128  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1129  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1130  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1131  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1132  res0, res1, res2, res3);
1133  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1134  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1135  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1136  plus20b, res0, res1, res2, res3);
    /* Rows 4..7. */
1137  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1138  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1139  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1140  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1141  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1142  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1143  res4, res5, res6, res7);
1144  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1145  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1146  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1147  plus20b, res4, res5, res6, res7);
1148  SRARI_H4_SH(res0, res1, res2, res3, 5);
1149  SRARI_H4_SH(res4, res5, res6, res7, 5);
1150  SAT_SH4_SH(res0, res1, res2, res3, 7);
1151  SAT_SH4_SH(res4, res5, res6, res7, 7);
    /* Two rows of 8 results per output vector. */
1152  out0 = PCKEV_XORI128_UB(res0, res1);
1153  out1 = PCKEV_XORI128_UB(res2, res3);
1154  out2 = PCKEV_XORI128_UB(res4, res5);
1155  out3 = PCKEV_XORI128_UB(res6, res7);
1156  ST8x8_UB(out0, out1, out2, out3, dst, stride);
1157 }
1158 
1160  ptrdiff_t stride)
1161 {
    /*
     * NOTE(review): listing line 1159 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Horizontal filter over 4 rows loaded from src - 2, using the
     * "4 width cases" masks at luma_mask_arr[48], with no averaging step.
     * Taps are (1, -5, 20, 20, -5, 1) via the weights 1 (HADD_*), -5 and
     * +20 (DPADD_*), assuming the macros do what their names suggest.
     */
1162  v16u8 out;
1163  v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
1164  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1165  v8i16 res0, res1;
1166  v16i8 minus5b = __msa_ldi_b(-5);
1167  v16i8 plus20b = __msa_ldi_b(20);
1168 
1169  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1170  LD_SB4(src - 2, stride, src0, src1, src2, src3);
1171  XORI_B4_128_SB(src0, src1, src2, src3);
    /* Each shuffle takes two rows as its operand pair, so vec0/vec1 (and
     * the later pairs) presumably each hold data for two rows. */
1172  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1173  HADD_SB2_SH(vec0, vec1, res0, res1);
1174  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1175  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1176  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1177  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1178  SRARI_H2_SH(res0, res1, 5);
1179  SAT_SH2_SH(res0, res1, 7);
1180  out = PCKEV_XORI128_UB(res0, res1);
1181  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
1182 }
1183 
1185  ptrdiff_t stride)
1186 {
    /*
     * NOTE(review): listing line 1184 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter over 16 rows (4 loop iterations x 4 rows per
     * iteration).  Rows are read starting two rows above src; adjacent rows
     * are byte-interleaved and fed to AVC_DOT_SH3_SH with three coefficient
     * pairs.  The result is averaged with rows src2..src5, i.e. the rows at
     * the original src position onward, because src0 sits two rows above.
     */
1187  int32_t loop_cnt;
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1188  int16_t filt_const0 = 0xfb01;
1189  int16_t filt_const1 = 0x1414;
1190  int16_t filt_const2 = 0x1fb;
1191  v16u8 res0, res1, res2, res3;
1192  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1193  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1194  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1195  v16i8 src65_l, src87_l, filt0, filt1, filt2;
1196  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1197 
1198  filt0 = (v16i8) __msa_fill_h(filt_const0);
1199  filt1 = (v16i8) __msa_fill_h(filt_const1);
1200  filt2 = (v16i8) __msa_fill_h(filt_const2);
1201 
1202  src -= (stride * 2);
1203 
    /* Prime the pipeline with the first five rows. */
1204  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1205  src += (5 * stride);
1206 
1207  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1208  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1209  src32_r, src43_r);
1210  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1211  src32_l, src43_l);
1212 
1213  for (loop_cnt = 4; loop_cnt--;) {
1214  LD_SB4(src, stride, src5, src6, src7, src8);
1215  src += (4 * stride);
1216 
1217  XORI_B4_128_SB(src5, src6, src7, src8);
1218  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1219  src65_r, src76_r, src87_r);
1220  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1221  src65_l, src76_l, src87_l);
    /* _r and _l variants come from ILVR/ILVL, presumably the two halves of
     * each 16-byte row. */
1222  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1223  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1224  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1225  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1226  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1227  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1228  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1229  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1230  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1231  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1232  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1233  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1234  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1235  out3_r, res0, res1, res2, res3);
    /* Average each filtered row with the source row at the same vertical
     * position. */
1236  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1237  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1238  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1239  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
1240  XORI_B4_128_UB(res0, res1, res2, res3);
1241  ST_UB4(res0, res1, res2, res3, dst, stride);
1242  dst += (4 * stride);
1243 
    /* Carry the last four interleaved row pairs and the rows needed for
     * averaging over to the next iteration. */
1244  src10_r = src54_r;
1245  src32_r = src76_r;
1246  src21_r = src65_r;
1247  src43_r = src87_r;
1248  src10_l = src54_l;
1249  src32_l = src76_l;
1250  src21_l = src65_l;
1251  src43_l = src87_l;
1252  src2 = src6;
1253  src3 = src7;
1254  src4 = src8;
1255  }
1256 }
1257 
1259  ptrdiff_t stride)
1260 {
    /*
     * NOTE(review): listing line 1258 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter over 16 rows (4 loop iterations x 4 rows per
     * iteration).  Rows are read starting two rows above src; adjacent rows
     * are byte-interleaved and fed to AVC_DOT_SH3_SH with three coefficient
     * pairs.  The result is averaged with rows src3..src6, i.e. the rows
     * one below each output row, because src0 sits two rows above src.
     */
1261  int32_t loop_cnt;
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1262  int16_t filt_const0 = 0xfb01;
1263  int16_t filt_const1 = 0x1414;
1264  int16_t filt_const2 = 0x1fb;
1265  v16u8 res0, res1, res2, res3;
1266  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1267  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1268  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1269  v16i8 src65_l, src87_l, filt0, filt1, filt2;
1270  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1271 
1272  filt0 = (v16i8) __msa_fill_h(filt_const0);
1273  filt1 = (v16i8) __msa_fill_h(filt_const1);
1274  filt2 = (v16i8) __msa_fill_h(filt_const2);
1275 
1276  src -= (stride * 2);
1277 
    /* Prime the pipeline with the first five rows. */
1278  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1279  src += (5 * stride);
1280 
1281  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1282  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1283  src32_r, src43_r);
1284  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1285  src32_l, src43_l);
1286 
1287  for (loop_cnt = 4; loop_cnt--;) {
1288  LD_SB4(src, stride, src5, src6, src7, src8);
1289  src += (4 * stride);
1290 
1291  XORI_B4_128_SB(src5, src6, src7, src8);
1292  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1293  src65_r, src76_r, src87_r);
1294  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1295  src65_l, src76_l, src87_l);
    /* _r and _l variants come from ILVR/ILVL, presumably the two halves of
     * each 16-byte row. */
1296  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1297  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1298  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1299  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1300  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1301  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1302  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1303  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1304  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1305  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1306  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1307  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1308  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1309  out3_r, res0, res1, res2, res3);
    /* Average each filtered row with the source row one below it. */
1310  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1311  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1312  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1313  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
1314  XORI_B4_128_UB(res0, res1, res2, res3);
1315  ST_UB4(res0, res1, res2, res3, dst, stride);
1316  dst += (4 * stride);
1317 
    /* Carry the last four interleaved row pairs and the rows needed for
     * averaging over to the next iteration. */
1318  src10_r = src54_r;
1319  src32_r = src76_r;
1320  src21_r = src65_r;
1321  src43_r = src87_r;
1322  src10_l = src54_l;
1323  src32_l = src76_l;
1324  src21_l = src65_l;
1325  src43_l = src87_l;
1326  src3 = src7;
1327  src4 = src8;
1328  }
1329 }
1330 
1332  ptrdiff_t stride)
1333 {
    /*
     * NOTE(review): listing line 1331 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter producing 8 output rows from 13 input rows (5 + 8)
     * read starting two rows above src.  Only the ILVR (right-half)
     * interleaves are used.  The result is averaged with rows src2..src9,
     * i.e. the rows at the original src position onward, then stored with
     * ST8x8_UB.
     */
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1334  const int16_t filt_const0 = 0xfb01;
1335  const int16_t filt_const1 = 0x1414;
1336  const int16_t filt_const2 = 0x1fb;
1337  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1338  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1339  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1340  v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1341  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1342 
1343  filt0 = (v16i8) __msa_fill_h(filt_const0);
1344  filt1 = (v16i8) __msa_fill_h(filt_const1);
1345  filt2 = (v16i8) __msa_fill_h(filt_const2);
1346 
1347  src -= (stride * 2);
1348 
1349  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1350  src += (5 * stride);
1351  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1352  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1353  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1354  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1355  src32_r, src43_r);
1356  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1357  src76_r, src87_r);
1358  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1359  src109_r, src1110_r, src1211_r);
1360  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1361  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1362  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1363  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1364  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1365  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1366  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1367  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* Pair up source rows 2..9 (two rows per vector, presumably the low
     * 8 bytes of each) to match the packed filter output. */
1368  PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
1369  PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
1370  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1371  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1372  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1373  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1374  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1375  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1376  out0 = __msa_aver_s_b(out0, tmp0);
1377  out1 = __msa_aver_s_b(out1, tmp1);
1378  out2 = __msa_aver_s_b(out2, tmp2);
1379  out3 = __msa_aver_s_b(out3, tmp3);
1380  XORI_B4_128_SB(out0, out1, out2, out3);
1381  ST8x8_UB(out0, out1, out2, out3, dst, stride);
1382 }
1383 
1385  ptrdiff_t stride)
1386 {
    /*
     * NOTE(review): listing line 1384 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter producing 8 output rows from 13 input rows (5 + 8)
     * read starting two rows above src.  Only the ILVR (right-half)
     * interleaves are used.  The result is averaged with rows src3..src10,
     * i.e. the rows one below each output row, then stored with ST8x8_UB.
     */
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1387  const int16_t filt_const0 = 0xfb01;
1388  const int16_t filt_const1 = 0x1414;
1389  const int16_t filt_const2 = 0x1fb;
1390  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1391  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1392  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1393  v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1394  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1395 
1396  filt0 = (v16i8) __msa_fill_h(filt_const0);
1397  filt1 = (v16i8) __msa_fill_h(filt_const1);
1398  filt2 = (v16i8) __msa_fill_h(filt_const2);
1399 
1400  src -= (stride * 2);
1401 
1402  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1403  src += (5 * stride);
1404  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1405  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1406  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1407  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1408  src32_r, src43_r);
1409  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1410  src76_r, src87_r);
1411  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1412  src109_r, src1110_r, src1211_r);
1413  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1414  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1415  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1416  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1417  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1418  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1419  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1420  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* Pair up source rows 3..10 (two rows per vector, presumably the low
     * 8 bytes of each) to match the packed filter output. */
1421  PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
1422  PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
1423  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1424  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1425  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1426  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1427  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1428  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1429  out0 = __msa_aver_s_b(out0, tmp0);
1430  out1 = __msa_aver_s_b(out1, tmp1);
1431  out2 = __msa_aver_s_b(out2, tmp2);
1432  out3 = __msa_aver_s_b(out3, tmp3);
1433  XORI_B4_128_SB(out0, out1, out2, out3);
1434  ST8x8_UB(out0, out1, out2, out3, dst, stride);
1435 }
1436 
1438  ptrdiff_t stride)
1439 {
    /*
     * NOTE(review): listing line 1437 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter producing 4 output rows from 9 input rows (5 + 4)
     * read starting two rows above src.  Interleaved row pairs are combined
     * two at a time with ILVR_D2_SB, so each AVC_DOT_SH3_SH call presumably
     * covers two output rows.  The result is averaged with rows src2..src5,
     * i.e. the rows at the original src position onward.
     */
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1440  int16_t filt_const0 = 0xfb01;
1441  int16_t filt_const1 = 0x1414;
1442  int16_t filt_const2 = 0x1fb;
1443  v16u8 out;
1444  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1445  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1446  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1447  v8i16 out10, out32;
1448 
1449  filt0 = (v16i8) __msa_fill_h(filt_const0);
1450  filt1 = (v16i8) __msa_fill_h(filt_const1);
1451  filt2 = (v16i8) __msa_fill_h(filt_const2);
1452 
1453  src -= (stride * 2);
1454 
1455  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1456  src += (5 * stride);
1457  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1458  src32_r, src43_r);
1459  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    /* The XOR is applied to the combined vectors only; src0..src8 themselves
     * stay unmodified, which is why the final average below is unsigned. */
1460  XORI_B2_128_SB(src2110, src4332);
1461  LD_SB4(src, stride, src5, src6, src7, src8);
1462  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1463  src76_r, src87_r);
1464  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1465  XORI_B2_128_SB(src6554, src8776);
1466  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1467  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1468  SRARI_H2_SH(out10, out32, 5);
1469  SAT_SH2_SH(out10, out32, 7);
1470  out = PCKEV_XORI128_UB(out10, out32);
    /* src32_r / src54_r are reused as scratch: gather the first 4 bytes of
     * rows src2..src5 into one vector for the average. */
1471  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1472  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1473  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1474  out = __msa_aver_u_b(out, (v16u8) src32_r);
1475  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
1476 }
1477 
1479  ptrdiff_t stride)
1480 {
    /*
     * NOTE(review): listing line 1478 (the start of this signature) is
     * absent here; restore it from the upstream file.
     *
     * Vertical filter producing 4 output rows from 9 input rows (5 + 4)
     * read starting two rows above src.  Interleaved row pairs are combined
     * two at a time with ILVR_D2_SB, so each AVC_DOT_SH3_SH call presumably
     * covers two output rows.  The result is averaged with rows src3..src6,
     * i.e. the rows one below each output row.
     */
    /* Each constant packs two signed byte coefficients: (1, -5), (20, 20)
     * and (-5, 1) in low-byte-first order.
     * NOTE(review): 0xfb01 does not fit in int16_t, so the initialisation
     * relies on implementation-defined conversion. */
1481  int16_t filt_const0 = 0xfb01;
1482  int16_t filt_const1 = 0x1414;
1483  int16_t filt_const2 = 0x1fb;
1484  v16u8 out;
1485  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1486  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1487  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1488  v8i16 out10, out32;
1489 
1490  filt0 = (v16i8) __msa_fill_h(filt_const0);
1491  filt1 = (v16i8) __msa_fill_h(filt_const1);
1492  filt2 = (v16i8) __msa_fill_h(filt_const2);
1493 
1494  src -= (stride * 2);
1495 
1496  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1497  src += (5 * stride);
1498  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1499  src32_r, src43_r);
1500  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    /* The XOR is applied to the combined vectors only; src0..src8 themselves
     * stay unmodified, which is why the final average below is unsigned. */
1501  XORI_B2_128_SB(src2110, src4332);
1502  LD_SB4(src, stride, src5, src6, src7, src8);
1503  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1504  src76_r, src87_r);
1505  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1506  XORI_B2_128_SB(src6554, src8776);
1507  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1508  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1509  SRARI_H2_SH(out10, out32, 5);
1510  SAT_SH2_SH(out10, out32, 7);
1511  out = PCKEV_XORI128_UB(out10, out32);
    /* src32_r / src54_r are reused as scratch: gather the first 4 bytes of
     * rows src3..src6 into one vector for the average. */
1512  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1513  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1514  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1515  out = __msa_aver_u_b(out, (v16u8) src32_r);
1516  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
1517 }
1518 
1520  ptrdiff_t stride)
1521 {
      /* Forwards to the 16x16 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride;
       * presumably the horizontal- and vertical-filter sources. */
1522  avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
1523 }
1524 
1526  ptrdiff_t stride)
1527 {
      /* Forwards to the 16x16 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride + 1,
       * i.e. the second source is shifted one column to the right. */
1528  avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1529 }
1530 
1532  ptrdiff_t stride)
1533 {
      /* Forwards to the 16x16 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride. */
1534  avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
1535  stride);
1536 }
1537 
1539  ptrdiff_t stride)
1540 {
      /* Forwards to the 16x16 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride + 1 (one column right). */
1541  avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1542  stride);
1543 }
1544 
1546  ptrdiff_t stride)
1547 {
      /* Forwards to the 8x8 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride;
       * presumably the horizontal- and vertical-filter sources. */
1548  avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
1549 }
1550 
1552  ptrdiff_t stride)
1553 {
      /* Forwards to the 8x8 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride + 1,
       * i.e. the second source is shifted one column to the right. */
1554  avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1555 }
1556 
1558  ptrdiff_t stride)
1559 {
      /* Forwards to the 8x8 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride. */
1560  avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
1561 }
1562 
1564  ptrdiff_t stride)
1565 {
      /* Forwards to the 8x8 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride + 1 (one column right). */
1566  avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1567  stride);
1568 }
1569 
1570 
1572  ptrdiff_t stride)
1573 {
      /* Forwards to the 4x4 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride;
       * presumably the horizontal- and vertical-filter sources. */
1574  avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
1575 }
1576 
1578  ptrdiff_t stride)
1579 {
      /* Forwards to the 4x4 helper (defined outside this chunk) with
       * first pointer src - 2 and second pointer src - 2 * stride + 1,
       * i.e. the second source is shifted one column to the right. */
1580  avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1581 }
1582 
1584  ptrdiff_t stride)
1585 {
      /* Forwards to the 4x4 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride. */
1586  avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
1587 }
1588 
1590  ptrdiff_t stride)
1591 {
      /* Forwards to the 4x4 helper (defined outside this chunk) with
       * first pointer src + stride - 2 (one row down) and second pointer
       * src - 2 * stride + 1 (one column right). */
1592  avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
1593  stride);
1594 }
1595 
1597  ptrdiff_t stride)
1598 {
      /*
       * 16x16 block: each row is filtered horizontally into 16-bit
       * intermediates, those are filtered vertically, and the result is
       * averaged with the rounded horizontal-only result of the row aligned
       * with each output row (hz_out2..hz_out5). The block is processed as
       * two 8-column halves, each in four passes of four rows. src and dst
       * share the same stride.
       */
1599  uint8_t *dst_tmp = dst;
      /* Two rows up and two columns left: origin of the 6-tap window. */
1600  const uint8_t *src_tmp = src - (2 * stride) - 2;
1601  uint32_t multiple8_cnt, loop_cnt;
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
1602  const int32_t filt_const0 = 0xfffb0001;
1603  const int32_t filt_const1 = 0x140014;
1604  const int32_t filt_const2 = 0x1fffb;
1605  v16u8 out0, out1;
1606  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1607  v16i8 mask2;
1608  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1609  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1610  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1611  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1612  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1613  v8i16 hz_out87_l, filt0, filt1, filt2;
1614  v4i32 tmp0, tmp1;
1615 
1616  filt0 = (v8i16) __msa_fill_w(filt_const0);
1617  filt1 = (v8i16) __msa_fill_w(filt_const1);
1618  filt2 = (v8i16) __msa_fill_w(filt_const2);
1619 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
1620  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1621 
      /* One iteration per 8-column half of the 16-wide block. */
1622  for (multiple8_cnt = 2; multiple8_cnt--;) {
1623  dst = dst_tmp;
1624  src = src_tmp;
1625 
1626  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1627  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1628  src += (5 * stride);
1629 
      /* Prime the vertical window with the first five filtered rows. */
1630  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1631  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1632  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1633  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1634  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1635 
      /* Four passes, each producing four output rows. */
1636  for (loop_cnt = 4; loop_cnt--;) {
1637  LD_SB4(src, stride, src5, src6, src7, src8);
1638  src += (4 * stride);
1639 
1640  XORI_B4_128_SB(src5, src6, src7, src8);
1641 
1642  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1643  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1644  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1645  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1646 
      /* Pair vertically adjacent intermediate rows; _r and _l cover the
       * two halves of each 8-element row. */
1647  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1648  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1649  hz_out43_r);
1650  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1651  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1652  hz_out43_l);
1653  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1654  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1655  hz_out87_r);
1656  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1657  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1658  hz_out87_l);
1659 
      /* Vertical pass in 32-bit precision; dst0/dst2/dst4/dst6 are the
       * four filtered output rows of this pass. */
1660  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1661  filt1, filt2);
1662  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1663  filt1, filt2);
1664  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1665  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1666  filt1, filt2);
1667  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1668  filt1, filt2);
1669  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1670  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1671  filt1, filt2);
1672  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1673  filt1, filt2);
1674  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1675  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1676  filt1, filt2);
1677  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1678  filt1, filt2);
1679  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1680 
      /* Horizontal-only values of rows 2..5, the rows aligned with the
       * four output rows, rounded by 5 bits and saturated. */
1681  dst1 = __msa_srari_h(hz_out2, 5);
1682  dst3 = __msa_srari_h(hz_out3, 5);
1683  dst5 = __msa_srari_h(hz_out4, 5);
1684  dst7 = __msa_srari_h(hz_out5, 5);
1685  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1686 
      /* Average the two-pass result with the horizontal-only result. */
1687  dst0 = __msa_aver_s_h(dst0, dst1);
1688  dst1 = __msa_aver_s_h(dst2, dst3);
1689  dst2 = __msa_aver_s_h(dst4, dst5);
1690  dst3 = __msa_aver_s_h(dst6, dst7);
1691 
1692  out0 = PCKEV_XORI128_UB(dst0, dst1);
1693  out1 = PCKEV_XORI128_UB(dst2, dst3);
1694  ST8x4_UB(out0, out1, dst, stride);
1695  dst += (4 * stride);
1696 
      /* Slide the window down four rows for the next pass. */
1697  hz_out0 = hz_out4;
1698  hz_out1 = hz_out5;
1699  hz_out2 = hz_out6;
1700  hz_out3 = hz_out7;
1701  hz_out4 = hz_out8;
1702  }
1703 
      /* Advance to the right-hand 8 columns. */
1704  src_tmp += 8;
1705  dst_tmp += 8;
1706  }
1707 }
1708 
1710  ptrdiff_t stride)
1711 {
      /*
       * 16x16 block: each row is filtered horizontally into 16-bit
       * intermediates, those are filtered vertically, and the result is
       * averaged with the rounded horizontal-only result of the row one
       * below each output row (hz_out3..hz_out6). The block is processed as
       * two 8-column halves, each in four passes of four rows. src and dst
       * share the same stride.
       */
1712  uint8_t *dst_tmp = dst;
      /* Two rows up and two columns left: origin of the 6-tap window. */
1713  const uint8_t *src_tmp = src - (2 * stride) - 2;
1714  uint32_t multiple8_cnt, loop_cnt;
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
1715  const int32_t filt_const0 = 0xfffb0001;
1716  const int32_t filt_const1 = 0x140014;
1717  const int32_t filt_const2 = 0x1fffb;
1718  v16u8 out0, out1;
1719  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1720  v16i8 mask2;
1721  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1722  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1723  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1724  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1725  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1726  v8i16 hz_out87_l, filt0, filt1, filt2;
1727  v4i32 tmp0, tmp1;
1728 
1729  filt0 = (v8i16) __msa_fill_w(filt_const0);
1730  filt1 = (v8i16) __msa_fill_w(filt_const1);
1731  filt2 = (v8i16) __msa_fill_w(filt_const2);
1732 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
1733  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1734 
      /* One iteration per 8-column half of the 16-wide block. */
1735  for (multiple8_cnt = 2; multiple8_cnt--;) {
1736  dst = dst_tmp;
1737  src = src_tmp;
1738 
1739  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1740  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1741  src += (5 * stride);
1742 
      /* Prime the vertical window with the first five filtered rows. */
1743  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1744  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1745  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1746  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1747  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1748 
      /* Four passes, each producing four output rows. */
1749  for (loop_cnt = 4; loop_cnt--;) {
1750  LD_SB4(src, stride, src5, src6, src7, src8);
1751  src += (4 * stride);
1752 
1753  XORI_B4_128_SB(src5, src6, src7, src8);
1754 
1755  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1756  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1757  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1758  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1759 
      /* Pair vertically adjacent intermediate rows; _r and _l cover the
       * two halves of each 8-element row. */
1760  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1761  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1762  hz_out43_r);
1763  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1764  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1765  hz_out43_l);
1766  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1767  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1768  hz_out87_r);
1769  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1770  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
1771  hz_out87_l);
1772 
      /* Vertical pass in 32-bit precision; dst0/dst2/dst4/dst6 are the
       * four filtered output rows of this pass. */
1773  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1774  filt1, filt2);
1775  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1776  filt1, filt2);
1777  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1778  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1779  filt1, filt2);
1780  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1781  filt1, filt2);
1782  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1783  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1784  filt1, filt2);
1785  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1786  filt1, filt2);
1787  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1788  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1789  filt1, filt2);
1790  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1791  filt1, filt2);
1792  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1793 
      /* Horizontal-only values of rows 3..6, one row below each of the
       * four output rows, rounded by 5 bits and saturated. */
1794  dst1 = __msa_srari_h(hz_out3, 5);
1795  dst3 = __msa_srari_h(hz_out4, 5);
1796  dst5 = __msa_srari_h(hz_out5, 5);
1797  dst7 = __msa_srari_h(hz_out6, 5);
1798  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1799 
      /* Average the two-pass result with the horizontal-only result. */
1800  dst0 = __msa_aver_s_h(dst0, dst1);
1801  dst1 = __msa_aver_s_h(dst2, dst3);
1802  dst2 = __msa_aver_s_h(dst4, dst5);
1803  dst3 = __msa_aver_s_h(dst6, dst7);
1804 
1805  out0 = PCKEV_XORI128_UB(dst0, dst1);
1806  out1 = PCKEV_XORI128_UB(dst2, dst3);
1807  ST8x4_UB(out0, out1, dst, stride);
1808  dst += (4 * stride);
1809 
      /* Slide the window down four rows for the next pass. */
1810  hz_out0 = hz_out4;
1811  hz_out1 = hz_out5;
1812  hz_out2 = hz_out6;
1813  hz_out3 = hz_out7;
1814  hz_out4 = hz_out8;
1815  }
1816 
      /* Advance to the right-hand 8 columns. */
1817  src_tmp += 8;
1818  dst_tmp += 8;
1819  }
1820 }
1821 
1823  ptrdiff_t stride)
1824 {
      /*
       * 8x8 block, fully unrolled into two groups of four output rows.
       * Thirteen source rows are filtered horizontally into 16-bit
       * intermediates, these are filtered vertically, and each output row is
       * averaged with the rounded horizontal-only result of the row aligned
       * with it (hz_out2..hz_out9). src and dst share the same stride.
       */
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
1825  const int32_t filt_const0 = 0xfffb0001;
1826  const int32_t filt_const1 = 0x140014;
1827  const int32_t filt_const2 = 0x1fffb;
1828  v16u8 out0, out1;
1829  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1830  v16i8 src11, src12, mask0, mask1, mask2;
1831  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1832  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1833  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1834  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1835  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1836  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1837  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1838  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1839  v4i32 tmp0, tmp1;
1840 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
1841  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1842 
1843  filt0 = (v8i16) __msa_fill_w(filt_const0);
1844  filt1 = (v8i16) __msa_fill_w(filt_const1);
1845  filt2 = (v8i16) __msa_fill_w(filt_const2);
1846 
      /* Two rows up and two columns left: origin of the 6-tap window. */
1847  src -= ((2 * stride) + 2);
1848 
1849  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1850  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1851  src += (5 * stride);
1852 
1853  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1854  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1855  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1856  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1857  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1858 
1859  LD_SB4(src, stride, src5, src6, src7, src8);
1860  src += (4 * stride);
1861  XORI_B4_128_SB(src5, src6, src7, src8);
1862 
1863  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1864  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1865  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1866  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
1867 
      /* Pair vertically adjacent intermediate rows; _r and _l cover the
       * two halves of each 8-element row. */
1868  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1870  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1871  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1872  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1874  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1875  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
1876 
      /* Vertical pass for output rows 0..3. */
1877  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1878  filt2);
1879  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1880  filt2);
1881  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1882  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1883  filt2);
1884  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1885  filt2);
1886  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1887  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1888  filt2);
1889  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1890  filt2);
1891  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1892  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1893  filt2);
1894  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1895  filt2);
1896  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1897 
      /* Round rows 2..5 in place; their interleaved copies were already
       * taken above, so the vertical taps are unaffected. */
1898  SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1899  SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1900 
1901  dst0 = __msa_aver_s_h(dst0, hz_out2);
1902  dst1 = __msa_aver_s_h(dst1, hz_out3);
1903  dst2 = __msa_aver_s_h(dst2, hz_out4);
1904  dst3 = __msa_aver_s_h(dst3, hz_out5);
1905 
1906  out0 = PCKEV_XORI128_UB(dst0, dst1);
1907  out1 = PCKEV_XORI128_UB(dst2, dst3);
1908  ST8x4_UB(out0, out1, dst, stride);
1909  dst += (4 * stride);
1910 
      /* Second group: four more source rows feed output rows 4..7. */
1911  LD_SB4(src, stride, src9, src10, src11, src12);
1912  XORI_B4_128_SB(src9, src10, src11, src12);
1913  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
1914  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
1915  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
1916  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /* Naming note: hz_out89_* / hz_out910_* pair rows (9, 8) and (10, 9)
       * in the same operand order as the other interleaves. */
1917  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1918  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1919  hz_out1211_r);
1920  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1921  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1922  hz_out1211_l);
1923  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1924  filt2);
1925  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1926  filt2);
1927  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1928  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1929  filt2);
1930  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1931  filt2);
1932  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1933  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1934  filt2);
1935  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1936  filt2);
1937  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1938  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1939  filt2);
1940  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1941  filt2);
1942  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1943 
      /* Round rows 6..9 in place for the second average. */
1944  SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1945  SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1946 
1947  dst0 = __msa_aver_s_h(dst0, hz_out6);
1948  dst1 = __msa_aver_s_h(dst1, hz_out7);
1949  dst2 = __msa_aver_s_h(dst2, hz_out8);
1950  dst3 = __msa_aver_s_h(dst3, hz_out9);
1951 
1952  out0 = PCKEV_XORI128_UB(dst0, dst1);
1953  out1 = PCKEV_XORI128_UB(dst2, dst3);
1954  ST8x4_UB(out0, out1, dst, stride);
1955 }
1956 
1958  ptrdiff_t stride)
1959 {
      /*
       * 8x8 block, fully unrolled into two groups of four output rows.
       * Thirteen source rows are filtered horizontally into 16-bit
       * intermediates, these are filtered vertically, and each output row is
       * averaged with the rounded horizontal-only result of the row one
       * below it (hz_out3..hz_out10). src and dst share the same stride.
       */
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
1960  const int32_t filt_const0 = 0xfffb0001;
1961  const int32_t filt_const1 = 0x140014;
1962  const int32_t filt_const2 = 0x1fffb;
1963  v16u8 out0, out1;
1964  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1965  v16i8 src11, src12, mask0, mask1, mask2;
1966  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1967  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1968  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1969  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1970  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1971  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1972  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1973  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1974  v4i32 tmp0, tmp1;
1975 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
1976  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1977 
1978  filt0 = (v8i16) __msa_fill_w(filt_const0);
1979  filt1 = (v8i16) __msa_fill_w(filt_const1);
1980  filt2 = (v8i16) __msa_fill_w(filt_const2);
1981 
      /* Two rows up and two columns left: origin of the 6-tap window. */
1982  src -= ((2 * stride) + 2);
1983 
1984  LD_SB5(src, stride, src0, src1, src2, src3, src4);
1985  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1986  src += (5 * stride);
1987 
1988  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1989  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1990  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1991  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1992  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1993 
1994  LD_SB4(src, stride, src5, src6, src7, src8);
1995  src += (4 * stride);
1996  XORI_B4_128_SB(src5, src6, src7, src8);
1997 
1998  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1999  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2000  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2001  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2002 
      /* Pair vertically adjacent intermediate rows; _r and _l cover the
       * two halves of each 8-element row. */
2003  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2005  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2006  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2007  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2009  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2010  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
2011 
      /* Vertical pass for output rows 0..3. */
2012  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2013  filt2);
2014  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2015  filt2);
2016  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2017  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2018  filt2);
2019  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2020  filt2);
2021  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2022  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2023  filt2);
2024  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2025  filt2);
2026  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2027  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2028  filt2);
2029  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2030  filt2);
2031  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2032 
      /* Round rows 3..6 in place; their interleaved copies were already
       * taken above, so the vertical taps are unaffected. */
2033  SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2034  SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2035 
2036  dst0 = __msa_aver_s_h(dst0, hz_out3);
2037  dst1 = __msa_aver_s_h(dst1, hz_out4);
2038  dst2 = __msa_aver_s_h(dst2, hz_out5);
2039  dst3 = __msa_aver_s_h(dst3, hz_out6);
2040 
2041  out0 = PCKEV_XORI128_UB(dst0, dst1);
2042  out1 = PCKEV_XORI128_UB(dst2, dst3);
2043  ST8x4_UB(out0, out1, dst, stride);
2044  dst += (4 * stride);
2045 
      /* Second group: four more source rows feed output rows 4..7. */
2046  LD_SB4(src, stride, src9, src10, src11, src12);
2047  XORI_B4_128_SB(src9, src10, src11, src12);
2048  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
2049  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
2050  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
2051  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /* Naming note: hz_out89_* / hz_out910_* pair rows (9, 8) and (10, 9)
       * in the same operand order as the other interleaves. */
2052  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2053  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2054  hz_out1211_r);
2055  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2056  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2057  hz_out1211_l);
2058  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2059  filt2);
2060  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2061  filt2);
2062  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2063  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2064  filt2);
2065  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2066  filt2);
2067  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2068  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2069  filt2);
2070  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2071  filt2);
2072  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2073  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2074  filt2);
2075  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2076  filt2);
2077  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2078 
      /* Round rows 7..10 in place for the second average. */
2079  SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2080  SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2081 
2082  dst0 = __msa_aver_s_h(dst0, hz_out7);
2083  dst1 = __msa_aver_s_h(dst1, hz_out8);
2084  dst2 = __msa_aver_s_h(dst2, hz_out9);
2085  dst3 = __msa_aver_s_h(dst3, hz_out10);
2086 
2087  out0 = PCKEV_XORI128_UB(dst0, dst1);
2088  out1 = PCKEV_XORI128_UB(dst2, dst3);
2089  ST8x4_UB(out0, out1, dst, stride);
2090 }
2091 
2093  ptrdiff_t stride)
2094 {
      /*
       * 4x4 block: nine source rows are filtered horizontally, two rows per
       * vector, the intermediates are filtered vertically, and the result is
       * averaged with the rounded horizontal-only values of rows 2..5 (the
       * rows aligned with the four output rows). src and dst share stride.
       */
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
2095  const int32_t filt_const0 = 0xfffb0001;
2096  const int32_t filt_const1 = 0x140014;
2097  const int32_t filt_const2 = 0x1fffb;
2098  v16u8 res;
2099  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2100  v16i8 mask0, mask1, mask2;
2101  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2102  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2103  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2104  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2105  v4i32 tmp0, tmp1;
2106 
      /* Offset 48 selects the "4 width cases" rows of luma_mask_arr, whose
       * upper-half indices (16 and above) address the second input vector. */
2107  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2108 
2109  filt0 = (v8i16) __msa_fill_w(filt_const0);
2110  filt1 = (v8i16) __msa_fill_w(filt_const1);
2111  filt2 = (v8i16) __msa_fill_w(filt_const2);
2112 
      /* Two rows up and two columns left: origin of the 6-tap window. */
2113  src -= ((2 * stride) + 2);
2114 
2115  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2116  src += (5 * stride);
2117  LD_SB4(src, stride, src5, src6, src7, src8);
2118 
2119  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2120  XORI_B4_128_SB(src5, src6, src7, src8);
2121 
      /* Each call filters a pair of rows: even row in the low half of the
       * result, odd row in the high half. Row 8 has no partner. */
2122  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2123  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2124  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2125  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2126  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Bring the odd rows (high halves) down into their own vectors;
       * presumably PCKOD_D2_SH packs the odd doublewords. */
2127  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2128  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2129 
2130  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2131  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2132  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2133  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2134 
      /* dst0 packs output rows 0-1, dst1 packs output rows 2-3. */
2135  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2136  filt2);
2137  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2138  filt2);
2139  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2140  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2141  filt2);
2142  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2143  filt2);
2144  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2145 
      /* hz_out2 still holds rows 2|3 and hz_out4 rows 4|5, matching the
       * row layout of dst0 and dst1. */
2146  SRARI_H2_SH(hz_out2, hz_out4, 5);
2147  SAT_SH2_SH(hz_out2, hz_out4, 7);
2148 
2149  dst0 = __msa_aver_s_h(dst0, hz_out2);
2150  dst1 = __msa_aver_s_h(dst1, hz_out4);
2151 
2152  res = PCKEV_XORI128_UB(dst0, dst1);
2153  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
2154 }
2155 
2157  ptrdiff_t stride)
2158 {
      /*
       * 4x4 block: nine source rows are filtered horizontally, two rows per
       * vector, the intermediates are filtered vertically, and the result is
       * averaged with the rounded horizontal-only values of rows 3..6 (one
       * row below each of the four output rows). src and dst share stride.
       */
      /* 16-bit coefficient pairs per 32-bit lane: (1, -5), (20, 20), (-5, 1);
       * assumes little-endian lane order. */
2159  const int32_t filt_const0 = 0xfffb0001;
2160  const int32_t filt_const1 = 0x140014;
2161  const int32_t filt_const2 = 0x1fffb;
2162  v16u8 res;
2163  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2164  v16i8 mask0, mask1, mask2;
2165  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2166  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2167  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2168  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
2169  v4i32 tmp0, tmp1;
2170 
      /* Offset 48 selects the "4 width cases" rows of luma_mask_arr, whose
       * upper-half indices (16 and above) address the second input vector. */
2171  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2172 
2173  filt0 = (v8i16) __msa_fill_w(filt_const0);
2174  filt1 = (v8i16) __msa_fill_w(filt_const1);
2175  filt2 = (v8i16) __msa_fill_w(filt_const2);
2176 
      /* Two rows up and two columns left: origin of the 6-tap window. */
2177  src -= ((2 * stride) + 2);
2178 
2179  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2180  src += (5 * stride);
2181  LD_SB4(src, stride, src5, src6, src7, src8);
2182 
2183  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2184  XORI_B4_128_SB(src5, src6, src7, src8);
2185 
      /* Each call filters a pair of rows: even row in the low half of the
       * result, odd row in the high half. Row 8 has no partner. */
2186  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2187  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2188  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2189  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2190  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Bring the odd rows (high halves) down into their own vectors;
       * presumably PCKOD_D2_SH packs the odd doublewords. */
2191  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2192  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2193 
2194  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2195  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2196  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2197  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2198 
      /* dst0 packs output rows 0-1, dst1 packs output rows 2-3. */
2199  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2200  filt2);
2201  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2202  filt2);
2203  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2204  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2205  filt2);
2206  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2207  filt2);
2208  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2209 
      /* Re-pair the low halves so hz_out0 holds rows 3|4 and hz_out1 holds
       * rows 5|6 (presumably PCKEV_D2_SH packs even doublewords); hz_out0
       * and hz_out1 are free to reuse since the interleaves are done. */
2210  PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
2211  SRARI_H2_SH(hz_out0, hz_out1, 5);
2212  SAT_SH2_SH(hz_out0, hz_out1, 7);
2213 
2214  dst0 = __msa_aver_s_h(dst0, hz_out0);
2215  dst1 = __msa_aver_s_h(dst1, hz_out1);
2216 
2217  res = PCKEV_XORI128_UB(dst0, dst1);
2218  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
2219 }
2220 
2222  ptrdiff_t stride)
2223 {
      /*
       * Vertical-only 6-tap filter over a 16-pixel-wide, 16-row block, done
       * in four passes of four rows. No averaging step: the rounded,
       * saturated filter output is stored directly. src and dst share the
       * same stride.
       */
2224  int32_t loop_cnt;
      /* Byte pairs packed per 16-bit lane: 0xfb01 -> (1, -5),
       * 0x1414 -> (20, 20), 0x01fb -> (-5, 1); assumes little-endian lanes. */
2225  int16_t filt_const0 = 0xfb01;
2226  int16_t filt_const1 = 0x1414;
2227  int16_t filt_const2 = 0x1fb;
2228  v16u8 res0, res1, res2, res3;
2229  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2230  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2231  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2232  v16i8 src65_l, src87_l, filt0, filt1, filt2;
2233  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2234 
2235  filt0 = (v16i8) __msa_fill_h(filt_const0);
2236  filt1 = (v16i8) __msa_fill_h(filt_const1);
2237  filt2 = (v16i8) __msa_fill_h(filt_const2);
      /* Rewind two rows so src0 is the topmost filter tap. */
2238  src -= (stride * 2);
2239 
2240  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2241  src += (5 * stride);
2242 
2243  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* Interleave adjacent rows; _r and _l together cover all 16 columns. */
2244  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2245  src32_r, src43_r);
2246  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2247  src32_l, src43_l);
2248 
      /* Four passes, each consuming four new rows and emitting four rows. */
2249  for (loop_cnt = 4; loop_cnt--;) {
2250  LD_SB4(src, stride, src5, src6, src7, src8);
2251  src += (4 * stride);
2252 
2253  XORI_B4_128_SB(src5, src6, src7, src8);
2254  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2255  src65_r, src76_r, src87_r);
2256  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2257  src65_l, src76_l, src87_l);
2258  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2259  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2260  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2261  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2262  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2263  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2264  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2265  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2266  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2267  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2268  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2269  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      /* Recombine both halves into full 16-byte rows, then XOR with 128;
       * presumably this reverses the signed bias applied on load. */
2270  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2271  out3_r, res0, res1, res2, res3);
2272  XORI_B4_128_UB(res0, res1, res2, res3);
2273  ST_UB4(res0, res1, res2, res3, dst, stride);
2274  dst += (4 * stride);
2275 
      /* Slide the window down four rows: the newest pairs become the
       * oldest, and src8 is the row the next interleave starts from. */
2276  src10_r = src54_r;
2277  src32_r = src76_r;
2278  src21_r = src65_r;
2279  src43_r = src87_r;
2280  src10_l = src54_l;
2281  src32_l = src76_l;
2282  src21_l = src65_l;
2283  src43_l = src87_l;
2284  src4 = src8;
2285  }
2286 }
2287 
2289  ptrdiff_t stride)
2290 {
 /*
  * 8x8 vertical 6-tap interpolation: reads 13 rows starting two rows above
  * `src` and writes the packed results to `dst` through ST8x8_UB, both
  * stepped by `stride`.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  * Confirm against the upstream file.
  */
 /*
  * Each 16-bit constant packs two signed byte taps (low byte, high byte):
  * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
  * Together they form the kernel (1, -5, 20, 20, -5, 1), whose sum is 32.
  */
2291  const int16_t filt_const0 = 0xfb01;
2292  const int16_t filt_const1 = 0x1414;
2293  const int16_t filt_const2 = 0x1fb;
2294  v16u8 out0, out1, out2, out3;
2295  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2296  v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2297  v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2298  v16i8 filt0, filt1, filt2;
2299  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2300 
 /* Replicate each tap pair across the whole vector. */
2301  filt0 = (v16i8) __msa_fill_h(filt_const0);
2302  filt1 = (v16i8) __msa_fill_h(filt_const1);
2303  filt2 = (v16i8) __msa_fill_h(filt_const2);
2304 
 /* The filter window starts two rows above the first output row. */
2305  src -= (stride * 2);
2306 
 /* 8 + 5 = 13 input rows (src0..src12) feed the 8 output rows. */
2307  LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2308  src += (8 * stride);
2309  LD_SB5(src, stride, src8, src9, src10, src11, src12);
 /*
  * Interleave the low halves of vertically adjacent rows so that each
  * byte pair holds the two samples one tap pair multiplies.
  *
  * NOTE: from the second group on, the variable names do not match the
  * rows they hold. Reading the arguments:
  *   src76_r  = rows (4,5)    src87_r   = rows (5,6)
  *   src98_r  = rows (6,7)    src109_r  = rows (7,8)
  *   src89_r  = rows (8,9)    src910_r  = rows (9,10)
  *   src1110_r = rows (10,11) src1211_r = rows (11,12)
  */
2310  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2311  src32_r, src43_r);
2312  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2313  src98_r, src109_r);
2314  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2315  src910_r, src1110_r, src1211_r);
 /*
  * XOR with 128 flips the top bit of every byte (unsigned 0..255 becomes
  * signed -128..127); the final PCKEV_XORI128_UB presumably undoes it.
  */
2316  XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
2317  XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
2318  XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
 /*
  * Output row k combines the row pairs (k,k+1), (k+2,k+3), (k+4,k+5)
  * with filt0, filt1, filt2 respectively.
  */
2319  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2320  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2321  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2322  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2323  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2324  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2325  out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2326  out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
 /* Rounding shift by 5 divides by 32, the sum of the six taps. */
2327  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2328  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
 /* Saturate (argument 7; presumably the signed 8-bit range) before packing. */
2329  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2330  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
 /* Pack two 8-sample rows per vector and return to unsigned bytes. */
2331  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
2332  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
2333  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
2334  out3 = PCKEV_XORI128_UB(out6_r, out7_r);
2335  ST8x8_UB(out0, out1, out2, out3, dst, stride);
2336 }
2337 
2339  ptrdiff_t stride)
2340 {
 /*
  * 4x4 vertical 6-tap interpolation: reads 9 rows starting two rows above
  * `src` and writes the result to `dst` through ST4x4_UB.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
 /*
  * Byte tap pairs (low byte, high byte): 0xfb01 -> (1, -5),
  * 0x1414 -> (20, 20), 0x01fb -> (-5, 1); the taps sum to 32.
  */
2341  const int16_t filt_const0 = 0xfb01;
2342  const int16_t filt_const1 = 0x1414;
2343  const int16_t filt_const2 = 0x1fb;
2344  v16u8 out;
2345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2346  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2347  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2348  v8i16 out10, out32;
2349 
2350  filt0 = (v16i8) __msa_fill_h(filt_const0);
2351  filt1 = (v16i8) __msa_fill_h(filt_const1);
2352  filt2 = (v16i8) __msa_fill_h(filt_const2);
2353 
 /* The filter window starts two rows above the first output row. */
2354  src -= (stride * 2);
2355 
 /* 5 + 4 = 9 input rows feed the 4 output rows. */
2356  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2357  src += (5 * stride);
2358  LD_SB4(src, stride, src5, src6, src7, src8);
2359 
 /* Interleave vertically adjacent rows (srcNM_r pairs row N with row M). */
2360  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2361  src32_r, src43_r);
2362  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2363  src76_r, src87_r);
 /*
  * Merge two consecutive row-pair vectors into one (e.g. src2110 combines
  * src10_r and src21_r) so that one dot product yields two output rows.
  */
2364  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2365  src76_r, src2110, src4332, src6554, src8776);
 /* Flip the top bit of every byte: unsigned 0..255 -> signed -128..127. */
2366  XORI_B4_128_SB(src2110, src4332, src6554, src8776);
 /* out10 carries output rows 0 and 1, out32 carries rows 2 and 3. */
2367  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2368  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
 /* Rounding shift by 5 divides by 32; then saturate before packing. */
2369  SRARI_H2_SH(out10, out32, 5);
2370  SAT_SH2_SH(out10, out32, 7);
 /* Pack to bytes, return to the unsigned range and store the block. */
2371  out = PCKEV_XORI128_UB(out10, out32);
2372  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2373 }
2374 
2376  ptrdiff_t stride)
2377 {
 /*
  * 16-wide, 16-row interpolation. Each output sample is the rounded
  * average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the even lane of each (i+2, i+3) pair, i.e. window column i+2.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
2378  uint32_t row;
2379  v16u8 out;
2380  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2381  v16i8 src11;
2382  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2383  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2384  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2385  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3). Indices exceed 7, so each shuffle draws on both
  * source vectors.
  */
2386  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2387  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2388  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2389  v8i16 minus5h = __msa_ldi_h(-5);
2390  v8i16 plus20h = __msa_ldi_h(20);
2391 
 /* The same pairings shifted by four lanes, for outputs i = 4..7. */
2392  mask3 = mask0 + 4;
2393  mask4 = mask1 + 4;
2394  mask5 = mask2 + 4;
2395 
 /* The window starts two rows above and two columns left of the output. */
2396  src -= ((2 * stride) + 2);
2397 
 /*
  * Prime five rows for each of two 16-byte column groups: src0..src4
  * start at `src`, src7..src11 start at `src + 8`.
  */
2398  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2399  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2400  src += (5 * stride);
 /* Flip the top bit of every byte: unsigned 0..255 -> signed -128..127. */
2401  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2402  XORI_B5_128_SB(src7, src8, src9, src10, src11);
2403 
 /* One output row of 16 samples per iteration. */
2404  for (row = 16; row--;) {
 /* Load the sixth row of the window at `src` (src5) and `src + 8` (src6). */
2405  LD_SB2(src, 8, src5, src6);
2406  src += stride;
2407  XORI_B2_128_SB(src5, src6);
2408 
 /*
  * Vertical pass over the six rows into 16-bit intermediates:
  * vt_res0/1 for the first column group, vt_res2/3 for the second.
  */
2409  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2410  vt_res0, vt_res1);
2411  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2412  vt_res2, vt_res3);
 /* Gather the tap-sharing pairs for outputs 0..3 (mask0..2) ... */
2413  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2414  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2415  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2416  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* ... and for outputs 4..7 (mask3..5) of each column group. */
2417  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2418  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2419  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2420  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32 bits: the outer pair is summed (weight 1) ... */
2421  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2422  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2423  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2424  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 /* ... then the -5 and +20 weighted pairs are accumulated. */
2425  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2426  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2427  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2428  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2429  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2430  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /*
  * The (i+2, i+3) pair vectors still hold the vertical-only sums; a
  * rounded shift by 5 turns them into vertical half-sample values.
  */
2431  dst0 = __msa_srari_h(shf_vec2, 5);
2432  dst1 = __msa_srari_h(shf_vec5, 5);
2433  dst2 = __msa_srari_h(shf_vec8, 5);
2434  dst3 = __msa_srari_h(shf_vec11, 5);
2435  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the even lane of each pair (window column i+2). */
2436  PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
 /* Narrow the 32-bit two-pass results to 16 bits, 8 outputs per vector. */
2437  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the two-pass result with the vertical-only result. */
2438  dst0 = __msa_aver_s_h(dst2, dst0);
2439  dst1 = __msa_aver_s_h(dst3, dst1);
 /* Pack to bytes, return to the unsigned range and store 16 bytes. */
2440  out = PCKEV_XORI128_UB(dst0, dst1);
2441  ST_UB(out, dst);
2442  dst += stride;
2443 
 /* Slide both six-row windows down by one row. */
2444  src0 = src1;
2445  src1 = src2;
2446  src2 = src3;
2447  src3 = src4;
2448  src4 = src5;
2449  src7 = src8;
2450  src8 = src9;
2451  src9 = src10;
2452  src10 = src11;
2453  src11 = src6;
2454  }
2455 }
2456 
2458  ptrdiff_t stride)
2459 {
 /*
  * 16-wide, 16-row interpolation. Each output sample is the rounded
  * average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the odd lane of each (i+2, i+3) pair, i.e. window column i+3.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
2460  uint32_t row;
2461  v16u8 out;
2462  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2463  v16i8 src11;
2464  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2465  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2466  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2467  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3). Indices exceed 7, so each shuffle draws on both
  * source vectors.
  */
2468  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2469  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2470  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2471  v8i16 minus5h = __msa_ldi_h(-5);
2472  v8i16 plus20h = __msa_ldi_h(20);
2473 
 /* The same pairings shifted by four lanes, for outputs i = 4..7. */
2474  mask3 = mask0 + 4;
2475  mask4 = mask1 + 4;
2476  mask5 = mask2 + 4;
2477 
 /* The window starts two rows above and two columns left of the output. */
2478  src -= ((2 * stride) + 2);
2479 
 /*
  * Prime five rows for each of two 16-byte column groups: src0..src4
  * start at `src`, src7..src11 start at `src + 8`.
  */
2480  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2481  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2482  src += (5 * stride);
 /* Flip the top bit of every byte: unsigned 0..255 -> signed -128..127. */
2483  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2484  XORI_B5_128_SB(src7, src8, src9, src10, src11);
2485 
 /* One output row of 16 samples per iteration. */
2486  for (row = 16; row--;) {
 /* Load the sixth row of the window at `src` (src5) and `src + 8` (src6). */
2487  LD_SB2(src, 8, src5, src6);
2488  src += stride;
2489  XORI_B2_128_SB(src5, src6);
2490 
 /*
  * Vertical pass over the six rows into 16-bit intermediates:
  * vt_res0/1 for the first column group, vt_res2/3 for the second.
  */
2491  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2492  vt_res0, vt_res1);
2493  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
2494  vt_res2, vt_res3);
 /* Gather the tap-sharing pairs for outputs 0..3 (mask0..2) ... */
2495  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2496  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2497  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2498  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* ... and for outputs 4..7 (mask3..5) of each column group. */
2499  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2500  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2501  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2502  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32 bits: the outer pair is summed (weight 1) ... */
2503  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2504  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2505  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2506  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 /* ... then the -5 and +20 weighted pairs are accumulated. */
2507  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2508  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2509  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2510  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2511  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2512  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /*
  * The (i+2, i+3) pair vectors still hold the vertical-only sums; a
  * rounded shift by 5 turns them into vertical half-sample values.
  */
2513  dst0 = __msa_srari_h(shf_vec2, 5);
2514  dst1 = __msa_srari_h(shf_vec5, 5);
2515  dst2 = __msa_srari_h(shf_vec8, 5);
2516  dst3 = __msa_srari_h(shf_vec11, 5);
2517  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the odd lane of each pair (window column i+3). */
2518  dst0 = __msa_pckod_h(dst2, dst0);
2519  dst1 = __msa_pckod_h(dst3, dst1);
 /* Narrow the 32-bit two-pass results to 16 bits, 8 outputs per vector. */
2520  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the two-pass result with the vertical-only result. */
2521  dst0 = __msa_aver_s_h(dst2, dst0);
2522  dst1 = __msa_aver_s_h(dst3, dst1);
 /* Pack to bytes, return to the unsigned range and store 16 bytes. */
2523  out = PCKEV_XORI128_UB(dst0, dst1);
2524  ST_UB(out, dst);
2525  dst += stride;
2526 
 /* Slide both six-row windows down by one row. */
2527  src0 = src1;
2528  src1 = src2;
2529  src2 = src3;
2530  src3 = src4;
2531  src4 = src5;
2532  src7 = src8;
2533  src8 = src9;
2534  src9 = src10;
2535  src10 = src11;
2536  src11 = src6;
2537  }
2538 }
2539 
2541  ptrdiff_t stride)
2542 {
 /*
  * 8-wide, 8-row interpolation, two output rows per loop iteration. Each
  * output sample is the rounded average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the even lane of each (i+2, i+3) pair, i.e. window column i+2.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
2543  uint32_t row;
2544  v16u8 out;
2545  v16i8 src0, src1, src2, src3, src4, src5, src6;
2546  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2547  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2548  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2549  v8i16 mask3, mask4, mask5;
2550  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3).
  */
2551  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2552  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2553  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2554  v8i16 minus5h = __msa_ldi_h(-5);
2555  v8i16 plus20h = __msa_ldi_h(20);
2556 
 /* The same pairings shifted by four lanes, for outputs i = 4..7. */
2557  mask3 = mask0 + 4;
2558  mask4 = mask1 + 4;
2559  mask5 = mask2 + 4;
2560 
 /* The window starts two rows above and two columns left of the output. */
2561  src -= ((2 * stride) + 2);
2562 
 /* Prime the first five rows of the vertical window. */
2563  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2564  src += (5 * stride);
 /* Flip the top bit of every byte: unsigned 0..255 -> signed -128..127. */
2565  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2566 
 /* 4 iterations x 2 rows = 8 output rows. */
2567  for (row = 4; row--;) {
2568  LD_SB2(src, stride, src5, src6);
2569  src += (2 * stride);
2570  XORI_B2_128_SB(src5, src6);
2571 
 /*
  * Vertical pass: vt_res0/1 use rows src0..src5 (first output row),
  * vt_res2/3 use rows src1..src6 (second output row).
  */
2572  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2573  vt_res0, vt_res1);
2574  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2575  vt_res2, vt_res3);
 /* Gather the tap-sharing pairs for outputs 0..3 (mask0..2) ... */
2576  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2577  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2578  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2579  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* ... and for outputs 4..7 (mask3..5) of each row. */
2580  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2581  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2582  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2583  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32 bits: the outer pair is summed (weight 1) ... */
2584  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2585  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2586  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2587  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 /* ... then the -5 and +20 weighted pairs are accumulated. */
2588  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2589  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2590  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2591  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2592  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2593  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /*
  * The (i+2, i+3) pair vectors still hold the vertical-only sums; a
  * rounded shift by 5 turns them into vertical half-sample values.
  */
2594  dst0 = __msa_srari_h(shf_vec2, 5);
2595  dst1 = __msa_srari_h(shf_vec5, 5);
2596  dst2 = __msa_srari_h(shf_vec8, 5);
2597  dst3 = __msa_srari_h(shf_vec11, 5);
2598  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the even lane of each pair (window column i+2). */
2599  PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
 /* Narrow the 32-bit results: dst2 = first row, dst3 = second row. */
2600  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the two-pass result with the vertical-only result. */
2601  dst0 = __msa_aver_s_h(dst2, dst0);
2602  dst1 = __msa_aver_s_h(dst3, dst1);
 /* Pack to bytes, return to the unsigned range and store two rows. */
2603  out = PCKEV_XORI128_UB(dst0, dst1);
2604  ST8x2_UB(out, dst, stride);
2605  dst += (2 * stride);
2606 
 /* Slide the six-row window down by two rows. */
2607  src0 = src2;
2608  src1 = src3;
2609  src2 = src4;
2610  src3 = src5;
2611  src4 = src6;
2612  }
2613 }
2614 
2616  ptrdiff_t stride)
2617 {
 /*
  * 8-wide, 8-row interpolation, two output rows per loop iteration. Each
  * output sample is the rounded average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the odd lane of each (i+2, i+3) pair, i.e. window column i+3.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
2618  uint32_t row;
2619  v16u8 out;
2620  v16i8 src0, src1, src2, src3, src4, src5, src6;
2621  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2622  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2623  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2624  v8i16 mask3, mask4, mask5;
2625  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3).
  */
2626  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2627  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2628  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2629  v8i16 minus5h = __msa_ldi_h(-5);
2630  v8i16 plus20h = __msa_ldi_h(20);
2631 
 /* The same pairings shifted by four lanes, for outputs i = 4..7. */
2632  mask3 = mask0 + 4;
2633  mask4 = mask1 + 4;
2634  mask5 = mask2 + 4;
2635 
 /* The window starts two rows above and two columns left of the output. */
2636  src -= ((2 * stride) + 2);
2637 
 /* Prime the first five rows of the vertical window. */
2638  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2639  src += (5 * stride);
 /* Flip the top bit of every byte: unsigned 0..255 -> signed -128..127. */
2640  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2641 
 /* 4 iterations x 2 rows = 8 output rows. */
2642  for (row = 4; row--;) {
2643  LD_SB2(src, stride, src5, src6);
2644  src += (2 * stride);
2645  XORI_B2_128_SB(src5, src6);
2646 
 /*
  * Vertical pass: vt_res0/1 use rows src0..src5 (first output row),
  * vt_res2/3 use rows src1..src6 (second output row).
  */
2647  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2648  vt_res0, vt_res1);
2649  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2650  vt_res2, vt_res3);
 /* Gather the tap-sharing pairs for outputs 0..3 (mask0..2) ... */
2651  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2652  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2653  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2654  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* ... and for outputs 4..7 (mask3..5) of each row. */
2655  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2656  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2657  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2658  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
 /* Horizontal pass in 32 bits: the outer pair is summed (weight 1) ... */
2659  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2660  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2661  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2662  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
 /* ... then the -5 and +20 weighted pairs are accumulated. */
2663  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2664  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2665  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2666  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2667  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2668  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
 /*
  * The (i+2, i+3) pair vectors still hold the vertical-only sums; a
  * rounded shift by 5 turns them into vertical half-sample values.
  */
2669  dst0 = __msa_srari_h(shf_vec2, 5);
2670  dst1 = __msa_srari_h(shf_vec5, 5);
2671  dst2 = __msa_srari_h(shf_vec8, 5);
2672  dst3 = __msa_srari_h(shf_vec11, 5);
2673  SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
 /* Keep the odd lane of each pair (window column i+3). */
2674  dst0 = __msa_pckod_h(dst2, dst0);
2675  dst1 = __msa_pckod_h(dst3, dst1);
 /* Narrow the 32-bit results: dst2 = first row, dst3 = second row. */
2676  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
 /* Average the two-pass result with the vertical-only result. */
2677  dst0 = __msa_aver_s_h(dst2, dst0);
2678  dst1 = __msa_aver_s_h(dst3, dst1);
 /* Pack to bytes, return to the unsigned range and store two rows. */
2679  out = PCKEV_XORI128_UB(dst0, dst1);
2680  ST8x2_UB(out, dst, stride);
2681  dst += (2 * stride);
2682 
 /* Slide the six-row window down by two rows. */
2683  src0 = src2;
2684  src1 = src3;
2685  src2 = src4;
2686  src3 = src5;
2687  src4 = src6;
2688  }
2689 }
2690 
2692  ptrdiff_t stride)
2693 {
 /*
  * 4x4 interpolation computed without a loop. Each output sample is the
  * rounded average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the even lane of each (i+2, i+3) pair, i.e. window column i+2.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
 /*
  * Byte tap pairs (low byte, high byte) for the vertical pass:
  * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
  */
2694  const int16_t filt_const0 = 0xfb01;
2695  const int16_t filt_const1 = 0x1414;
2696  const int16_t filt_const2 = 0x1fb;
2697  v16u8 out;
2698  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2699  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2700  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2701  v16i8 src76_l, src87_l, filt0, filt1, filt2;
2702  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2703  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2704  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3).
  */
2705  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2706  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2707  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2708  v8i16 minus5h = __msa_ldi_h(-5);
2709  v8i16 plus20h = __msa_ldi_h(20);
2710  v8i16 zeros = { 0 };
2711 
2712  filt0 = (v16i8) __msa_fill_h(filt_const0);
2713  filt1 = (v16i8) __msa_fill_h(filt_const1);
2714  filt2 = (v16i8) __msa_fill_h(filt_const2);
2715 
 /* The window starts two rows above and two columns left of the output. */
2716  src -= ((2 * stride) + 2);
2717 
 /* 5 + 4 = 9 input rows; each is XORed with 128 to become signed bytes. */
2718  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2719  src += (5 * stride);
2720  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2721  LD_SB4(src, stride, src5, src6, src7, src8);
2722  XORI_B4_128_SB(src5, src6, src7, src8);
2723 
 /*
  * Interleave vertically adjacent rows; the _r and _l variants keep the
  * right and left halves so that all loaded columns are filtered.
  */
2724  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2725  src32_r, src43_r);
2726  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2727  src76_r, src87_r);
2728  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2729  src32_l, src43_l);
2730  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2731  src76_l, src87_l);
 /* Vertical pass for output rows 0 (vt_res0/1) and 1 (vt_res2/3). */
2732  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2733  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2734  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2735  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 /* Gather tap-sharing pairs; shf_vec2 / shf_vec5 keep the (i+2, i+3) pairs. */
2736  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2737  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2738  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2739  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* Horizontal pass in 32 bits: weight-1 pair sum, then the -5 and +20 pairs. */
2740  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2741  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2742  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2743  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2744 
 /*
  * Same steps for output rows 2 and 3. shf_vec0/1/3/4 are reused, while
  * the (i+2, i+3) pairs go to shf_vec6 / shf_vec7 so that rows 0 and 1
  * (shf_vec2 / shf_vec5) stay available for the averaging step below.
  */
2745  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2746  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2747  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2748  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2749  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2750  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2751  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2752  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2753  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2754  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2755  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2756  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2757 
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2758  SRARI_W2_SW(hz_res0, hz_res1, 10);
2759  SAT_SW2_SW(hz_res0, hz_res1, 7);
2760  SRARI_W2_SW(hz_res2, hz_res3, 10);
2761  SAT_SW2_SW(hz_res2, hz_res3, 7);
2762 
 /* Vertical-only values for rows 0..3: rounded shift by 5 of the pair vectors. */
2763  dst0 = __msa_srari_h(shf_vec2, 5);
2764  dst1 = __msa_srari_h(shf_vec5, 5);
2765  dst2 = __msa_srari_h(shf_vec6, 5);
2766  dst3 = __msa_srari_h(shf_vec7, 5);
2767 
2768  SAT_SH2_SH(dst0, dst1, 7);
2769  SAT_SH2_SH(dst2, dst3, 7);
 /*
  * Spread the even halfword lanes (window column i+2) into 32-bit lanes,
  * interleaved with zeros, to line up with the 32-bit hz_res vectors.
  * NOTE(review): operand order of ILVEV_H2_SH is defined outside this
  * view - confirm which argument supplies the low halfword.
  */
2770  ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2771  ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
2772 
 /* Average the two-pass result with the vertical-only result, per row. */
2773  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2774  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2775  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2776  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2777 
 /* Narrow: dst0 = rows 0 and 1, dst2 = rows 2 and 3; then pack and store. */
2778  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2779  out = PCKEV_XORI128_UB(dst0, dst2);
2780  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2781 }
2782 
2784  ptrdiff_t stride)
2785 {
 /*
  * 4x4 interpolation computed without a loop. Each output sample is the
  * rounded average of
  *   (a) the vertical-then-horizontal 6-tap result (rounded shift by 10),
  *   (b) the vertical-only 6-tap result (rounded shift by 5) taken from
  *       the odd lane of each (i+2, i+3) pair, i.e. window column i+3.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
 /*
  * Byte tap pairs (low byte, high byte) for the vertical pass:
  * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x01fb -> (-5, 1).
  */
2786  const int16_t filt_const0 = 0xfb01;
2787  const int16_t filt_const1 = 0x1414;
2788  const int16_t filt_const2 = 0x1fb;
2789  v16u8 out;
2790  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2791  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2792  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2793  v16i8 src76_l, src87_l, filt0, filt1, filt2;
2794  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2795  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2796  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
 /*
  * Halfword shuffle indices that pair the samples sharing one tap weight,
  * for outputs i = 0..3: mask0 -> (i, i+5), mask1 -> (i+1, i+4),
  * mask2 -> (i+2, i+3).
  */
2797  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2798  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2799  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2800  v8i16 minus5h = __msa_ldi_h(-5);
2801  v8i16 plus20h = __msa_ldi_h(20);
2802  v8i16 zeros = { 0 };
2803 
2804  filt0 = (v16i8) __msa_fill_h(filt_const0);
2805  filt1 = (v16i8) __msa_fill_h(filt_const1);
2806  filt2 = (v16i8) __msa_fill_h(filt_const2);
2807 
 /* The window starts two rows above and two columns left of the output. */
2808  src -= ((2 * stride) + 2);
2809 
 /* 5 + 4 = 9 input rows; each is XORed with 128 to become signed bytes. */
2810  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2811  src += (5 * stride);
2812  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2813  LD_SB4(src, stride, src5, src6, src7, src8);
2814  XORI_B4_128_SB(src5, src6, src7, src8);
2815 
 /*
  * Interleave vertically adjacent rows; the _r and _l variants keep the
  * right and left halves so that all loaded columns are filtered.
  */
2816  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2817  src32_r, src43_r);
2818  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2819  src76_r, src87_r);
2820  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2821  src32_l, src43_l);
2822  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
2823  src76_l, src87_l);
2824 
 /* Vertical pass for output rows 0 (vt_res0/1) and 1 (vt_res2/3). */
2825  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2826  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2827  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2828  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
 /* Gather tap-sharing pairs; shf_vec2 / shf_vec5 keep the (i+2, i+3) pairs. */
2829  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2830  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2831  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2832  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
 /* Horizontal pass in 32 bits: weight-1 pair sum, then the -5 and +20 pairs. */
2833  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2834  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2835  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2836  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2837 
 /*
  * Same steps for output rows 2 and 3. shf_vec0/1/3/4 are reused, while
  * the (i+2, i+3) pairs go to shf_vec6 / shf_vec7 so that rows 0 and 1
  * (shf_vec2 / shf_vec5) stay available for the averaging step below.
  */
2838  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2839  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2840  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2841  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2842  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2843  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2844  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2845  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2846  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2847  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2848  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2849  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
2850 
 /* Two cascaded passes of gain 32 give 1024, hence the rounded shift by 10. */
2851  SRARI_W2_SW(hz_res0, hz_res1, 10);
2852  SAT_SW2_SW(hz_res0, hz_res1, 7);
2853  SRARI_W2_SW(hz_res2, hz_res3, 10);
2854  SAT_SW2_SW(hz_res2, hz_res3, 7);
2855 
 /* Vertical-only values for rows 0..3: rounded shift by 5 of the pair vectors. */
2856  dst0 = __msa_srari_h(shf_vec2, 5);
2857  dst1 = __msa_srari_h(shf_vec5, 5);
2858  dst2 = __msa_srari_h(shf_vec6, 5);
2859  dst3 = __msa_srari_h(shf_vec7, 5);
2860 
2861  SAT_SH2_SH(dst0, dst1, 7);
2862  SAT_SH2_SH(dst2, dst3, 7);
2863 
 /*
  * Spread the odd halfword lanes (window column i+3) into 32-bit lanes,
  * interleaved with zeros, to line up with the 32-bit hz_res vectors.
  * NOTE(review): confirm the operand order of __msa_ilvod_h against the
  * MSA reference (which argument lands in the low halfword).
  */
2864  dst0 = __msa_ilvod_h(zeros, dst0);
2865  dst1 = __msa_ilvod_h(zeros, dst1);
2866  dst2 = __msa_ilvod_h(zeros, dst2);
2867  dst3 = __msa_ilvod_h(zeros, dst3);
2868 
 /* Average the two-pass result with the vertical-only result, per row. */
2869  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2870  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2871  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2872  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
2873 
 /* Narrow: dst0 = rows 0 and 1, dst2 = rows 2 and 3; then pack and store. */
2874  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2875  out = PCKEV_XORI128_UB(dst0, dst2);
2876  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2877 }
2878 
2880  ptrdiff_t stride)
2881 {
 /*
  * 16x16 interpolation done as a horizontal pass followed by a vertical
  * pass, processed as two 8-column halves of 16 rows each.
  *
  * NOTE(review): the first line of this signature (return type, function
  * name and the parameters before `stride`) is missing from this listing;
  * `dst` and `src` are used below, so they are presumably those parameters.
  */
 /*
  * Each 32-bit constant packs two signed 16-bit taps (low half, high
  * half): 0xfffb0001 -> (1, -5), 0x00140014 -> (20, 20),
  * 0x0001fffb -> (-5, 1).
  */
2882  const int32_t filt_const0 = 0xfffb0001;
2883  const int32_t filt_const1 = 0x140014;
2884  const int32_t filt_const2 = 0x1fffb;
 /* Window origin: two rows above and two columns left of the output. */
2885  const uint8_t *src_tmp = src - (2 * stride) - 2;
2886  uint8_t *dst_tmp = dst;
2887  uint32_t multiple8_cnt, loop_cnt;
2888  v16u8 out0, out1;
2889  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2890  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2891  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2892  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2893  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2894  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2895  v8i16 hz_out87_l, filt0, filt1, filt2;
2896  v4i32 tmp0, tmp1;
2897 
2898  filt0 = (v8i16) __msa_fill_w(filt_const0);
2899  filt1 = (v8i16) __msa_fill_w(filt_const1);
2900  filt2 = (v8i16) __msa_fill_w(filt_const2);
2901 
 /* First three 16-byte rows of luma_mask_arr: the "8 width cases" masks. */
2902  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2903 
 /* Two passes, one per 8-column half of the 16-wide block. */
2904  for (multiple8_cnt = 2; multiple8_cnt--;) {
2905  src = src_tmp;
2906  dst = dst_tmp;
2907 
 /* Prime the vertical window with five horizontally filtered rows. */
2908  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2909  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2910  src += (5 * stride);
2911 
2912  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2913  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2914  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2915  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2916  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2917 
 /* 4 iterations x 4 rows = 16 output rows for this half. */
2918  for (loop_cnt = 4; loop_cnt--;) {
2919  LD_SB4(src, stride, src0, src1, src2, src3);
2920  XORI_B4_128_SB(src0, src1, src2, src3);
2921  src += (4 * stride);
2922 
 /* Horizontal pass on the four new rows. */
2923  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2924  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2925  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2926  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2927 
 /*
  * Interleave vertically adjacent filtered rows (hz_outNM pairs row N
  * with row M); _r and _l are the right and left halves of each vector.
  */
2928  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2929  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2930  hz_out43_r);
2931  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2932  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2933  hz_out43_l);
2934  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2935  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2936  hz_out87_r);
2937  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2938  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
2939  hz_out87_l);
2940 
 /*
  * Vertical pass in 32 bits, then narrow to 16 bits; dst0..dst3 are four
  * consecutive output rows. No rounding or saturation appears in this
  * function - presumably AVC_DOT_SW3_SW does it (TODO confirm).
  */
2941  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2942  filt1, filt2);
2943  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2944  filt1, filt2);
2945  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2946  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2947  filt1, filt2);
2948  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2949  filt1, filt2);
2950  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2951  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2952  filt1, filt2);
2953  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2954  filt1, filt2);
2955  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2956  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2957  filt1, filt2);
2958  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2959  filt1, filt2);
2960  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2961 
 /* Pack to bytes, return to the unsigned range and store four 8-byte rows. */
2962  out0 = PCKEV_XORI128_UB(dst0, dst1);
2963  out1 = PCKEV_XORI128_UB(dst2, dst3);
2964  ST8x4_UB(out0, out1, dst, stride);
2965  dst += (4 * stride);
2966 
 /* Slide the window down four rows: the last five filtered rows carry over. */
2967  hz_out0 = hz_out4;
2968  hz_out1 = hz_out5;
2969  hz_out2 = hz_out6;
2970  hz_out3 = hz_out7;
2971  hz_out4 = hz_out8;
2972  }
2973 
 /* Advance to the second 8-column half. */
2974  src_tmp += 8;
2975  dst_tmp += 8;
2976  }
2977 }
2978 
2980  ptrdiff_t stride)
2981 {
2982  const int32_t filt_const0 = 0xfffb0001;
2983  const int32_t filt_const1 = 0x140014;
2984  const int32_t filt_const2 = 0x1fffb;
2985  v16u8 out0, out1;
2986  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2987  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2988  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2989  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2990  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2991  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2992  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2993  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2994  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2995  v4i32 tmp0, tmp1;
2996 
2997  filt0 = (v8i16) __msa_fill_w(filt_const0);
2998  filt1 = (v8i16) __msa_fill_w(filt_const1);
2999  filt2 = (v8i16) __msa_fill_w(filt_const2);
3000 
3001  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3002 
3003  src -= ((2 * stride) + 2);
3004  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3005  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3006  src += (5 * stride);
3007 
3008  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3009  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3010  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3011  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3012  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3013 
3014  LD_SB4(src, stride, src0, src1, src2, src3);
3015  XORI_B4_128_SB(src0, src1, src2, src3);
3016  src += (4 * stride);
3017  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3018  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3019  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3020  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3021  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3023  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3024  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3025  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3027  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3028  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3029 
3030  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3031  filt2);
3032  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3033  filt2);
3034  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3035  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3036  filt2);
3037  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3038  filt2);
3039  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3040  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3041  filt2);
3042  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3043  filt2);
3044  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3045  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3046  filt2);
3047  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3048  filt2);
3049  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3050  out0 = PCKEV_XORI128_UB(dst0, dst1);
3051  out1 = PCKEV_XORI128_UB(dst2, dst3);
3052  ST8x4_UB(out0, out1, dst, stride);
3053  dst += (4 * stride);
3054 
3055  LD_SB4(src, stride, src0, src1, src2, src3);
3056  XORI_B4_128_SB(src0, src1, src2, src3);
3057  hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3058  hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3059  hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3060  hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3061  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3062  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3063  hz_out1211_r);
3064  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3065  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3066  hz_out1211_l);
3067  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3068  filt2);
3069  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3070  filt2);
3071  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3072  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3073  filt2);
3074  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3075  filt2);
3076  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3077  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3078  filt2);
3079  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3080  filt2);
3081  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3082  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3083  filt2);
3084  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3085  filt2);
3086  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3087  out0 = PCKEV_XORI128_UB(dst0, dst1);
3088  out1 = PCKEV_XORI128_UB(dst2, dst3);
3089  ST8x4_UB(out0, out1, dst, stride);
3090 }
3091 
3093  ptrdiff_t stride)
3094 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: loads 9 vectors (one per row, `stride` apart)
       * starting at src - 2 * stride - 2, filters them horizontally with
       * AVC_HORZ_FILTER_SH using the "4 width cases" shuffle masks, then
       * combines the per-row results vertically with AVC_DOT_SW3_SW and
       * stores a 4x4 block to dst. dst is written but never read here.
       * Presumably the centre half-pel 4x4 "put" case -- confirm against the
       * real signature.
       */
      /* Each 32-bit constant packs two 16-bit taps (high:low): -5:1, 20:20, 1:-5. */
3095  const int32_t filt_const0 = 0xfffb0001;
3096  const int32_t filt_const1 = 0x140014;
3097  const int32_t filt_const2 = 0x1fffb;
3098  v16u8 res;
3099  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3100  v16i8 mask0, mask1, mask2;
3101  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3102  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3103  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3104  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3105  v4i32 tmp0, tmp1;
3106 
      /* Offset 48 selects the three 16-byte "4 width cases" mask rows. */
3107  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3108 
3109  filt0 = (v8i16) __msa_fill_w(filt_const0);
3110  filt1 = (v8i16) __msa_fill_w(filt_const1);
3111  filt2 = (v8i16) __msa_fill_w(filt_const2);
3112 
      /* Step back two rows and two columns so the 6-tap window is available. */
3113  src -= ((2 * stride) + 2);
3114 
3115  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3116  src += (5 * stride);
3117  LD_SB4(src, stride, src5, src6, src7, src8);
3118 
      /* Presumably flips the top bit of every byte (unsigned -> signed range);
       * the XORI_*_128 macros are defined outside this view. */
3119  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120  XORI_B4_128_SB(src5, src6, src7, src8);
      /* Two rows are passed per call; the last row (src8) is paired with itself. */
3121  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3122  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3123  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3124  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3125  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Presumably moves the second row of each pair into the odd-numbered
       * hz_out vectors -- TODO confirm PCKOD_D2_SH semantics. */
3126  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3127  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
      /* Interleave each row with the row above it to feed the vertical pass. */
3128  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3129  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3130  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3131  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3132 
      /* One AVC_DOT_SW3_SW call per output row (four rows in total). */
3133  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3134  filt2);
3135  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3136  filt2);
3137  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3138  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3139  filt2);
3140  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3141  filt2);
3142  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /* Pack to bytes (presumably undoing the earlier xor with 128) and store
       * four 4-byte rows. */
3143  res = PCKEV_XORI128_UB(dst0, dst1);
3144  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
3145 }
3146 
3148  ptrdiff_t stride)
3149 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: four loop iterations, each handling four rows of
       * 16 output bytes. Every row is filtered horizontally with the taps
       * 1 (HADD), -5 and 20 (DPADD), scaled by SRARI 5 and SAT 7, averaged
       * with the source row shifted by 2 bytes, then averaged with the bytes
       * already in dst and written back. Presumably a 16x16 "avg"
       * quarter-pel case -- confirm against the real signature.
       */
3150  uint32_t loop_cnt;
3151  v16u8 dst0, dst1, dst2, dst3;
3152  v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3153  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3154  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3155  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3156  v16i8 minus5b = __msa_ldi_b(-5);
3157  v16i8 plus20b = __msa_ldi_b(20);
3158 
      /* "8 width cases" masks; the +8 copies index 8 bytes further along, which
       * is used below for the right half of each 16-wide row. */
3159  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3160  mask3 = mask0 + 8;
3161  mask4 = mask1 + 8;
3162  mask5 = mask2 + 8;
      /* Start two bytes to the left so the 6-tap window is available. */
3163  src -= 2;
3164 
3165  for (loop_cnt = 4; loop_cnt--;) {
      /* Two vectors per row: bytes at src and at src + 16. */
3166  LD_SB2(src, 16, src0, src1);
3167  src += stride;
3168  LD_SB2(src, 16, src2, src3);
3169  src += stride;
3170  LD_SB2(src, 16, src4, src5);
3171  src += stride;
3172  LD_SB2(src, 16, src6, src7);
3173  src += stride;
3174 
3175  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range);
       * macro defined outside this view. */
3176  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0 and 1: gather the tap pairs, then accumulate 1 / -5 / 20. */
3177  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3178  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3179  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3180  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3181  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3182  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3183  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3184  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3185  minus5b, res0, res1, res2, res3);
3186  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3187  plus20b, res0, res1, res2, res3);
      /* Rows 2 and 3: same sequence into res4..res7. */
3188  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3189  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3190  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3191  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3192  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3193  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3194  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3195  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3196  minus5b, res4, res5, res6, res7);
3197  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3198  plus20b, res4, res5, res6, res7);
      /* Shift by 2 bytes: since src was moved back by 2, this presumably
       * realigns each row with the caller's original src column. */
3199  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
3200  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3201  SRARI_H4_SH(res0, res1, res2, res3, 5);
3202  SRARI_H4_SH(res4, res5, res6, res7, 5);
3203  SAT_SH4_SH(res0, res1, res2, res3, 7);
3204  SAT_SH4_SH(res4, res5, res6, res7, 7);
3205  PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3206  PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
      /* Average the filtered row with the shifted source row (signed domain). */
3207  out0 = __msa_aver_s_b(out0, src0);
3208  out1 = __msa_aver_s_b(out1, src2);
3209  out2 = __msa_aver_s_b(out2, src4);
3210  out3 = __msa_aver_s_b(out3, src6);
3211  XORI_B4_128_SB(out0, out1, out2, out3);
      /* Second average, this time with the bytes already present in dst. */
3212  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3213  AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3214  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3215  dst += (4 * stride);
3216  }
3217 }
3218 
3220  ptrdiff_t stride)
3221 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: identical in structure to the 16-wide routine that
       * shifts the source by 2 bytes, except that the source row used in the
       * first average is shifted by 3 bytes. Four loop iterations of four
       * 16-byte rows each; the result is averaged with dst and written back.
       * Presumably the 16x16 "avg" quarter-pel case one column to the right --
       * confirm against the real signature.
       */
3222  uint32_t loop_cnt;
3223  v16u8 dst0, dst1, dst2, dst3;
3224  v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3225  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3226  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3227  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3228  v16i8 minus5b = __msa_ldi_b(-5);
3229  v16i8 plus20b = __msa_ldi_b(20);
3230 
      /* "8 width cases" masks plus copies offset by 8 for the right half. */
3231  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3232  mask3 = mask0 + 8;
3233  mask4 = mask1 + 8;
3234  mask5 = mask2 + 8;
      /* Start two bytes to the left so the 6-tap window is available. */
3235  src -= 2;
3236 
3237  for (loop_cnt = 4; loop_cnt--;) {
      /* Two vectors per row: bytes at src and at src + 16. */
3238  LD_SB2(src, 16, src0, src1);
3239  src += stride;
3240  LD_SB2(src, 16, src2, src3);
3241  src += stride;
3242  LD_SB2(src, 16, src4, src5);
3243  src += stride;
3244  LD_SB2(src, 16, src6, src7);
3245  src += stride;
3246 
3247  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3248  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0 and 1: gather the tap pairs, then accumulate 1 / -5 / 20. */
3249  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3250  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3251  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3252  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3253  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3254  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3255  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3256  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3257  minus5b, res0, res1, res2, res3);
3258  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3259  plus20b, res0, res1, res2, res3);
      /* Rows 2 and 3: same sequence into res4..res7. */
3260  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3261  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3262  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3263  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3264  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3265  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3266  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3267  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3268  minus5b, res4, res5, res6, res7);
3269  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3270  plus20b, res4, res5, res6, res7);
      /* Shift by 3 bytes: since src was moved back by 2, this presumably lines
       * each row up one byte to the right of the caller's original src column. */
3271  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
3272  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3273  SRARI_H4_SH(res0, res1, res2, res3, 5);
3274  SRARI_H4_SH(res4, res5, res6, res7, 5);
3275  SAT_SH4_SH(res0, res1, res2, res3, 7);
3276  SAT_SH4_SH(res4, res5, res6, res7, 7);
3277  PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3278  PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
      /* Average the filtered row with the shifted source row (signed domain). */
3279  out0 = __msa_aver_s_b(out0, src0);
3280  out1 = __msa_aver_s_b(out1, src2);
3281  out2 = __msa_aver_s_b(out2, src4);
3282  out3 = __msa_aver_s_b(out3, src6);
3283  XORI_B4_128_SB(out0, out1, out2, out3);
      /* Second average, this time with the bytes already present in dst. */
3284  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3285  AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3286  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3287  dst += (4 * stride);
3288  }
3289 }
3290 
3292  ptrdiff_t stride)
3293 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: loads 8 rows from src - 2, filters each one
       * horizontally with the taps 1 (HADD), -5 and 20 (DPADD), applies
       * SRARI 5 and SAT 7, averages the result with the source shifted by
       * 2 bytes, then averages with the 8 rows of 8 bytes read from dst and
       * stores them with ST8x8_UB. Presumably an 8x8 "avg" quarter-pel case --
       * confirm against the real signature.
       */
3294  uint64_t tp0, tp1, tp2, tp3;
3295  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3296  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3297  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3298  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3299  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3300  v16i8 minus5b = __msa_ldi_b(-5);
3301  v16i8 plus20b = __msa_ldi_b(20);
3302 
      /* "8 width cases" shuffle masks. */
3303  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
      /* Load starts two bytes to the left so the 6-tap window is available. */
3304  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3305  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0..3 -> res0..res3. */
3306  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3307  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3308  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3309  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3310  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3311  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3312  res0, res1, res2, res3);
3313  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3314  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3315  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3316  res0, res1, res2, res3);
      /* Rows 4..7 -> res4..res7. */
3317  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3318  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3319  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3320  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3321  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3322  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3323  res4, res5, res6, res7);
3324  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3325  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3326  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3327  res4, res5, res6, res7);
      /* Shift by 2 bytes: the load began at src - 2, so this presumably
       * realigns each row with the caller's src column. */
3328  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3329  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
3330  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
3331  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
      /* Presumably packs two 8-byte rows into each vector; note that only
       * src0, src1, src4 and src5 are used afterwards. */
3332  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3333  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3334  SRARI_H4_SH(res0, res1, res2, res3, 5);
3335  SRARI_H4_SH(res4, res5, res6, res7, 5);
3336  SAT_SH4_SH(res0, res1, res2, res3, 7);
3337  SAT_SH4_SH(res4, res5, res6, res7, 7);
3338  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3339  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
      /* Average the filtered rows with the shifted source rows (signed domain). */
3340  tmp0 = __msa_aver_s_b(tmp0, src0);
3341  tmp1 = __msa_aver_s_b(tmp1, src1);
3342  tmp2 = __msa_aver_s_b(tmp2, src4);
3343  tmp3 = __msa_aver_s_b(tmp3, src5);
3344  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
      /* Read the existing 8x8 dst block as eight 64-bit values, two per vector. */
3345  LD4(dst, stride, tp0, tp1, tp2, tp3);
3346  INSERT_D2_UB(tp0, tp1, dst0);
3347  INSERT_D2_UB(tp2, tp3, dst1);
3348  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3349  INSERT_D2_UB(tp0, tp1, dst2);
3350  INSERT_D2_UB(tp2, tp3, dst3);
      /* Second average, with the bytes already present in dst. */
3351  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3352  AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3353  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
3354 }
3355 
3357  ptrdiff_t stride)
3358 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: same structure as the 8-wide routine that shifts
       * the source by 2 bytes, except that the source used in the first
       * average is shifted by 3 bytes. Eight rows are loaded from src - 2,
       * filtered horizontally, averaged with the shifted source, averaged
       * with dst and stored with ST8x8_UB. Presumably the 8x8 "avg"
       * quarter-pel case one column to the right -- confirm against the real
       * signature.
       */
3359  uint64_t tp0, tp1, tp2, tp3;
3360  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3361  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3362  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3363  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3364  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3365  v16i8 minus5b = __msa_ldi_b(-5);
3366  v16i8 plus20b = __msa_ldi_b(20);
3367 
      /* "8 width cases" shuffle masks. */
3368  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
      /* Load starts two bytes to the left so the 6-tap window is available. */
3369  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3370  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0..3 -> res0..res3. */
3371  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3372  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3373  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3374  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3375  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3376  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3377  res0, res1, res2, res3);
3378  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3379  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3380  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3381  res0, res1, res2, res3);
      /* Rows 4..7 -> res4..res7. */
3382  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3383  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3384  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3385  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3386  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3387  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3388  res4, res5, res6, res7);
3389  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3390  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3391  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3392  res4, res5, res6, res7);
      /* Shift by 3 bytes: the load began at src - 2, so this presumably lines
       * each row up one byte to the right of the caller's src column. */
3393  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3394  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
3395  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
3396  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
      /* Presumably packs two 8-byte rows into each vector; note that only
       * src0, src1, src4 and src5 are used afterwards. */
3397  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3398  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3399  SRARI_H4_SH(res0, res1, res2, res3, 5);
3400  SRARI_H4_SH(res4, res5, res6, res7, 5);
3401  SAT_SH4_SH(res0, res1, res2, res3, 7);
3402  SAT_SH4_SH(res4, res5, res6, res7, 7);
3403  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3404  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
      /* Average the filtered rows with the shifted source rows (signed domain). */
3405  tmp0 = __msa_aver_s_b(tmp0, src0);
3406  tmp1 = __msa_aver_s_b(tmp1, src1);
3407  tmp2 = __msa_aver_s_b(tmp2, src4);
3408  tmp3 = __msa_aver_s_b(tmp3, src5);
3409  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
      /* Read the existing 8x8 dst block as eight 64-bit values, two per vector. */
3410  LD4(dst, stride, tp0, tp1, tp2, tp3);
3411  INSERT_D2_UB(tp0, tp1, dst0);
3412  INSERT_D2_UB(tp2, tp3, dst1);
3413  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3414  INSERT_D2_UB(tp0, tp1, dst2);
3415  INSERT_D2_UB(tp2, tp3, dst3);
      /* Second average, with the bytes already present in dst. */
3416  AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
3417  AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
3418  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
3419 }
3420 
3422  ptrdiff_t stride)
3423 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: loads 4 rows from src - 2, filters them
       * horizontally two rows per shuffle with the "4 width cases" masks
       * (taps 1, -5, 20), applies SRARI 5 and SAT 7, averages with the source
       * shifted by 2 bytes, then averages with the four 32-bit words read
       * from dst and stores a 4x4 block. Presumably a 4x4 "avg" quarter-pel
       * case -- confirm against the real signature.
       */
3424  uint32_t tp0, tp1, tp2, tp3;
3425  v16u8 dst0 = { 0 };
3426  v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3427  v16i8 mask0, mask1, mask2;
3428  v8i16 out0, out1;
3429  v16i8 minus5b = __msa_ldi_b(-5);
3430  v16i8 plus20b = __msa_ldi_b(20);
3431 
      /* Offset 48 selects the three 16-byte "4 width cases" mask rows. */
3432  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
      /* Load starts two bytes to the left so the 6-tap window is available. */
3433  LD_SB4(src - 2, stride, src0, src1, src2, src3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3434  XORI_B4_128_SB(src0, src1, src2, src3);
      /* Each shuffle takes a pair of rows, so vec0/out0 cover rows 0-1 and
       * vec1/out1 cover rows 2-3. */
3435  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3436  HADD_SB2_SH(vec0, vec1, out0, out1);
3437  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3438  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3439  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3440  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3441  SRARI_H2_SH(out0, out1, 5);
3442  SAT_SH2_SH(out0, out1, 7);
3443  res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
      /* Shift by 2 bytes: the load began at src - 2, so this presumably
       * realigns each row with the caller's src column. */
3444  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3445  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
      /* Gather the first 4 bytes of the four shifted rows into src0. */
3446  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3447  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3448  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
      /* Average with the shifted source (signed), then leave the signed domain. */
3449  res = __msa_aver_s_b(res, src0);
3450  res = (v16i8) __msa_xori_b((v16u8) res, 128);
      /* Read the existing 4x4 dst block and average with it before storing. */
3451  LW4(dst, stride, tp0, tp1, tp2, tp3);
3452  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3453  dst0 = __msa_aver_u_b((v16u8) res, dst0);
3454  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
3455 }
3456 
3458  ptrdiff_t stride)
3459 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: same structure as the 4-wide routine that shifts
       * the source by 2 bytes, except that the source used in the first
       * average is shifted by 3 bytes. Four rows are loaded from src - 2,
       * filtered horizontally, averaged with the shifted source, averaged
       * with dst and stored as a 4x4 block. Presumably the 4x4 "avg"
       * quarter-pel case one column to the right -- confirm against the real
       * signature.
       */
3460  uint32_t tp0, tp1, tp2, tp3;
3461  v16u8 dst0 = { 0 };
3462  v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
3463  v16i8 mask0, mask1, mask2;
3464  v8i16 out0, out1;
3465  v16i8 minus5b = __msa_ldi_b(-5);
3466  v16i8 plus20b = __msa_ldi_b(20);
3467 
      /* Offset 48 selects the three 16-byte "4 width cases" mask rows. */
3468  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
      /* Load starts two bytes to the left so the 6-tap window is available. */
3469  LD_SB4(src - 2, stride, src0, src1, src2, src3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3470  XORI_B4_128_SB(src0, src1, src2, src3);
      /* Each shuffle takes a pair of rows, so vec0/out0 cover rows 0-1 and
       * vec1/out1 cover rows 2-3. */
3471  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3472  HADD_SB2_SH(vec0, vec1, out0, out1);
3473  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3474  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
3475  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3476  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3477  SRARI_H2_SH(out0, out1, 5);
3478  SAT_SH2_SH(out0, out1, 7);
3479  res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
      /* Shift by 3 bytes: the load began at src - 2, so this presumably lines
       * each row up one byte to the right of the caller's src column. */
3480  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3481  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
      /* Gather the first 4 bytes of the four shifted rows into src0. */
3482  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3483  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3484  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
      /* Average with the shifted source (signed), then leave the signed domain. */
3485  res = __msa_aver_s_b(res, src0);
3486  res = (v16i8) __msa_xori_b((v16u8) res, 128);
      /* Read the existing 4x4 dst block and average with it before storing. */
3487  LW4(dst, stride, tp0, tp1, tp2, tp3);
3488  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3489  dst0 = __msa_aver_u_b((v16u8) res, dst0);
3490  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
3491 }
3492 
3494  ptrdiff_t stride)
3495 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: four loop iterations, each handling four rows of
       * 16 output bytes. Every row is filtered horizontally with the taps
       * 1 (HADD), -5 and 20 (DPADD), scaled by SRARI 5 and SAT 7, then
       * averaged with the bytes already in dst and written back. Unlike the
       * shifted-source variants, no average with the source is taken.
       * Presumably a 16x16 "avg" horizontal half-pel case -- confirm against
       * the real signature.
       */
3496  uint32_t loop_cnt;
3497  v16u8 dst0, dst1, dst2, dst3;
3498  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3499  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3500  v16i8 vec11;
3501  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3502  v16i8 minus5b = __msa_ldi_b(-5);
3503  v16i8 plus20b = __msa_ldi_b(20);
3504 
      /* "8 width cases" shuffle masks. */
3505  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
      /* Start two bytes to the left so the 6-tap window is available. */
3506  src -= 2;
3507 
3508  for (loop_cnt = 4; loop_cnt--;) {
      /* Two overlapping vectors per row (src and src + 8): one per 8-wide half,
       * so the same masks serve both halves. */
3509  LD_SB2(src, 8, src0, src1);
3510  src += stride;
3511  LD_SB2(src, 8, src2, src3);
3512  src += stride;
3513  LD_SB2(src, 8, src4, src5);
3514  src += stride;
3515  LD_SB2(src, 8, src6, src7);
3516  src += stride;
3517 
3518  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3519  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0 and 1 -> res0..res3. */
3520  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
3521  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3522  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
3523  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3524  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
3525  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
3526  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3527  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3528  minus5b, res0, res1, res2, res3);
3529  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3530  plus20b, res0, res1, res2, res3);
      /* Rows 2 and 3 -> res4..res7. */
3531  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3532  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3533  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3534  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3535  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3536  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3537  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3538  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3539  minus5b, res4, res5, res6, res7);
3540  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3541  plus20b, res4, res5, res6, res7);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3542  SRARI_H4_SH(res0, res1, res2, res3, 5);
3543  SRARI_H4_SH(res4, res5, res6, res7, 5);
3544  SAT_SH4_SH(res0, res1, res2, res3, 7);
3545  SAT_SH4_SH(res4, res5, res6, res7, 7);
      /* vec0..vec3 are reused here to hold the packed output rows. */
3546  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
3547  vec2, vec3);
3548  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
      /* Average with the bytes already present in dst, then store. */
3549  AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
3550  AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
3551  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3552  dst += (4 * stride);
3553  }
3554 }
3555 
3557  ptrdiff_t stride)
3558 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: loads 8 rows from src - 2, filters each one
       * horizontally with the taps 1 (HADD), -5 and 20 (DPADD), applies
       * SRARI 5 and SAT 7, then averages with the 8 rows of 8 bytes read from
       * dst and stores them with ST8x8_UB. No average with the source is
       * taken. Presumably an 8x8 "avg" horizontal half-pel case -- confirm
       * against the real signature.
       */
3559  uint64_t tp0, tp1, tp2, tp3;
      /* out0/1/4/5 hold filtered rows; out2/3/6/7 hold the existing dst rows. */
3560  v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
3561  v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
3562  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3563  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3564  v16i8 vec11;
3565  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3566  v16i8 minus5b = __msa_ldi_b(-5);
3567  v16i8 plus20b = __msa_ldi_b(20);
3568 
      /* "8 width cases" shuffle masks. */
3569  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3570 
      /* Load starts two bytes to the left so the 6-tap window is available. */
3571  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3572  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
      /* Rows 0..3 -> res0..res3. */
3573  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3574  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3575  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3576  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3577  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3578  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3579  res0, res1, res2, res3);
3580  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3581  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3582  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3583  res0, res1, res2, res3);
      /* Rows 4..7 -> res4..res7. */
3584  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3585  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3586  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3587  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3588  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3589  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3590  res4, res5, res6, res7);
3591  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3592  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3593  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3594  res4, res5, res6, res7);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3595  SRARI_H4_SH(res0, res1, res2, res3, 5);
3596  SRARI_H4_SH(res4, res5, res6, res7, 5);
3597  SAT_SH4_SH(res0, res1, res2, res3, 7);
3598  SAT_SH4_SH(res4, res5, res6, res7, 7);
      /* Pack to bytes (presumably undoing the earlier xor with 128), two rows
       * per vector. */
3599  out0 = PCKEV_XORI128_UB(res0, res1);
3600  out1 = PCKEV_XORI128_UB(res2, res3);
3601  out4 = PCKEV_XORI128_UB(res4, res5);
3602  out5 = PCKEV_XORI128_UB(res6, res7);
      /* Read the existing 8x8 dst block as eight 64-bit values, two per vector. */
3603  LD4(dst, stride, tp0, tp1, tp2, tp3);
3604  INSERT_D2_UB(tp0, tp1, out2);
3605  INSERT_D2_UB(tp2, tp3, out3);
3606  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3607  INSERT_D2_UB(tp0, tp1, out6);
3608  INSERT_D2_UB(tp2, tp3, out7);
      /* Average with the bytes already present in dst, then store. */
3609  AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
3610  AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
3611  ST8x8_UB(out0, out1, out4, out5, dst, stride);
3612 }
3613 
3615  ptrdiff_t stride)
3616 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: loads 4 rows from src - 2, filters them
       * horizontally two rows per shuffle with the "4 width cases" masks
       * (taps 1, -5, 20), applies SRARI 5 and SAT 7, then averages with the
       * four 32-bit words read from dst and stores a 4x4 block. No average
       * with the source is taken. Presumably a 4x4 "avg" horizontal half-pel
       * case -- confirm against the real signature.
       */
3617  uint32_t tp0, tp1, tp2, tp3;
3618  v16u8 res, dst0 = { 0 };
3619  v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
3620  v16i8 mask0, mask1, mask2;
3621  v8i16 res0, res1;
3622  v16i8 minus5b = __msa_ldi_b(-5);
3623  v16i8 plus20b = __msa_ldi_b(20);
3624 
      /* Offset 48 selects the three 16-byte "4 width cases" mask rows. */
3625  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
      /* Load starts two bytes to the left so the 6-tap window is available. */
3626  LD_SB4(src - 2, stride, src0, src1, src2, src3);
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3627  XORI_B4_128_SB(src0, src1, src2, src3);
      /* Each shuffle takes a pair of rows, so res0 covers rows 0-1 and res1
       * covers rows 2-3. */
3628  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3629  HADD_SB2_SH(vec0, vec1, res0, res1);
3630  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3631  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3632  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3633  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3634  SRARI_H2_SH(res0, res1, 5);
3635  SAT_SH2_SH(res0, res1, 7);
3636  res = PCKEV_XORI128_UB(res0, res1);
      /* Read the existing 4x4 dst block and average with it before storing. */
3637  LW4(dst, stride, tp0, tp1, tp2, tp3);
3638  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3639  res = __msa_aver_u_b(res, dst0);
3640  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
3641 }
3642 
3644  ptrdiff_t stride)
3645 {
      /*
       * NOTE(review): the first signature line (function name, dst and src
       * parameters) is missing from this listing; only `stride` is visible.
       *
       * Visible behaviour: vertical filter over 16-byte-wide rows. Five rows
       * are preloaded starting two rows above src; each of the four loop
       * iterations loads four more rows, produces four output rows with
       * AVC_DOT_SH3_SH, averages them with source rows src2..src5, averages
       * with the bytes already in dst and stores them. Presumably a 16x16
       * "avg" vertical quarter-pel case -- confirm against the real signature.
       */
3646  int32_t loop_cnt;
      /* Each 16-bit constant packs two 8-bit taps (high:low): -5:1, 20:20, 1:-5. */
3647  int16_t filt_const0 = 0xfb01;
3648  int16_t filt_const1 = 0x1414;
3649  int16_t filt_const2 = 0x1fb;
3650  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3651  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3652  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3653  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3654  v16i8 src65_l, src87_l, filt0, filt1, filt2;
3655  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3656 
3657  filt0 = (v16i8) __msa_fill_h(filt_const0);
3658  filt1 = (v16i8) __msa_fill_h(filt_const1);
3659  filt2 = (v16i8) __msa_fill_h(filt_const2);
3660 
      /* Start two rows above so the 6-tap vertical window is available. */
3661  src -= (stride * 2);
3662 
3663  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3664  src += (5 * stride);
3665 
      /* Presumably flips the top bit of every byte (unsigned -> signed range). */
3666  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* Interleave each row with the row above it; the _r and _l results
       * presumably cover the right and left 8 bytes of the 16-byte row. */
3667  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3668  src32_r, src43_r);
3669  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3670  src32_l, src43_l);
3671 
3672  for (loop_cnt = 4; loop_cnt--;) {
3673  LD_SB4(src, stride, src5, src6, src7, src8);
3674  src += (4 * stride);
3675 
3676  XORI_B4_128_SB(src5, src6, src7, src8);
3677  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3678  src65_r, src76_r, src87_r);
3679  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3680  src65_l, src76_l, src87_l);
      /* One call per output row and half; each takes three interleaved row
       * pairs, i.e. six source rows. */
3681  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3682  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3683  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3684  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3685  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3686  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3687  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3688  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
      /* Presumably a rounding shift by 5 followed by saturation to the signed
       * 8-bit range -- TODO confirm the SRARI/SAT macro semantics. */
3689  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3690  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3691  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3692  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3693  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3694  out3_r, res0, res1, res2, res3);
      /* Average with source rows src2..src5. Because loading began two rows
       * above src, src2 is the row at the caller's src for the first group. */
3695  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3696  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3697  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3698  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
3699  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3700  XORI_B4_128_UB(res0, res1, res2, res3);
      /* Second average, with the bytes already present in dst. */
3701  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3702  AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3703  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3704  dst += (4 * stride);
3705 
      /* Slide the window down four rows: reuse the newest interleaves and rows
       * as the oldest inputs of the next iteration. */
3706  src10_r = src54_r;
3707  src32_r = src76_r;
3708  src21_r = src65_r;
3709  src43_r = src87_r;
3710  src10_l = src54_l;
3711  src32_l = src76_l;
3712  src21_l = src65_l;
3713  src43_l = src87_l;
3714  src2 = src6;
3715  src3 = src7;
3716  src4 = src8;
3717  }
3718 }
3719 
3721  ptrdiff_t stride)
3722 {
      /*
       * Vertical 6-tap luma filter, averaged first with a source row and then
       * with the existing contents of dst, written back to dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel16_mc03_msa -- confirm.
       *
       * Produces 16 rows: 4 loop iterations, each loading/storing 4 dst rows
       * with LD_UB4/ST_UB4 (presumably 16 bytes per row -- confirm).
       */
3723  int32_t loop_cnt;
      /* Byte pairs packed in each halfword: (1, -5), (20, 20), (-5, 1). */
3724  int16_t filt_const0 = 0xfb01;
3725  int16_t filt_const1 = 0x1414;
3726  int16_t filt_const2 = 0x1fb;
3727  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
3728  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3729  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3730  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3731  v16i8 src65_l, src87_l, filt0, filt1, filt2;
3732  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3733 
3734  filt0 = (v16i8) __msa_fill_h(filt_const0);
3735  filt1 = (v16i8) __msa_fill_h(filt_const1);
3736  filt2 = (v16i8) __msa_fill_h(filt_const2);
3737 
      /* Start two rows above the block so src0 is row -2 and src2 is row 0. */
3738  src -= (stride * 2);
3739 
3740  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3741  src += (5 * stride);
3742 
      /* XOR with 128: presumably maps unsigned pixels to signed bytes. */
3743  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* Interleave adjacent rows (right and left halves) for the dot products. */
3744  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3745  src32_r, src43_r);
3746  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3747  src32_l, src43_l);
3748 
3749  for (loop_cnt = 4; loop_cnt--;) {
3750  LD_SB4(src, stride, src5, src6, src7, src8);
3751  src += (4 * stride);
3752 
3753  XORI_B4_128_SB(src5, src6, src7, src8);
3754  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3755  src65_r, src76_r, src87_r);
3756  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3757  src65_l, src76_l, src87_l);
      /* Four output rows, each from three interleaved row pairs. */
3758  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3759  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3760  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3761  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3762  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3763  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3764  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3765  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
      /* Rounding shift by 5, then saturate (presumably to signed 8-bit range). */
3766  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3767  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3768  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3769  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3770  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3771  out3_r, res0, res1, res2, res3);
      /*
       * Signed average with rows src3..src6, i.e. one row below each output
       * row (src2 is row 0); done while both operands are still XORed.
       */
3772  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3773  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3774  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3775  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3776  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
      /* Undo the XOR-128 before the unsigned average with dst. */
3777  XORI_B4_128_UB(res0, res1, res2, res3);
3778  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
3779  AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
3780  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3781  dst += (4 * stride);
3782 
      /* Slide the window down four rows for the next iteration. */
3783  src10_r = src54_r;
3784  src32_r = src76_r;
3785  src21_r = src65_r;
3786  src43_r = src87_r;
3787  src10_l = src54_l;
3788  src32_l = src76_l;
3789  src21_l = src65_l;
3790  src43_l = src87_l;
3791  src3 = src7;
3792  src4 = src8;
3793  }
3794 }
3795 
3797  ptrdiff_t stride)
3798 {
      /*
       * Vertical 6-tap luma filter for eight output rows in one pass, averaged
       * with source rows src2..src11 (rows 0..7) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel8_mc01_msa -- confirm.
       */
3799  uint64_t tp0, tp1, tp2, tp3;
      /* Byte pairs packed in each halfword: (1, -5), (20, 20), (-5, 1). */
3800  const int16_t filt_const0 = 0xfb01;
3801  const int16_t filt_const1 = 0x1414;
3802  const int16_t filt_const2 = 0x1fb;
3803  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
      /* There is no src5/src6 here: the row after src4 is named src7. */
3804  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3805  v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3806  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3807  v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3808  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3809 
3810  filt0 = (v16i8) __msa_fill_h(filt_const0);
3811  filt1 = (v16i8) __msa_fill_h(filt_const1);
3812  filt2 = (v16i8) __msa_fill_h(filt_const2);
3813 
      /* Start two rows above the block so src0 is row -2 and src2 is row 0. */
3814  src -= (stride * 2);
3815 
3816  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3817  src += (5 * stride);
3818 
      /* XOR with 128: presumably maps unsigned pixels to signed bytes. */
3819  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3820  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3821  src32_r, src43_r);
3822  LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3823  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3824  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3825  src87_r, src98_r, src109_r);
      /* Output rows 0..3. */
3826  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3827  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3828  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3829  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
      /* Pack the rows to average with: tmp0 = (src2, src3), tmp1 = (src4, src7). */
3830  PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
      /*
       * src10_r..src43_r are reused from here on: they now hold the
       * interleaves of src10..src14, not of src0..src4.
       */
3831  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3832  src21_r, src32_r, src43_r);
      /* Output rows 4..7. */
3833  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3834  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3835  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3836  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
      /* tmp2 = (src8, src9), tmp3 = (src10, src11). */
3837  PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
      /* Rounding shift by 5, then saturate (presumably to signed 8-bit range). */
3838  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3839  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3840  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3841  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3842 
      /* Gather eight 8-byte dst rows, two rows per vector. */
3843  LD4(dst, stride, tp0, tp1, tp2, tp3);
3844  INSERT_D2_UB(tp0, tp1, dst0);
3845  INSERT_D2_UB(tp2, tp3, dst1);
3846  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3847  INSERT_D2_UB(tp0, tp1, dst2);
3848  INSERT_D2_UB(tp2, tp3, dst3);
3849 
3850  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3851  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
      /* Signed average with the source rows while both are still XORed. */
3852  out0 = __msa_aver_s_b(out0, tmp0);
3853  out1 = __msa_aver_s_b(out1, tmp1);
3854  out2 = __msa_aver_s_b(out2, tmp2);
3855  out3 = __msa_aver_s_b(out3, tmp3);
      /* Undo the XOR-128 before the unsigned average with dst. */
3856  XORI_B4_128_SB(out0, out1, out2, out3);
3857  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3858  dst2, dst3);
3859  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
3860 }
3861 
3863  ptrdiff_t stride)
3864 {
      /*
       * Vertical 6-tap luma filter for eight output rows in one pass, averaged
       * with source rows src3..src12 (rows 1..8, i.e. one row below each output
       * row) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel8_mc03_msa -- confirm.
       */
3865  uint64_t tp0, tp1, tp2, tp3;
      /* Byte pairs packed in each halfword: (1, -5), (20, 20), (-5, 1). */
3866  const int16_t filt_const0 = 0xfb01;
3867  const int16_t filt_const1 = 0x1414;
3868  const int16_t filt_const2 = 0x1fb;
3869  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
      /* There is no src5/src6 here: the row after src4 is named src7. */
3870  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
3871  v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
3872  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
3873  v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
3874  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3875 
3876  filt0 = (v16i8) __msa_fill_h(filt_const0);
3877  filt1 = (v16i8) __msa_fill_h(filt_const1);
3878  filt2 = (v16i8) __msa_fill_h(filt_const2);
3879 
      /* Start two rows above the block so src0 is row -2 and src2 is row 0. */
3880  src -= (stride * 2);
3881 
3882  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3883  src += (5 * stride);
3884 
      /* XOR with 128: presumably maps unsigned pixels to signed bytes. */
3885  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3886  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3887  src32_r, src43_r);
3888  LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
3889  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
3890  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
3891  src87_r, src98_r, src109_r);
      /* Output rows 0..3. */
3892  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3893  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3894  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3895  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
      /* Pack the rows to average with: tmp0 = (src3, src4), tmp1 = (src7, src8). */
3896  PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
      /*
       * src10_r..src43_r are reused from here on: they now hold the
       * interleaves of src10..src14, not of src0..src4.
       */
3897  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
3898  src21_r, src32_r, src43_r);
      /* Output rows 4..7. */
3899  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
3900  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
3901  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
3902  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
      /* tmp2 = (src9, src10), tmp3 = (src11, src12). */
3903  PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
      /* Rounding shift by 5, then saturate (presumably to signed 8-bit range). */
3904  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3905  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3906  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3907  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3908 
      /* Gather eight 8-byte dst rows, two rows per vector. */
3909  LD4(dst, stride, tp0, tp1, tp2, tp3);
3910  INSERT_D2_UB(tp0, tp1, dst0);
3911  INSERT_D2_UB(tp2, tp3, dst1);
3912  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
3913  INSERT_D2_UB(tp0, tp1, dst2);
3914  INSERT_D2_UB(tp2, tp3, dst3);
3915 
3916  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3917  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
      /* Signed average with the source rows while both are still XORed. */
3918  out0 = __msa_aver_s_b(out0, tmp0);
3919  out1 = __msa_aver_s_b(out1, tmp1);
3920  out2 = __msa_aver_s_b(out2, tmp2);
3921  out3 = __msa_aver_s_b(out3, tmp3);
      /* Undo the XOR-128 before the unsigned average with dst. */
3922  XORI_B4_128_SB(out0, out1, out2, out3);
3923  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
3924  dst2, dst3);
3925  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
3926 }
3927 
3929  ptrdiff_t stride)
3930 {
      /*
       * Vertical 6-tap luma filter for a 4x4 block, averaged with source rows
       * src2..src5 (rows 0..3) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel4_mc01_msa -- confirm.
       */
3931  uint32_t tp0, tp1, tp2, tp3;
      /* Byte pairs packed in each halfword: (1, -5), (20, 20), (-5, 1). */
3932  int16_t filt_const0 = 0xfb01;
3933  int16_t filt_const1 = 0x1414;
3934  int16_t filt_const2 = 0x1fb;
3935  v16u8 res, dst0 = { 0 };
3936  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3937  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3938  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3939  v8i16 out10, out32;
3940 
3941  filt0 = (v16i8) __msa_fill_h(filt_const0);
3942  filt1 = (v16i8) __msa_fill_h(filt_const1);
3943  filt2 = (v16i8) __msa_fill_h(filt_const2);
3944 
      /* Start two rows above the block so src0 is row -2 and src2 is row 0. */
3945  src -= (stride * 2);
3946  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3947  src += (5 * stride);
3948 
3949  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3950  src32_r, src43_r);
      /* Combine two row-pair interleaves per vector so one dot product covers two rows. */
3951  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
      /* Only the combined vectors are XORed; src0..src8 themselves stay unsigned. */
3952  XORI_B2_128_SB(src2110, src4332);
3953  LD_SB4(src, stride, src5, src6, src7, src8);
3954  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3955  src76_r, src87_r);
3956  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
3957  XORI_B2_128_SB(src6554, src8776);
      /*
       * src32_r/src54_r are reused as scratch: gather word 0 of src2, src3,
       * src4 and src5 into one vector (four 4-byte rows) for the average below.
       */
3958  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3959  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3960  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3961  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3962  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
      /* Rounding shift by 5, then saturate (presumably to signed 8-bit range). */
3963  SRARI_H2_SH(out10, out32, 5);
3964  SAT_SH2_SH(out10, out32, 7);
3965  LW4(dst, stride, tp0, tp1, tp2, tp3);
3966  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
      /* Pack to bytes and XOR with 128 (per the macro name) before the unsigned averages. */
3967  res = PCKEV_XORI128_UB(out10, out32);
3968  res = __msa_aver_u_b(res, (v16u8) src32_r);
3969  dst0 = __msa_aver_u_b(res, dst0);
3970  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
3971 }
3972 
3974  ptrdiff_t stride)
3975 {
      /*
       * Vertical 6-tap luma filter for a 4x4 block, averaged with source rows
       * src3..src6 (rows 1..4, i.e. one row below each output row) and then
       * with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel4_mc03_msa -- confirm.
       */
3976  uint32_t tp0, tp1, tp2, tp3;
      /* Byte pairs packed in each halfword: (1, -5), (20, 20), (-5, 1). */
3977  int16_t filt_const0 = 0xfb01;
3978  int16_t filt_const1 = 0x1414;
3979  int16_t filt_const2 = 0x1fb;
3980  v16u8 res, dst0 = { 0 };
3981  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3982  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3983  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3984  v8i16 out10, out32;
3985 
3986  filt0 = (v16i8) __msa_fill_h(filt_const0);
3987  filt1 = (v16i8) __msa_fill_h(filt_const1);
3988  filt2 = (v16i8) __msa_fill_h(filt_const2);
3989 
      /* Start two rows above the block so src0 is row -2 and src2 is row 0. */
3990  src -= (stride * 2);
3991 
3992  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3993  src += (5 * stride);
3994 
3995  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3996  src32_r, src43_r);
      /* Combine two row-pair interleaves per vector so one dot product covers two rows. */
3997  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
      /* Only the combined vectors are XORed; src0..src8 themselves stay unsigned. */
3998  XORI_B2_128_SB(src2110, src4332);
3999  LD_SB4(src, stride, src5, src6, src7, src8);
4000  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4001  src76_r, src87_r);
4002  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4003  XORI_B2_128_SB(src6554, src8776);
4004  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4005  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
      /* Rounding shift by 5, then saturate (presumably to signed 8-bit range). */
4006  SRARI_H2_SH(out10, out32, 5);
4007  SAT_SH2_SH(out10, out32, 7);
4008  LW4(dst, stride, tp0, tp1, tp2, tp3);
4009  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
      /* Pack to bytes and XOR with 128 (per the macro name) before the unsigned averages. */
4010  res = PCKEV_XORI128_UB(out10, out32);
      /*
       * src32_r/src54_r are reused as scratch: gather word 0 of src3, src4,
       * src5 and src6 into one vector (four 4-byte rows) for the average below.
       */
4011  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
4012  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
4013  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
4014  res = __msa_aver_u_b(res, (v16u8) src32_r);
4015  dst0 = __msa_aver_u_b(res, dst0);
4016  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
4017 }
4018 
4020  ptrdiff_t stride)
4021 {
      /*
       * Thin wrapper: forwards to a helper, passing `src - (stride * 2)` (two
       * rows above src) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel16_mc11_msa calling
       * avc_luma_hv_qrt_and_aver_dst_16x16_msa -- confirm.
       */
4023  src - (stride * 2),
4024  dst, stride);
4025 }
4026 
4028  ptrdiff_t stride)
4029 {
      /*
       * Thin wrapper: forwards to a helper, passing
       * `src - (stride * 2) + sizeof(uint8_t)` (two rows above src, one byte
       * to the right) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel16_mc31_msa calling
       * avc_luma_hv_qrt_and_aver_dst_16x16_msa -- confirm.
       */
4031  src - (stride * 2) +
4032  sizeof(uint8_t),
4033  dst, stride);
4034 }
4035 
4037  ptrdiff_t stride)
4038 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_16x16_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel16_mc13_msa -- confirm.
       */
4039  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4040  src - (stride * 2),
4041  dst, stride);
4042 }
4043 
4045  ptrdiff_t stride)
4046 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_16x16_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src, one byte to the right.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel16_mc33_msa -- confirm.
       */
4047  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4048  src - (stride * 2) +
4049  sizeof(uint8_t),
4050  dst, stride);
4051 }
4052 
4054  ptrdiff_t stride)
4055 {
      /*
       * Thin wrapper: forwards to a helper, passing `src - (stride * 2)` (two
       * rows above src) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel8_mc11_msa calling
       * avc_luma_hv_qrt_and_aver_dst_8x8_msa -- confirm.
       */
4057  src - (stride * 2),
4058  dst, stride);
4059 }
4060 
4062  ptrdiff_t stride)
4063 {
      /*
       * Thin wrapper: forwards to a helper, passing
       * `src - (stride * 2) + sizeof(uint8_t)` (two rows above src, one byte
       * to the right) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel8_mc31_msa calling
       * avc_luma_hv_qrt_and_aver_dst_8x8_msa -- confirm.
       */
4065  src - (stride * 2) +
4066  sizeof(uint8_t), dst, stride);
4067 }
4068 
4070  ptrdiff_t stride)
4071 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_8x8_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel8_mc13_msa -- confirm.
       */
4072  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4073  src - (stride * 2),
4074  dst, stride);
4075 }
4076 
4078  ptrdiff_t stride)
4079 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_8x8_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src, one byte to the right.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel8_mc33_msa -- confirm.
       */
4080  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4081  src - (stride * 2) +
4082  sizeof(uint8_t), dst, stride);
4083 }
4084 
4085 
4087  ptrdiff_t stride)
4088 {
      /*
       * Thin wrapper: forwards to a helper, passing `src - (stride * 2)` (two
       * rows above src) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel4_mc11_msa calling
       * avc_luma_hv_qrt_and_aver_dst_4x4_msa -- confirm.
       */
4090  src - (stride * 2),
4091  dst, stride);
4092 }
4093 
4095  ptrdiff_t stride)
4096 {
      /*
       * Thin wrapper: forwards to a helper, passing
       * `src - (stride * 2) + sizeof(uint8_t)` (two rows above src, one byte
       * to the right) followed by dst and stride.
       *
       * NOTE(review): the signature's first line and the line holding the
       * callee name and its first argument are missing from this listing.
       * Presumably ff_avg_h264_qpel4_mc31_msa calling
       * avc_luma_hv_qrt_and_aver_dst_4x4_msa -- confirm.
       */
4098  src - (stride * 2) +
4099  sizeof(uint8_t), dst, stride);
4100 }
4101 
4103  ptrdiff_t stride)
4104 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_4x4_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel4_mc13_msa -- confirm.
       */
4105  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4106  src - (stride * 2),
4107  dst, stride);
4108 }
4109 
4111  ptrdiff_t stride)
4112 {
      /*
       * Thin wrapper around avc_luma_hv_qrt_and_aver_dst_4x4_msa.
       * First source pointer: one row below src, two bytes to the left.
       * Second source pointer: two rows above src, one byte to the right.
       *
       * NOTE(review): the signature's first line is missing from this listing;
       * presumably ff_avg_h264_qpel4_mc33_msa -- confirm.
       */
4113  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4114  src - (stride * 2) +
4115  sizeof(uint8_t), dst, stride);
4116 }
4117 
4119  ptrdiff_t stride)
4120 {
      /*
       * Horizontal-then-vertical luma filter over a 16x16 area. Each output
       * row is averaged with the rounded horizontal-only result of one
       * intermediate row (hz_out2..hz_out5) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel16_mc21_msa -- confirm.
       *
       * Work is split into two 8-column halves (outer loop), each processed as
       * 4 iterations of 4 rows (inner loop).
       */
4121  uint64_t tp0, tp1, tp2, tp3;
4122  uint8_t *dst_tmp = dst;
      /* Start two rows above and two bytes left of the block. */
4123  const uint8_t *src_tmp = src - (2 * stride) - 2;
4124  uint32_t multiple8_cnt, loop_cnt;
      /* Halfword pairs packed in each word: (1, -5), (20, 20), (-5, 1). */
4125  const int32_t filt_const0 = 0xfffb0001;
4126  const int32_t filt_const1 = 0x140014;
4127  const int32_t filt_const2 = 0x1fffb;
4128  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4129  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4130  v16i8 mask2;
4131  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4132  v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4133  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4134  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4135  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4136  v8i16 hz_out87_l, filt0, filt1, filt2;
4137  v4i32 tmp0_w, tmp1_w;
4138 
4139  filt0 = (v8i16) __msa_fill_w(filt_const0);
4140  filt1 = (v8i16) __msa_fill_w(filt_const1);
4141  filt2 = (v8i16) __msa_fill_w(filt_const2);
4142 
      /* First three 16-byte rows of luma_mask_arr (the "8 width cases"). */
4143  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4144 
4145  for (multiple8_cnt = 2; multiple8_cnt--;) {
4146  dst = dst_tmp;
4147  src = src_tmp;
4148 
4149  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4150  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4151  src += (5 * stride);
4152 
      /* Horizontal pass on the five leading rows. */
4153  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4154  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4155  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4156  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4157  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4158 
4159  for (loop_cnt = 4; loop_cnt--;) {
4160  LD_SB2(src, stride, src5, src6);
4161  src += (2 * stride);
4162 
4163  XORI_B2_128_SB(src5, src6);
4164  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4165  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
      /* Interleave adjacent intermediate rows for the vertical dot products. */
4166  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4167  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4168  hz_out43_r);
4169  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4170  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4171  hz_out43_l);
4172  ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4173  hz_out65_r);
4174  ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4175  hz_out65_l);
      /* Vertical pass, output rows 0 and 1 of this iteration. */
4176  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4177  filt1, filt2);
4178  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4179  filt1, filt2);
4180  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4181  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4182  filt1, filt2);
4183  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4184  filt1, filt2);
4185  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4186 
      /* Horizontal-only result for the same two rows: round by 5, saturate. */
4187  tmp1 = __msa_srari_h(hz_out2, 5);
4188  tmp3 = __msa_srari_h(hz_out3, 5);
4189  SAT_SH2_SH(tmp1, tmp3, 7);
4190 
4191  tmp0 = __msa_aver_s_h(tmp0, tmp1);
4192  tmp1 = __msa_aver_s_h(tmp2, tmp3);
4193 
4194  LD2(dst, stride, tp0, tp1);
4195  INSERT_D2_UB(tp0, tp1, dst0);
4196 
4197  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4198  dst0 = __msa_aver_u_b(out0, dst0);
4199  ST8x2_UB(dst0, dst, stride);
4200  dst += (2 * stride);
4201 
4202  LD_SB2(src, stride, src7, src8);
4203  src += (2 * stride);
4204 
4205  XORI_B2_128_SB(src7, src8);
4206  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4207  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4208  ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4209  hz_out87_r);
4210  ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4211  hz_out87_l);
      /* Vertical pass, output rows 2 and 3 of this iteration. */
4212  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4213  filt1, filt2);
4214  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4215  filt1, filt2);
4216  tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4217  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4218  filt1, filt2);
4219  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4220  filt1, filt2);
4221  tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4222 
4223  tmp5 = __msa_srari_h(hz_out4, 5);
4224  tmp7 = __msa_srari_h(hz_out5, 5);
4225  SAT_SH2_SH(tmp5, tmp7, 7);
4226 
4227  tmp2 = __msa_aver_s_h(tmp4, tmp5);
4228  tmp3 = __msa_aver_s_h(tmp6, tmp7);
4229 
4230  LD2(dst, stride, tp2, tp3);
4231  INSERT_D2_UB(tp2, tp3, dst1);
4232 
4233  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4234  dst1 = __msa_aver_u_b(out1, dst1);
4235  ST8x2_UB(dst1, dst, stride);
4236  dst += (2 * stride);
4237 
      /* Slide the intermediate-row window down by four rows. */
4238  hz_out0 = hz_out4;
4239  hz_out1 = hz_out5;
4240  hz_out2 = hz_out6;
4241  hz_out3 = hz_out7;
4242  hz_out4 = hz_out8;
4243  }
4244 
      /* Move on to the right-hand 8 columns. */
4245  src_tmp += 8;
4246  dst_tmp += 8;
4247  }
4248 }
4249 
4251  ptrdiff_t stride)
4252 {
      /*
       * Horizontal-then-vertical luma filter over a 16x16 area. Each output
       * row is averaged with the rounded horizontal-only result of one
       * intermediate row (hz_out3..hz_out6, i.e. one row further down than
       * the first output row's centre row hz_out2) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel16_mc23_msa -- confirm.
       *
       * Work is split into two 8-column halves (outer loop), each processed as
       * 4 iterations of 4 rows (inner loop).
       */
4253  uint64_t tp0, tp1, tp2, tp3;
4254  uint8_t *dst_tmp = dst;
      /* Start two rows above and two bytes left of the block. */
4255  const uint8_t *src_tmp = src - (2 * stride) - 2;
4256  uint32_t multiple8_cnt, loop_cnt;
      /* Halfword pairs packed in each word: (1, -5), (20, 20), (-5, 1). */
4257  const int32_t filt_const0 = 0xfffb0001;
4258  const int32_t filt_const1 = 0x140014;
4259  const int32_t filt_const2 = 0x1fffb;
4260  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4261  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4262  v16i8 mask2;
4263  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4264  v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4265  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4266  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4267  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4268  v8i16 hz_out87_l, filt0, filt1, filt2;
4269  v4i32 tmp0_w, tmp1_w;
4270 
4271  filt0 = (v8i16) __msa_fill_w(filt_const0);
4272  filt1 = (v8i16) __msa_fill_w(filt_const1);
4273  filt2 = (v8i16) __msa_fill_w(filt_const2);
4274 
      /* First three 16-byte rows of luma_mask_arr (the "8 width cases"). */
4275  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4276 
4277  for (multiple8_cnt = 2; multiple8_cnt--;) {
4278  dst = dst_tmp;
4279  src = src_tmp;
4280 
4281  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4282  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4283  src += (5 * stride);
4284 
      /* Horizontal pass on the five leading rows. */
4285  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4286  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4287  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4288  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4289  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4290 
4291  for (loop_cnt = 4; loop_cnt--;) {
4292  LD_SB2(src, stride, src5, src6);
4293  src += (2 * stride);
4294 
4295  XORI_B2_128_SB(src5, src6);
4296  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4297  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
      /* Interleave adjacent intermediate rows for the vertical dot products. */
4298  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4299  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4300  hz_out43_r);
4301  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4302  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4303  hz_out43_l);
4304  ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4305  ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4306 
      /* Vertical pass, output rows 0 and 1 of this iteration. */
4307  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4308  filt1, filt2);
4309  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4310  filt1, filt2);
4311  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4312  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4313  filt1, filt2);
4314  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4315  filt1, filt2);
4316  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4317 
      /* Horizontal-only result of the next row down: round by 5, saturate. */
4318  tmp1 = __msa_srari_h(hz_out3, 5);
4319  tmp3 = __msa_srari_h(hz_out4, 5);
4320  SAT_SH2_SH(tmp1, tmp3, 7);
4321 
4322  tmp0 = __msa_aver_s_h(tmp0, tmp1);
4323  tmp1 = __msa_aver_s_h(tmp2, tmp3);
4324 
4325  LD2(dst, stride, tp0, tp1);
4326  INSERT_D2_UB(tp0, tp1, dst0);
4327  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4328  dst0 = __msa_aver_u_b(out0, dst0);
4329  ST8x2_UB(dst0, dst, stride);
4330  dst += (2 * stride);
4331 
4332  LD_SB2(src, stride, src7, src8);
4333  src += (2 * stride);
4334 
4335  XORI_B2_128_SB(src7, src8);
4336  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4337  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4338  ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4339  hz_out87_r);
4340  ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4341  hz_out87_l);
      /* Vertical pass, output rows 2 and 3 of this iteration. */
4342  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4343  filt1, filt2);
4344  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4345  filt1, filt2);
4346  tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4347  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4348  filt1, filt2);
4349  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4350  filt1, filt2);
4351  tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4352 
4353  tmp5 = __msa_srari_h(hz_out5, 5);
4354  tmp7 = __msa_srari_h(hz_out6, 5);
4355  SAT_SH2_SH(tmp5, tmp7, 7);
4356 
4357  tmp2 = __msa_aver_s_h(tmp4, tmp5);
4358  tmp3 = __msa_aver_s_h(tmp6, tmp7);
4359 
4360  LD2(dst, stride, tp2, tp3);
4361  INSERT_D2_UB(tp2, tp3, dst1);
4362  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4363  dst1 = __msa_aver_u_b(out1, dst1);
4364  ST8x2_UB(dst1, dst, stride);
4365  dst += (2 * stride);
4366 
      /* Slide the intermediate-row window down by four rows. */
4367  hz_out0 = hz_out4;
4368  hz_out1 = hz_out5;
4369  hz_out2 = hz_out6;
4370  hz_out3 = hz_out7;
4371  hz_out4 = hz_out8;
4372  }
4373 
      /* Move on to the right-hand 8 columns. */
4374  src_tmp += 8;
4375  dst_tmp += 8;
4376  }
4377 }
4378 
4380  ptrdiff_t stride)
4381 {
      /*
       * Horizontal-then-vertical luma filter for eight output rows, fully
       * unrolled as two groups of four rows. Each output row is averaged with
       * the rounded horizontal-only result of one intermediate row
       * (hz_out2..hz_out9) and then with dst.
       *
       * NOTE(review): the first line of this definition (return type, name and
       * leading parameters) is missing from this listing; the body uses `dst`,
       * `src` and `stride`. Presumably ff_avg_h264_qpel8_mc21_msa -- confirm.
       */
      /* Halfword pairs packed in each word: (1, -5), (20, 20), (-5, 1). */
4382  const int32_t filt_const0 = 0xfffb0001;
4383  const int32_t filt_const1 = 0x140014;
4384  const int32_t filt_const2 = 0x1fffb;
4385  uint64_t tp0, tp1, tp2, tp3;
4386  v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4387  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4388  v16i8 src11, src12, mask0, mask1, mask2;
4389  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4390  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4391  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4392  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4393  v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4394  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4395  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4396  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4397  v4i32 tmp0_w, tmp1_w;
4398 
      /* First three 16-byte rows of luma_mask_arr (the "8 width cases"). */
4399  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4400 
4401  filt0 = (v8i16) __msa_fill_w(filt_const0);
4402  filt1 = (v8i16) __msa_fill_w(filt_const1);
4403  filt2 = (v8i16) __msa_fill_w(filt_const2);
4404 
      /* Start two rows above and two bytes left of the block. */
4405  src -= ((2 * stride) + 2);
4406 
4407  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4408  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4409  src += (5 * stride);
4410 
      /* Horizontal pass on rows src0..src8. */
4411  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4412  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4413  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4414  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4415  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4416 
4417  LD_SB4(src, stride, src5, src6, src7, src8);
4418  src += (4 * stride);
4419  XORI_B4_128_SB(src5, src6, src7, src8);
4420 
4421  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4422  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4423  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4424  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4425 
      /* Interleave adjacent intermediate rows for the vertical dot products. */
4426  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4428  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4429  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4430  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4432  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4433  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4434 
      /* Vertical pass, output rows 0..3 (tmp0..tmp3). */
4435  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4436  filt2);
4437  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4438  filt2);
4439  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4440  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4441  filt2);
4442  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4443  filt2);
4444  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4445  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4446  filt2);
4447  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4448  filt2);
4449  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4450  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4451  filt2);
4452  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4453  filt2);
4454  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4455 
      /*
       * hz_out2..hz_out5 are rounded and saturated in place; their interleaved
       * copies above were taken beforehand, and none of hz_out2..hz_out5 is
       * interleaved again below.
       */
4456  SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4457  SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4458 
4459  LD4(dst, stride, tp0, tp1, tp2, tp3);
4460  INSERT_D2_UB(tp0, tp1, dst0);
4461  INSERT_D2_UB(tp2, tp3, dst1);
4462 
4463  tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4464  tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4465  tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4466  tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4467 
4468  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4469  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4470  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4471  ST8x4_UB(dst0, dst1, dst, stride);
4472  dst += (4 * stride);
4473 
4474  LD_SB4(src, stride, src9, src10, src11, src12);
4475  XORI_B4_128_SB(src9, src10, src11, src12);
4476  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4477  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4478  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4479  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
      /*
       * Naming note: hz_out89_* and hz_out910_* hold the (hz_out9, hz_out8)
       * and (hz_out10, hz_out9) interleaves, in the same operand order as the
       * other pairs despite the different digit order in their names.
       */
4480  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4481  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4482  hz_out1211_r);
4483  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4484  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4485  hz_out1211_l);
      /* Vertical pass, output rows 4..7 (tmp0..tmp3 reused). */
4486  tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4487  filt2);
4488  tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4489  filt2);
4490  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4491  tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4492  filt2);
4493  tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4494  filt2);
4495  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4496  tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4497  filt2);
4498  tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4499  filt2);
4500  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4501  tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4502  filt2);
4503  tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4504  filt2);
4505  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4506 
      /* Horizontal-only result for rows 4..7: round by 5, saturate in place. */
4507  SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4508  SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4509 
4510  LD4(dst, stride, tp0, tp1, tp2, tp3);
4511  INSERT_D2_UB(tp0, tp1, dst0);
4512  INSERT_D2_UB(tp2, tp3, dst1);
4513 
4514  tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4515  tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4516  tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4517  tmp3 = __msa_aver_s_h(tmp3, hz_out9);
4518 
4519  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4520  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4521  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4522  ST8x4_UB(dst0, dst1, dst, stride);
4523 }
4524 
4526  ptrdiff_t stride)
4527 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel8_mc23_msa -- confirm against the original file.
       *
       * Works in two passes of four rows (two ST8x4_UB stores, with dst moved
       * by 4 * stride in between). Source rows are filtered horizontally with
       * AVC_HORZ_FILTER_SH, six consecutive filtered rows are combined with
       * AVC_DOT_SW3_SW, that result is averaged with a rounded horizontal-only
       * row, and the packed bytes are finally averaged with the bytes already
       * present in dst.
       */
      /*
       * Each constant packs two signed 16-bit taps (low half, high half):
       * (1, -5), (20, 20) and (-5, 1).
       */
4528  const int32_t filt_const0 = 0xfffb0001;
4529  const int32_t filt_const1 = 0x140014;
4530  const int32_t filt_const2 = 0x1fffb;
4531  uint64_t tp0, tp1, tp2, tp3;
4532  v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4533  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4534  v16i8 src11, src12, mask0, mask1, mask2;
4535  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4536  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4537  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4538  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4539  v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4540  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4541  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4542  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4543  v4i32 tmp0_w, tmp1_w;
4544 
      /* First three 16-byte rows of luma_mask_arr (its "8 width cases"). */
4545  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4546 
      /* Replicate each packed tap pair across a whole vector. */
4547  filt0 = (v8i16) __msa_fill_w(filt_const0);
4548  filt1 = (v8i16) __msa_fill_w(filt_const1);
4549  filt2 = (v8i16) __msa_fill_w(filt_const2);
4550 
      /* Start two rows above and two bytes to the left of src. */
4551  src -= ((2 * stride) + 2);
4552 
4553  LD_SB5(src, stride, src0, src1, src2, src3, src4);
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
4554  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4555  src += (5 * stride);
4556 
      /* Horizontal pass, one source row per call. */
4557  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4558  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4559  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4560  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4561  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4562 
4563  LD_SB4(src, stride, src5, src6, src7, src8);
4564  src += (4 * stride);
4565  XORI_B4_128_SB(src5, src6, src7, src8);
4566 
4567  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4568  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4569  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4570  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4571 
      /* Interleave neighbouring rows so each 32-bit lane holds a row pair
       * matching the packed tap pairs above (_r / _l: right / left halves). */
4572  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4574  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4575  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4576  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4578  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4579  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
4580 
      /* Vertical pass for the first four output rows; each tmpN is the
       * even-halfword pack of the right/left 32-bit results. */
4581  tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4582  filt2);
4583  tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4584  filt2);
4585  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4586  tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4587  filt2);
4588  tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4589  filt2);
4590  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4591  tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4592  filt2);
4593  tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4594  filt2);
4595  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4596  tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4597  filt2);
4598  tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4599  filt2);
4600  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4601 
      /* Round (shift right by 5) and saturate the horizontal-only rows 3..6.
       * They are modified in place, which is safe because their interleaved
       * copies were taken above. */
4602  SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4603  SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
4604 
      /* Fetch the four existing dst rows (8 bytes each), two per vector. */
4605  LD4(dst, stride, tp0, tp1, tp2, tp3);
4606  INSERT_D2_UB(tp0, tp1, dst0);
4607  INSERT_D2_UB(tp2, tp3, dst1);
4608 
      /* Average the two-dimensional result with the horizontal-only rows. */
4609  tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4610  tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4611  tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4612  tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4613 
      /* Pack to bytes, then average with the existing dst contents. */
4614  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4615  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4616  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4617  ST8x4_UB(dst0, dst1, dst, stride);
4618  dst += (4 * stride);
4619 
      /* Second pass: four more source rows; row pairs 54, 65, 76 and 87 from
       * the first pass are reused. */
4620  LD_SB4(src, stride, src9, src10, src11, src12);
4621  XORI_B4_128_SB(src9, src10, src11, src12);
4622  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4623  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4624  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4625  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4626  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4627  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4628  hz_out1211_r);
4629  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4630  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4631  hz_out1211_l);
4632  tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4633  filt2);
4634  tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4635  filt2);
4636  tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4637  tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4638  filt2);
4639  tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4640  filt2);
4641  tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4642  tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4643  filt2);
4644  tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4645  filt2);
4646  tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4647  tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4648  filt2);
4649  tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4650  filt2);
4651  tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4652 
      /* Horizontal-only rows 7..10 are the averaging partners for this pass. */
4653  SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4654  SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4655 
4656  LD4(dst, stride, tp0, tp1, tp2, tp3);
4657  INSERT_D2_UB(tp0, tp1, dst0);
4658  INSERT_D2_UB(tp2, tp3, dst1);
4659 
4660  tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4661  tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4662  tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4663  tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4664 
4665  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4666  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4667  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4668  ST8x4_UB(dst0, dst1, dst, stride);
4669 }
4670 
4672  ptrdiff_t stride)
4673 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel4_mc21_msa -- confirm against the original file.
       *
       * Single pass over nine source rows producing one ST4x4_UB store.
       * Rows are filtered horizontally two at a time, six consecutive filtered
       * rows are combined with AVC_DOT_SW3_SW, the result is averaged with
       * rounded horizontal-only rows (hz_out2 / hz_out4) and then with the
       * bytes already present in dst.
       */
4674  uint32_t tp0, tp1, tp2, tp3;
      /*
       * Each constant packs two signed 16-bit taps (low half, high half):
       * (1, -5), (20, 20) and (-5, 1).
       */
4675  const int32_t filt_const0 = 0xfffb0001;
4676  const int32_t filt_const1 = 0x140014;
4677  const int32_t filt_const2 = 0x1fffb;
4678  v16u8 res, out = { 0 };
4679  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4680  v16i8 mask0, mask1, mask2;
4681  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4682  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4683  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4684  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4685  v4i32 tmp0, tmp1;
4686 
      /* Masks from offset 48 of luma_mask_arr (its "4 width cases"); their
       * indices >= 16 pick bytes from the second shuffle operand. */
4687  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4688 
4689  filt0 = (v8i16) __msa_fill_w(filt_const0);
4690  filt1 = (v8i16) __msa_fill_w(filt_const1);
4691  filt2 = (v8i16) __msa_fill_w(filt_const2);
4692 
      /* Start two rows above and two bytes to the left of src. */
4693  src -= ((2 * stride) + 2);
4694 
4695  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4696  src += (5 * stride);
4697  LD_SB4(src, stride, src5, src6, src7, src8);
4698 
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
4699  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4700  XORI_B4_128_SB(src5, src6, src7, src8);
4701 
      /* Two source rows per call: even-numbered hz_out vectors carry a row
       * pair (row n in the low half, row n + 1 in the high half). */
4702  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4703  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4704  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4705  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4706  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Pull the high-half rows out into the odd-numbered hz_out vectors. */
4707  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4708  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4709 
      /* Interleave neighbouring rows to match the packed tap pairs. */
4710  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4711  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4712  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4713  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4714 
      /* Vertical pass: dst0 packs output rows 0 and 1, dst1 rows 2 and 3. */
4715  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4716  filt2);
4717  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4718  filt2);
4719  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4720  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4721  filt2);
4722  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4723  filt2);
4724  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4725 
      /* hz_out2 (rows 2, 3) and hz_out4 (rows 4, 5) are the horizontal-only
       * averaging partners: round by 5 bits and saturate. */
4726  SRARI_H2_SH(hz_out2, hz_out4, 5);
4727  SAT_SH2_SH(hz_out2, hz_out4, 7);
4728 
4729  dst0 = __msa_aver_s_h(dst0, hz_out2);
4730  dst1 = __msa_aver_s_h(dst1, hz_out4);
      /* Gather the four existing 4-byte dst rows into one vector. */
4731  LW4(dst, stride, tp0, tp1, tp2, tp3);
4732  INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
      /* Pack to bytes, average with the existing dst contents, store. */
4733  res = PCKEV_XORI128_UB(dst0, dst1);
4734  res = __msa_aver_u_b(res, out);
4735  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
4736 }
4737 
4739  ptrdiff_t stride)
4740 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel4_mc23_msa -- confirm against the original file.
       *
       * Single pass over nine source rows producing one ST4x4_UB store.
       * Rows are filtered horizontally two at a time, six consecutive filtered
       * rows are combined with AVC_DOT_SW3_SW, the result is averaged with
       * rounded horizontal-only rows 3..6 and then with the bytes already
       * present in dst.
       */
      /*
       * Each constant packs two signed 16-bit taps (low half, high half):
       * (1, -5), (20, 20) and (-5, 1).
       */
4741  const int32_t filt_const0 = 0xfffb0001;
4742  const int32_t filt_const1 = 0x140014;
4743  const int32_t filt_const2 = 0x1fffb;
4744  uint32_t tp0, tp1, tp2, tp3;
4745  v16u8 res, out = { 0 };
4746  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4747  v16i8 mask0, mask1, mask2;
4748  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4749  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4750  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4751  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
4752  v4i32 tmp0, tmp1;
4753 
      /* Masks from offset 48 of luma_mask_arr (its "4 width cases"); their
       * indices >= 16 pick bytes from the second shuffle operand. */
4754  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4755 
4756  filt0 = (v8i16) __msa_fill_w(filt_const0);
4757  filt1 = (v8i16) __msa_fill_w(filt_const1);
4758  filt2 = (v8i16) __msa_fill_w(filt_const2);
4759 
      /* Start two rows above and two bytes to the left of src. */
4760  src -= ((2 * stride) + 2);
4761 
4762  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4763  src += (5 * stride);
4764  LD_SB4(src, stride, src5, src6, src7, src8);
4765 
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
4766  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4767  XORI_B4_128_SB(src5, src6, src7, src8);
4768 
      /* Two source rows per call: even-numbered hz_out vectors carry a row
       * pair (row n in the low half, row n + 1 in the high half). */
4769  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4770  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4771  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4772  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4773  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
      /* Pull the high-half rows out into the odd-numbered hz_out vectors. */
4774  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4775  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4776 
      /* Interleave neighbouring rows to match the packed tap pairs. */
4777  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4778  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4779  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4780  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4781 
      /* Vertical pass: dst0 packs output rows 0 and 1, dst1 rows 2 and 3. */
4782  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4783  filt2);
4784  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4785  filt2);
4786  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4787  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4788  filt2);
4789  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4790  filt2);
4791  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4792 
      /* Reuse hz_out0 / hz_out1 as scratch: low halves of (hz_out3, hz_out4)
       * and of (hz_out5, hz_out6), i.e. horizontal-only rows 3..6. */
4793  PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4794  SRARI_H2_SH(hz_out0, hz_out1, 5);
4795  SAT_SH2_SH(hz_out0, hz_out1, 7);
4796 
4797  dst0 = __msa_aver_s_h(dst0, hz_out0);
4798  dst1 = __msa_aver_s_h(dst1, hz_out1);
      /* Gather the four existing 4-byte dst rows into one vector. */
4799  LW4(dst, stride, tp0, tp1, tp2, tp3);
4800  INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
      /* Pack to bytes, average with the existing dst contents, store. */
4801  res = PCKEV_XORI128_UB(dst0, dst1);
4802  res = __msa_aver_u_b(res, out);
4803  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
4804 }
4805 
4807  ptrdiff_t stride)
4808 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel16_mc02_msa -- confirm against the original file.
       *
       * Vertical-only filter over full 16-byte rows. The loop runs four times
       * and handles four rows per iteration: interleaved row pairs are fed to
       * AVC_DOT_SH3_SH, the result is rounded, saturated, packed to bytes and
       * averaged with the bytes already present in dst.
       */
4809  int32_t loop_cnt;
      /*
       * Each constant packs two signed 8-bit taps (low byte, high byte):
       * (1, -5), (20, 20) and (-5, 1).
       */
4810  int16_t filt_const0 = 0xfb01;
4811  int16_t filt_const1 = 0x1414;
4812  int16_t filt_const2 = 0x1fb;
4813  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4814  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4815  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4816  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4817  v16i8 src65_l, src87_l, filt0, filt1, filt2;
4818  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4819 
4820  filt0 = (v16i8) __msa_fill_h(filt_const0);
4821  filt1 = (v16i8) __msa_fill_h(filt_const1);
4822  filt2 = (v16i8) __msa_fill_h(filt_const2);
      /* Start two rows above src (no horizontal offset). */
4823  src -= (stride * 2);
4824 
      /* Prime the five-row history needed before the first output row. */
4825  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4826  src += (5 * stride);
4827 
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
4828  XORI_B5_128_SB(src0, src1, src2, src3, src4);
      /* Byte-interleave neighbouring rows; _r / _l are the right and left
       * halves of the 16-byte row. */
4829  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4830  src32_r, src43_r);
4831  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
4832  src32_l, src43_l);
4833 
4834  for (loop_cnt = 4; loop_cnt--;) {
4835  LD_SB4(src, stride, src5, src6, src7, src8);
4836  src += (4 * stride);
4837 
4838  XORI_B4_128_SB(src5, src6, src7, src8);
4839  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4840  src65_r, src76_r, src87_r);
4841  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4842  src65_l, src76_l, src87_l);
      /* Four output rows, right half then left half. */
4843  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4844  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4845  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4846  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4847  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4848  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4849  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4850  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
      /* Rounding shift right by 5, then saturate. */
4851  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4852  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4853  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
4854  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      /* Existing dst rows, needed for the final average. */
4855  LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4856  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4857  out3_r, res0, res1, res2, res3);
      /* Undo the earlier XOR with 128 before averaging as unsigned bytes. */
4858  XORI_B4_128_UB(res0, res1, res2, res3);
4859  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
4860  AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
4861  ST_UB4(res0, res1, res2, res3, dst, stride);
4862  dst += (4 * stride);
4863 
      /* Slide the row history down by four rows for the next iteration. */
4864  src10_r = src54_r;
4865  src32_r = src76_r;
4866  src21_r = src65_r;
4867  src43_r = src87_r;
4868  src10_l = src54_l;
4869  src32_l = src76_l;
4870  src21_l = src65_l;
4871  src43_l = src87_l;
4872  src4 = src8;
4873  }
4874 }
4875 
4877  ptrdiff_t stride)
4878 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel8_mc02_msa -- confirm against the original file.
       *
       * Vertical-only filter, fully unrolled: thirteen source rows are loaded,
       * eight AVC_DOT_SH3_SH results are computed from the right-half
       * interleaves, rounded, saturated, packed to bytes, averaged with the
       * bytes already present in dst and written with one ST8x8_UB.
       */
4879  uint64_t tp0, tp1, tp2, tp3;
      /*
       * Each constant packs two signed 8-bit taps (low byte, high byte):
       * (1, -5), (20, 20) and (-5, 1).
       */
4880  const int16_t filt_const0 = 0xfb01;
4881  const int16_t filt_const1 = 0x1414;
4882  const int16_t filt_const2 = 0x1fb;
4883  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4884  v16u8 out0, out1, out2, out3;
4885  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4886  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4887  v16i8 filt0, filt1, filt2;
4888  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4889 
4890  filt0 = (v16i8) __msa_fill_h(filt_const0);
4891  filt1 = (v16i8) __msa_fill_h(filt_const1);
4892  filt2 = (v16i8) __msa_fill_h(filt_const2);
4893 
      /* Start two rows above src (no horizontal offset). */
4894  src -= (stride * 2);
4895 
4896  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4897  src += (5 * stride);
4898 
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
4899  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4900  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4901  src32_r, src43_r);
4902 
      /* There is no src5/src6 here: src7 directly follows src4, so src76_r is
       * the (src7, src4) interleave despite its name. */
4903  LD_SB4(src, stride, src7, src8, src9, src10);
4904  src += (4 * stride);
4905  XORI_B4_128_SB(src7, src8, src9, src10);
4906  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4907  src87_r, src98_r, src109_r);
      /* Output rows 0..3. */
4908  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4909  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4910  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4911  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
4912 
      /* src0..src3 and src10_r..src43_r are recycled for the last four source
       * rows; the first new interleave pairs src0 with the row variable src10. */
4913  LD_SB4(src, stride, src0, src1, src2, src3);
4914  XORI_B4_128_SB(src0, src1, src2, src3);
4915  ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4916  src21_r, src32_r, src43_r);
      /* Output rows 4..7. */
4917  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4918  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4919  out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4920  out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
4921 
      /* Fetch the eight existing 8-byte dst rows, two rows per vector. */
4922  LD4(dst, stride, tp0, tp1, tp2, tp3);
4923  INSERT_D2_UB(tp0, tp1, dst0);
4924  INSERT_D2_UB(tp2, tp3, dst1);
4925  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4926  INSERT_D2_UB(tp0, tp1, dst2);
4927  INSERT_D2_UB(tp2, tp3, dst3);
4928 
      /* Rounding shift right by 5, then saturate. */
4929  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4930  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
4931  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4932  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
      /* Pack to bytes, average with the existing dst contents, store. */
4933  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
4934  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
4935  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
4936  out3 = PCKEV_XORI128_UB(out6_r, out7_r);
4937  AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4938  dst2, dst3);
4939  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
4940 }
4941 
4943  ptrdiff_t stride)
4944 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel4_mc02_msa -- confirm against the original file.
       *
       * Vertical-only filter for a block stored with ST4x4_UB. Nine source
       * rows are loaded; interleaved row pairs are packed two per vector so
       * that two AVC_DOT_SH3_SH calls yield all four output rows, which are
       * then averaged with the bytes already present in dst.
       */
4945  uint32_t tp0, tp1, tp2, tp3;
      /*
       * Each constant packs two signed 8-bit taps (low byte, high byte):
       * (1, -5), (20, 20) and (-5, 1).
       */
4946  int16_t filt_const0 = 0xfb01;
4947  int16_t filt_const1 = 0x1414;
4948  int16_t filt_const2 = 0x1fb;
4949  v16u8 res, dst0 = { 0 };
4950  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4951  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4952  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4953  v8i16 out10, out32;
4954 
4955  filt0 = (v16i8) __msa_fill_h(filt_const0);
4956  filt1 = (v16i8) __msa_fill_h(filt_const1);
4957  filt2 = (v16i8) __msa_fill_h(filt_const2);
4958 
      /* Start two rows above src (no horizontal offset). */
4959  src -= (stride * 2);
4960  LD_SB5(src, stride, src0, src1, src2, src3, src4);
4961  src += (5 * stride);
4962 
4963  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4964  src32_r, src43_r);
      /* Put two interleaved row pairs in one vector (e.g. src2110 holds the
       * 10 pair in its low half and the 21 pair in its high half). */
4965  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
      /* Here the XOR with 128 is applied after interleaving, on the combined
       * vectors, rather than on the individual rows. */
4966  XORI_B2_128_SB(src2110, src4332);
4967  LD_SB4(src, stride, src5, src6, src7, src8);
4968  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4969  src76_r, src87_r);
4970  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4971  XORI_B2_128_SB(src6554, src8776);
      /* out10 carries output rows 0 and 1, out32 rows 2 and 3. */
4972  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4973  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
      /* Rounding shift right by 5, then saturate. */
4974  SRARI_H2_SH(out10, out32, 5);
4975  SAT_SH2_SH(out10, out32, 7);
      /* Gather the four existing 4-byte dst rows into one vector. */
4976  LW4(dst, stride, tp0, tp1, tp2, tp3);
4977  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
      /* Pack to bytes, average with the existing dst contents, store. */
4978  res = PCKEV_XORI128_UB(out10, out32);
4979  dst0 = __msa_aver_u_b(res, dst0);
4980  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
4981 }
4982 
4984  ptrdiff_t stride)
4985 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel16_mc12_msa -- confirm against the original file.
       *
       * One 16-byte output row per loop iteration, 16 iterations. Each row is
       * handled as two halves (loads at src and src + 8). Per half: a vertical
       * combination of six rows (AVC_CALC_DPADD_B_6PIX_2COEFF_SH) is followed
       * by a horizontal combination built from halfword shuffles; that result
       * is averaged with the rounded vertical-only values and finally with the
       * bytes already present in dst.
       */
4986  uint32_t row;
4987  v16u8 out, dst0;
4988  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4989  v16i8 src11;
4990  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4991  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4992  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4993  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing the samples that share a tap:
       * mask0 -> (i, i + 5), mask1 -> (i + 1, i + 4), mask2 -> (i + 2, i + 3)
       * for i = 0..3.
       */
4994  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4995  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4996  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
4997  v8i16 minus5h = __msa_ldi_h(-5);
4998  v8i16 plus20h = __msa_ldi_h(20);
4999 
      /* Same pairings shifted by four positions, i.e. i = 4..7. */
5000  mask3 = mask0 + 4;
5001  mask4 = mask1 + 4;
5002  mask5 = mask2 + 4;
5003 
      /* Start two rows above and two bytes to the left of src. */
5004  src -= ((2 * stride) + 2);
5005 
      /* Prime the five-row history for both halves (src0..4 and src7..11). */
5006  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5007  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5008  src += (5 * stride);
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
5009  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5010  XORI_B5_128_SB(src7, src8, src9, src10, src11);
5011 
5012  for (row = 16; row--;) {
      /* Newest row: src5 for the first half, src6 for the half at +8. */
5013  LD_SB2(src, 8, src5, src6);
5014  src += stride;
5015  XORI_B2_128_SB(src5, src6);
5016  dst0 = LD_UB(dst);
5017 
      /* Vertical pass over six rows, kept at 16-bit precision. */
5018  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5019  vt_res0, vt_res1);
5020  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5021  vt_res2, vt_res3);
      /* Gather the tap pairs for output positions 0..3 (mask0..2) and
       * 4..7 (mask3..5) of each half. */
5022  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5023  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5024  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5025  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5026  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5027  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5028  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5029  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Outer pair summed as-is; the inner pairs are then accumulated with
       * weights -5 and 20. */
5030  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5031  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5032  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5033  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5034  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5035  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5036  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5037  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5038  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5039  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values come from the mask2/mask5 shuffles (centre
       * pairs), rounded by 5 bits. */
5040  tmp0 = __msa_srari_h(shf_vec2, 5);
5041  tmp1 = __msa_srari_h(shf_vec5, 5);
5042  tmp2 = __msa_srari_h(shf_vec8, 5);
5043  tmp3 = __msa_srari_h(shf_vec11, 5);
5044  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Keep the even halfwords: the first member of each centre pair
       * (vertical results at indices 2..9). */
5045  PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
      /* Narrow the 32-bit two-pass results to halfwords. */
5046  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5047  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5048  tmp1 = __msa_aver_s_h(tmp3, tmp1);
      /* Pack to bytes, average with the existing dst row, store. */
5049  out = PCKEV_XORI128_UB(tmp0, tmp1);
5050  out = __msa_aver_u_b(out, dst0);
5051  ST_UB(out, dst);
5052  dst += stride;
5053 
      /* Slide both five-row histories down by one row. */
5054  src0 = src1;
5055  src1 = src2;
5056  src2 = src3;
5057  src3 = src4;
5058  src4 = src5;
5059  src7 = src8;
5060  src8 = src9;
5061  src9 = src10;
5062  src10 = src11;
5063  src11 = src6;
5064  }
5065 }
5066 
5068  ptrdiff_t stride)
5069 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel16_mc32_msa -- confirm against the original file.
       *
       * One 16-byte output row per loop iteration, 16 iterations. Each row is
       * handled as two halves (loads at src and src + 8). Per half: a vertical
       * combination of six rows (AVC_CALC_DPADD_B_6PIX_2COEFF_SH) is followed
       * by a horizontal combination built from halfword shuffles; that result
       * is averaged with the rounded vertical-only values taken from the odd
       * shuffle positions, and finally with the bytes already present in dst.
       */
5070  uint32_t row;
5071  v16u8 out, dst0;
5072  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5073  v16i8 src11;
5074  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5075  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5076  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5077  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing the samples that share a tap:
       * mask0 -> (i, i + 5), mask1 -> (i + 1, i + 4), mask2 -> (i + 2, i + 3)
       * for i = 0..3.
       */
5078  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5079  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5080  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5081  v8i16 minus5h = __msa_ldi_h(-5);
5082  v8i16 plus20h = __msa_ldi_h(20);
5083 
      /* Same pairings shifted by four positions, i.e. i = 4..7. */
5084  mask3 = mask0 + 4;
5085  mask4 = mask1 + 4;
5086  mask5 = mask2 + 4;
5087 
      /* Start two rows above and two bytes to the left of src. */
5088  src -= ((2 * stride) + 2);
5089 
      /* Prime the five-row history for both halves (src0..4 and src7..11). */
5090  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5091  LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5092  src += (5 * stride);
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
5093  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094  XORI_B5_128_SB(src7, src8, src9, src10, src11);
5095 
5096  for (row = 16; row--;) {
      /* Newest row: src5 for the first half, src6 for the half at +8. */
5097  LD_SB2(src, 8, src5, src6);
5098  src += stride;
5099  XORI_B2_128_SB(src5, src6);
5100  dst0 = LD_UB(dst);
5101 
      /* Vertical pass over six rows, kept at 16-bit precision. */
5102  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5103  vt_res0, vt_res1);
5104  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5105  vt_res2, vt_res3);
      /* Gather the tap pairs for output positions 0..3 (mask0..2) and
       * 4..7 (mask3..5) of each half. */
5106  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5107  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5108  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5109  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5110  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5111  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5112  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5113  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Outer pair summed as-is; the inner pairs are then accumulated with
       * weights -5 and 20. */
5114  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5115  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5116  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5117  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5118  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5119  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5120  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5121  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5122  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5123  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values come from the mask2/mask5 shuffles (centre
       * pairs), rounded by 5 bits. */
5124  tmp0 = __msa_srari_h(shf_vec2, 5);
5125  tmp1 = __msa_srari_h(shf_vec5, 5);
5126  tmp2 = __msa_srari_h(shf_vec8, 5);
5127  tmp3 = __msa_srari_h(shf_vec11, 5);
5128  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Keep the odd halfwords: the second member of each centre pair
       * (vertical results at indices 3..10, one sample to the right of what
       * an even-halfword pack would select). */
5129  tmp0 = __msa_pckod_h(tmp2, tmp0);
5130  tmp1 = __msa_pckod_h(tmp3, tmp1);
      /* Narrow the 32-bit two-pass results to halfwords. */
5131  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5132  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5133  tmp1 = __msa_aver_s_h(tmp3, tmp1);
      /* Pack to bytes, average with the existing dst row, store. */
5134  out = PCKEV_XORI128_UB(tmp0, tmp1);
5135  out = __msa_aver_u_b(out, dst0);
5136  ST_UB(out, dst);
5137  dst += stride;
5138 
      /* Slide both five-row histories down by one row. */
5139  src0 = src1;
5140  src1 = src2;
5141  src2 = src3;
5142  src3 = src4;
5143  src4 = src5;
5144  src7 = src8;
5145  src8 = src9;
5146  src9 = src10;
5147  src10 = src11;
5148  src11 = src6;
5149  }
5150 }
5151 
5153  ptrdiff_t stride)
5154 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel8_mc12_msa -- confirm against the original file.
       *
       * Two output rows per loop iteration, four iterations (ST8x2_UB
       * stores). Per row: a vertical combination of six source rows
       * (AVC_CALC_DPADD_B_6PIX_2COEFF_SH) is followed by a horizontal
       * combination built from halfword shuffles; that result is averaged
       * with the rounded vertical-only values and finally with the bytes
       * already present in dst.
       */
5155  uint32_t row;
5156  uint64_t tp0, tp1;
5157  v16u8 out, dst0 = { 0 };
5158  v16i8 src0, src1, src2, src3, src4, src5, src6;
5159  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5160  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5161  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5162  v8i16 mask3, mask4, mask5;
5163  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing the samples that share a tap:
       * mask0 -> (i, i + 5), mask1 -> (i + 1, i + 4), mask2 -> (i + 2, i + 3)
       * for i = 0..3.
       */
5164  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5165  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5166  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5167  v8i16 minus5h = __msa_ldi_h(-5);
5168  v8i16 plus20h = __msa_ldi_h(20);
5169 
      /* Same pairings shifted by four positions, i.e. i = 4..7. */
5170  mask3 = mask0 + 4;
5171  mask4 = mask1 + 4;
5172  mask5 = mask2 + 4;
5173 
      /* Start two rows above and two bytes to the left of src. */
5174  src -= ((2 * stride) + 2);
5175 
      /* Prime the five-row history. */
5176  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5177  src += (5 * stride);
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
5178  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5179 
5180  for (row = 4; row--;) {
      /* Two new source rows per iteration. */
5181  LD_SB2(src, stride, src5, src6);
5182  src += (2 * stride);
5183  XORI_B2_128_SB(src5, src6);
5184 
      /* Vertical pass: vt_res0/1 use rows src0..src5 (first output row),
       * vt_res2/3 use rows src1..src6 (second output row). */
5185  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5186  vt_res0, vt_res1);
5187  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5188  vt_res2, vt_res3);
      /* Gather the tap pairs for output positions 0..3 (mask0..2) and
       * 4..7 (mask3..5) of each row. */
5189  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5190  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5191  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5192  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5193  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5194  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5195  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5196  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Outer pair summed as-is; the inner pairs are then accumulated with
       * weights -5 and 20. */
5197  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5198  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5199  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5200  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5201  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5202  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5203  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5204  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5205  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5206  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values come from the mask2/mask5 shuffles (centre
       * pairs), rounded by 5 bits. */
5207  tmp0 = __msa_srari_h(shf_vec2, 5);
5208  tmp1 = __msa_srari_h(shf_vec5, 5);
5209  tmp2 = __msa_srari_h(shf_vec8, 5);
5210  tmp3 = __msa_srari_h(shf_vec11, 5);
      /* Fetch the two existing 8-byte dst rows into one vector. */
5211  LD2(dst, stride, tp0, tp1);
5212  INSERT_D2_UB(tp0, tp1, dst0);
5213  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Keep the even halfwords: the first member of each centre pair
       * (vertical results at indices 2..9). */
5214  PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
      /* Narrow the 32-bit two-pass results to halfwords. */
5215  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5216  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5217  tmp1 = __msa_aver_s_h(tmp3, tmp1);
      /* Pack to bytes, average with the existing dst rows, store. */
5218  out = PCKEV_XORI128_UB(tmp0, tmp1);
5219  out = __msa_aver_u_b(out, dst0);
5220  ST8x2_UB(out, dst, stride);
5221  dst += (2 * stride);
5222 
      /* Slide the five-row history down by two rows. */
5223  src0 = src2;
5224  src1 = src3;
5225  src2 = src4;
5226  src3 = src5;
5227  src4 = src6;
5228  }
5229 }
5230 
5232  ptrdiff_t stride)
5233 {
      /*
       * NOTE(review): the first line of this definition (return type, name and
       * the parameters ahead of `stride`) is missing from this listing; `dst`
       * and `src` used below are presumably those parameters. Presumably this
       * is ff_avg_h264_qpel8_mc32_msa -- confirm against the original file.
       *
       * Two output rows per loop iteration, four iterations (ST8x2_UB
       * stores). Per row: a vertical combination of six source rows
       * (AVC_CALC_DPADD_B_6PIX_2COEFF_SH) is followed by a horizontal
       * combination built from halfword shuffles; that result is averaged
       * with the rounded vertical-only values taken from the odd shuffle
       * positions, and finally with the bytes already present in dst.
       */
5234  uint32_t row;
5235  uint64_t tp0, tp1;
5236  v16u8 out, dst0 = { 0 };
5237  v16i8 src0, src1, src2, src3, src4, src5, src6;
5238  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5239  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5240  v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5241  v8i16 mask3, mask4, mask5;
5242  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle indices pairing the samples that share a tap:
       * mask0 -> (i, i + 5), mask1 -> (i + 1, i + 4), mask2 -> (i + 2, i + 3)
       * for i = 0..3.
       */
5243  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5244  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5245  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5246  v8i16 minus5h = __msa_ldi_h(-5);
5247  v8i16 plus20h = __msa_ldi_h(20);
5248 
      /* Same pairings shifted by four positions, i.e. i = 4..7. */
5249  mask3 = mask0 + 4;
5250  mask4 = mask1 + 4;
5251  mask5 = mask2 + 4;
5252 
      /* Start two rows above and two bytes to the left of src. */
5253  src -= ((2 * stride) + 2);
5254 
      /* Prime the five-row history. */
5255  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5256  src += (5 * stride);
      /* XOR with 128: presumably moves the bytes into signed range -- confirm
       * against the macro definition. */
5257  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5258 
5259  for (row = 4; row--;) {
      /* Two new source rows per iteration. */
5260  LD_SB2(src, stride, src5, src6);
5261  src += (2 * stride);
5262  XORI_B2_128_SB(src5, src6);
5263 
      /* Vertical pass: vt_res0/1 use rows src0..src5 (first output row),
       * vt_res2/3 use rows src1..src6 (second output row). */
5264  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5265  vt_res0, vt_res1);
5266  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5267  vt_res2, vt_res3);
      /* Gather the tap pairs for output positions 0..3 (mask0..2) and
       * 4..7 (mask3..5) of each row. */
5268  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5269  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5270  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5271  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5272  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5273  mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5274  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5275  mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
      /* Outer pair summed as-is; the inner pairs are then accumulated with
       * weights -5 and 20. */
5276  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5277  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5278  hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5279  hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5280  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5281  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5282  DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5283  DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
      /* Two filter passes were applied, hence the rounding shift by 10. */
5284  SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5285  SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
      /* Vertical-only values come from the mask2/mask5 shuffles (centre
       * pairs), rounded by 5 bits. */
5286  tmp0 = __msa_srari_h(shf_vec2, 5);
5287  tmp1 = __msa_srari_h(shf_vec5, 5);
5288  tmp2 = __msa_srari_h(shf_vec8, 5);
5289  tmp3 = __msa_srari_h(shf_vec11, 5);
      /* Fetch the two existing 8-byte dst rows into one vector. */
5290  LD2(dst, stride, tp0, tp1);
5291  INSERT_D2_UB(tp0, tp1, dst0);
5292  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
      /* Keep the odd halfwords: the second member of each centre pair
       * (vertical results at indices 3..10, one sample to the right of what
       * an even-halfword pack would select). */
5293  tmp0 = __msa_pckod_h(tmp2, tmp0);
5294  tmp1 = __msa_pckod_h(tmp3, tmp1);
      /* Narrow the 32-bit two-pass results to halfwords. */
5295  PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5296  tmp0 = __msa_aver_s_h(tmp2, tmp0);
5297  tmp1 = __msa_aver_s_h(tmp3, tmp1);
      /* Pack to bytes, average with the existing dst rows, store. */
5298  out = PCKEV_XORI128_UB(tmp0, tmp1);
5299  out = __msa_aver_u_b(out, dst0);
5300  ST8x2_UB(out, dst, stride);
5301  dst += (2 * stride);
5302 
      /* Slide the five-row history down by two rows. */
5303  src0 = src2;
5304  src1 = src3;
5305  src2 = src4;
5306  src3 = src5;
5307  src4 = src6;
5308  }
5309 }
5310 
5312  ptrdiff_t stride)
5313 {
      /*
       * Produces one 4x4 block and averages it into dst.
       *
       * NOTE(review): the first line of this signature is not present in
       * this listing; the body uses `dst`, `src` and `stride`.
       *
       * Outline, as far as the visible code shows:
       *   1. load 9 source rows starting 2 rows above / 2 columns left of src;
       *   2. combine interleaved row pairs with the taps (1,-5) (20,20) (-5,1)
       *      via AVC_DOT_SH3_SH (presumably the vertical 6-tap filter);
       *   3. run a second 1,-5,20,20,-5,1 pass across the 16-bit intermediates;
       *   4. average that result with the intermediates gathered by mask2,
       *      rounded by 5 bits (even halfword lanes are kept);
       *   5. average with the 4x4 pixels already in dst and store.
       */
5314  uint32_t tp0, tp1, tp2, tp3;
      /* Each 16-bit constant packs two 8-bit taps: 1/-5, 20/20 and -5/1. */
5315  const int16_t filt_const0 = 0xfb01;
5316  const int16_t filt_const1 = 0x1414;
5317  const int16_t filt_const2 = 0x1fb;
5318  v16u8 out, dstv = { 0 };
5319  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5320  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5321  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5322  v16i8 src76_l, src87_l, filt0, filt1, filt2;
5323  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5324  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5325  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /*
       * Halfword shuffle patterns pairing elements (n, n+5), (n+1, n+4) and
       * (n+2, n+3): the outer, middle and inner tap pairs of a 6-tap window.
       */
5326  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5327  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5328  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5329  v8i16 minus5h = __msa_ldi_h(-5);
5330  v8i16 plus20h = __msa_ldi_h(20);
5331  v8i16 zeros = { 0 };
5332 
5333  filt0 = (v16i8) __msa_fill_h(filt_const0);
5334  filt1 = (v16i8) __msa_fill_h(filt_const1);
5335  filt2 = (v16i8) __msa_fill_h(filt_const2);
5336 
      /* Step back two rows and two columns to cover the filter support. */
5337  src -= ((2 * stride) + 2);
5338 
      /* Nine rows in total; the XORI_*_128 macros presumably flip the sign
       * bit so the bytes can be treated as signed. */
5339  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5340  src += (5 * stride);
5341  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5342  LD_SB4(src, stride, src5, src6, src7, src8);
5343  XORI_B4_128_SB(src5, src6, src7, src8);
5344 
      /* Byte-interleave each pair of consecutive rows (right and left halves). */
5345  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5346  src32_r, src43_r);
5347  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5348  src76_r, src87_r);
5349  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5350  src32_l, src43_l);
5351  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5352  src76_l, src87_l);
      /* Output rows 0 and 1: row-direction pass, then the second pass. */
5353  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5354  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5355  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5356  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5357  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5358  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5359  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5360  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
      /* Outer pair summed by hadd, then -5 * middle pair and 20 * inner pair
       * accumulated on top. */
5361  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5362  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5363  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5364  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5365 
      /* Output rows 2 and 3. shf_vec6/shf_vec7 receive the mask2 gather so
       * that shf_vec2/shf_vec5 from rows 0 and 1 stay intact for later. */
5366  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5367  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5368  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5369  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5370  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5371  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5372  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5373  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5374  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5375  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5376  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5377  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5378 
      /* Two-pass result: rounding shift by 10; SAT_* with 7 presumably
       * saturates to the signed 8-bit range. */
5379  SRARI_W2_SW(hz_res0, hz_res1, 10);
5380  SAT_SW2_SW(hz_res0, hz_res1, 7);
5381  SRARI_W2_SW(hz_res2, hz_res3, 10);
5382  SAT_SW2_SW(hz_res2, hz_res3, 7);
5383 
      /* Single-pass intermediates (the mask2 gathers): rounding shift by 5. */
5384  dst0 = __msa_srari_h(shf_vec2, 5);
5385  dst1 = __msa_srari_h(shf_vec5, 5);
5386  dst2 = __msa_srari_h(shf_vec6, 5);
5387  dst3 = __msa_srari_h(shf_vec7, 5);
5388 
5389  SAT_SH2_SH(dst0, dst1, 7);
5390  SAT_SH2_SH(dst2, dst3, 7);
      /* Presumably keeps the even halfword lanes, paired with zero, so each
       * value occupies one 32-bit lane alongside hz_res*. */
5391  ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5392  ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
5393 
      /* Average the two-pass result with the single-pass intermediate. */
5394  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5395  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5396  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5397  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5398 
      /* Fetch the four existing 4-byte rows of dst, average them with the
       * new pixels and write the 4x4 block back. */
5399  LW4(dst, stride, tp0, tp1, tp2, tp3);
5400  INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5401  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5402  out = PCKEV_XORI128_UB(dst0, dst2);
5403  out = __msa_aver_u_b(out, dstv);
5404  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
5405 }
5406 
5408  ptrdiff_t stride)
5409 {
      /*
       * Produces one 4x4 block and averages it into dst.
       *
       * NOTE(review): the first line of this signature is not present in
       * this listing; the body uses `dst`, `src` and `stride`.
       *
       * The visible steps: load 9 rows, apply AVC_DOT_SH3_SH to interleaved
       * row pairs (presumably the vertical 6-tap filter), run a second
       * 1,-5,20,20,-5,1 pass over the 16-bit intermediates, average that
       * with the rounded mask2-gathered intermediates taken from the ODD
       * halfword lanes (__msa_ilvod_h), then average with the pixels
       * already in dst and store.
       */
5410  uint32_t tp0, tp1, tp2, tp3;
      /* Each 16-bit constant packs two 8-bit taps: 1/-5, 20/20 and -5/1. */
5411  const int16_t filt_const0 = 0xfb01;
5412  const int16_t filt_const1 = 0x1414;
5413  const int16_t filt_const2 = 0x1fb;
5414  v16u8 out, dstv = { 0 };
5415  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5416  v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5417  v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5418  v16i8 src76_l, src87_l, filt0, filt1, filt2;
5419  v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5420  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5421  v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
      /* Shuffle patterns for the outer (n, n+5), middle (n+1, n+4) and
       * inner (n+2, n+3) tap pairs of each 6-tap window. */
5422  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5423  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5424  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5425  v8i16 minus5h = __msa_ldi_h(-5);
5426  v8i16 plus20h = __msa_ldi_h(20);
5427  v8i16 zeros = { 0 };
5428 
5429  filt0 = (v16i8) __msa_fill_h(filt_const0);
5430  filt1 = (v16i8) __msa_fill_h(filt_const1);
5431  filt2 = (v16i8) __msa_fill_h(filt_const2);
5432 
      /* Start two rows above and two columns left of the block. */
5433  src -= ((2 * stride) + 2);
5434 
      /* Nine rows are needed for four output rows of a 6-tap column filter. */
5435  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5436  src += (5 * stride);
5437  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5438  LD_SB4(src, stride, src5, src6, src7, src8);
5439  XORI_B4_128_SB(src5, src6, src7, src8);
5440 
      /* Byte-interleave consecutive rows so row pairs sit side by side. */
5441  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5442  src32_r, src43_r);
5443  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5444  src76_r, src87_r);
5445  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5446  src32_l, src43_l);
5447  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
5448  src76_l, src87_l);
      /* Output rows 0 and 1. */
5449  vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5450  vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5451  vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5452  vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5453  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5454  mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5455  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5456  mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
      /* sum(outer pair) + (-5) * middle pair + 20 * inner pair. */
5457  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5458  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5459  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5460  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5461 
      /* Output rows 2 and 3; the mask2 gathers go to shf_vec6/shf_vec7 so
       * the row 0/1 gathers in shf_vec2/shf_vec5 are preserved. */
5462  vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5463  vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5464  vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5465  vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5466  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5467  mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5468  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5469  mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5470  hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5471  DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5472  hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5473  DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
5474 
      /* Two filter passes -> rounding shift by 10, then SAT_* with 7
       * (presumably saturation to the signed 8-bit range). */
5475  SRARI_W2_SW(hz_res0, hz_res1, 10);
5476  SAT_SW2_SW(hz_res0, hz_res1, 7);
5477  SRARI_W2_SW(hz_res2, hz_res3, 10);
5478  SAT_SW2_SW(hz_res2, hz_res3, 7);
5479 
      /* Single-pass intermediates: rounding shift by 5. */
5480  dst0 = __msa_srari_h(shf_vec2, 5);
5481  dst1 = __msa_srari_h(shf_vec5, 5);
5482  dst2 = __msa_srari_h(shf_vec6, 5);
5483  dst3 = __msa_srari_h(shf_vec7, 5);
5484 
5485  SAT_SH2_SH(dst0, dst1, 7);
5486  SAT_SH2_SH(dst2, dst3, 7);
5487 
      /* Keep the odd halfword lanes, each paired with a zero halfword, so
       * every value lines up with one 32-bit lane of hz_res*. */
5488  dst0 = __msa_ilvod_h(zeros, dst0);
5489  dst1 = __msa_ilvod_h(zeros, dst1);
5490  dst2 = __msa_ilvod_h(zeros, dst2);
5491  dst3 = __msa_ilvod_h(zeros, dst3);
5492 
      /* Average the two-pass result with the single-pass intermediate. */
5493  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5494  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5495  hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5496  hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
5497 
      /* Blend with the four existing 4-byte rows of dst and store 4x4. */
5498  LW4(dst, stride, tp0, tp1, tp2, tp3);
5499  INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5500  PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5501  out = PCKEV_XORI128_UB(dst0, dst2);
5502  out = __msa_aver_u_b(out, dstv);
5503  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
5504 }
5504 }
5505 
5507  ptrdiff_t stride)
5508 {
      /*
       * Produces a 16x16 block and averages it into dst.
       *
       * NOTE(review): the first line of this signature is not present in
       * this listing; the body uses `dst`, `src` and `stride`.
       *
       * The block is processed as two 8-column halves. For each half the
       * rows are filtered along the row by AVC_HORZ_FILTER_SH, five
       * filtered rows are kept as history, and every loop iteration adds
       * four more rows, combines them down the column with the taps
       * (1,-5) (20,20) (-5,1) via AVC_DOT_SW3_SW, averages the result with
       * the pixels already in dst and stores an 8x4 tile.
       */
      /* Each 32-bit constant packs two 16-bit taps: 1/-5, 20/20 and -5/1. */
5509  const int32_t filt_const0 = 0xfffb0001;
5510  const int32_t filt_const1 = 0x140014;
5511  const int32_t filt_const2 = 0x1fffb;
      /* Origin of the current 8-column half, two rows up and two columns
       * left of the block so the filter support is covered. */
5512  const uint8_t *src_tmp = src - (2 * stride) - 2;
5513  uint8_t *dst_tmp = dst;
5514  uint64_t tp0, tp1, tp2, tp3;
5515  uint32_t multiple8_cnt, loop_cnt;
      /* dst0/dst1 are zero-initialised before being handed to INSERT_D2_UB,
       * so no indeterminate vector value is ever read; both 64-bit lanes
       * are overwritten before use, so the output is unchanged. */
5516  v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
5517  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5518  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5519  v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5520  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5521  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5522  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5523  v8i16 hz_out87_l, filt0, filt1, filt2;
5524  v4i32 tmp0, tmp1;
5525 
5526  filt0 = (v8i16) __msa_fill_w(filt_const0);
5527  filt1 = (v8i16) __msa_fill_w(filt_const1);
5528  filt2 = (v8i16) __msa_fill_w(filt_const2);
5529 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
5530  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5531 
      /* Two passes, one per 8-column half of the 16-wide block. */
5532  for (multiple8_cnt = 2; multiple8_cnt--;) {
5533  src = src_tmp;
5534  dst = dst_tmp;
5535 
      /* Prime the history with the first five filtered rows. */
5536  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5537  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5538  src += (5 * stride);
5539 
5540  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5541  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5542  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5543  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5544  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5545 
      /* Four iterations of four rows each = 16 output rows. */
5546  for (loop_cnt = 4; loop_cnt--;) {
5547  LD_SB4(src, stride, src0, src1, src2, src3);
5548  XORI_B4_128_SB(src0, src1, src2, src3);
5549  src += (4 * stride);
5550 
5551  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5552  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5553  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5554  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
      /* Halfword-interleave consecutive filtered rows (right/left halves)
       * so each 32-bit lane holds one pair of rows. */
5555  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5556  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5557  hz_out43_r);
5558  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5559  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5560  hz_out43_l);
5561  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5562  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5563  hz_out87_r);
5564  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5565  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
5566  hz_out87_l);
5567 
      /* One output row per res*: three row pairs in, right and left
       * halves packed back into eight halfwords. */
5568  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
5569  filt1, filt2);
5570  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
5571  filt1, filt2);
5572  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5573  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
5574  filt1, filt2);
5575  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
5576  filt1, filt2);
5577  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5578  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
5579  filt1, filt2);
5580  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
5581  filt1, filt2);
5582  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5583  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
5584  filt1, filt2);
5585  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
5586  filt1, filt2);
5587  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5588 
      /* Fetch four existing 8-byte rows of dst, average with the new
       * pixels and store the 8x4 tile. */
5589  LD4(dst, stride, tp0, tp1, tp2, tp3);
5590  INSERT_D2_UB(tp0, tp1, dst0);
5591  INSERT_D2_UB(tp2, tp3, dst1);
5592  out0 = PCKEV_XORI128_UB(res0, res1);
5593  out1 = PCKEV_XORI128_UB(res2, res3);
5594  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
5595  ST8x4_UB(out0, out1, dst, stride);
5596  dst += (4 * stride);
5597 
      /* Slide the five-row history down by four rows. */
5598  hz_out0 = hz_out4;
5599  hz_out1 = hz_out5;
5600  hz_out2 = hz_out6;
5601  hz_out3 = hz_out7;
5602  hz_out4 = hz_out8;
5603  }
5604 
      /* Advance to the right-hand 8-column half. */
5605  src_tmp += 8;
5606  dst_tmp += 8;
5607  }
5608 }
5609 
5611  ptrdiff_t stride)
5612 {
      /*
       * Produces an 8x8 block and averages it into dst.
       *
       * NOTE(review): the first line of this signature is not present in
       * this listing; the body uses `dst`, `src` and `stride`.
       *
       * Fully unrolled: 13 source rows (5 + 4 + 4) are filtered along the
       * row by AVC_HORZ_FILTER_SH, then combined down the column with the
       * taps (1,-5) (20,20) (-5,1) via AVC_DOT_SW3_SW. Two 8x4 tiles are
       * produced, each averaged with the pixels already in dst.
       */
      /* Each 32-bit constant packs two 16-bit taps: 1/-5, 20/20 and -5/1. */
5613  const int32_t filt_const0 = 0xfffb0001;
5614  const int32_t filt_const1 = 0x140014;
5615  const int32_t filt_const2 = 0x1fffb;
5616  uint64_t tp0, tp1, tp2, tp3;
5617  v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5618  v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5619  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5620  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
5621  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5622  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5623  v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5624  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5625  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5626  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
5627  v4i32 tmp0, tmp1;
5628 
5629  filt0 = (v8i16) __msa_fill_w(filt_const0);
5630  filt1 = (v8i16) __msa_fill_w(filt_const1);
5631  filt2 = (v8i16) __msa_fill_w(filt_const2);
5632 
      /* First three rows of luma_mask_arr: the "8 width cases" shuffles. */
5633  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
5634 
      /* Start two rows above and two columns left of the block. */
5635  src -= ((2 * stride) + 2);
5636  LD_SB5(src, stride, src0, src1, src2, src3, src4);
5637  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5638  src += (5 * stride);
5639 
      /* Row-direction filtering of source rows 0..4. */
5640  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5641  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5642  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5643  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5644  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
5645 
      /* Source rows 5..8, needed for output rows 0..3. */
5646  LD_SB4(src, stride, src0, src1, src2, src3);
5647  XORI_B4_128_SB(src0, src1, src2, src3);
5648  src += (4 * stride);
5649  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5650  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5651  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5652  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
      /* Halfword-interleave consecutive filtered rows (right/left halves). */
5653  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5655  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5656  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5657  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5659  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5660  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
5661 
      /* Output rows 0..3: one res* per row, right and left halves packed. */
5662  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5663  filt2);
5664  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5665  filt2);
5666  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5667  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5668  filt2);
5669  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5670  filt2);
5671  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5672  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5673  filt2);
5674  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5675  filt2);
5676  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5677  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5678  filt2);
5679  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5680  filt2);
5681  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /* Average with the first four existing 8-byte rows of dst and store. */
5682  LD4(dst, stride, tp0, tp1, tp2, tp3);
5683  INSERT_D2_UB(tp0, tp1, dst0);
5684  INSERT_D2_UB(tp2, tp3, dst1);
5685  out0 = PCKEV_XORI128_UB(res0, res1);
5686  out1 = PCKEV_XORI128_UB(res2, res3);
5687  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5688  ST8x4_UB(dst0, dst1, dst, stride);
5689  dst += (4 * stride);
5690 
      /* Source rows 9..12, needed for output rows 4..7. The row pairs
       * 5/4, 6/5, 7/6 and 8/7 computed above are reused. */
5691  LD_SB4(src, stride, src0, src1, src2, src3);
5692  XORI_B4_128_SB(src0, src1, src2, src3);
5693  hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5694  hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5695  hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5696  hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5697  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5698  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5699  hz_out1211_r);
5700  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5701  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
5702  hz_out1211_l);
      /* Output rows 4..7. */
5703  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5704  filt2);
5705  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5706  filt2);
5707  res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5708  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5709  filt2);
5710  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5711  filt2);
5712  res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5713  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5714  filt2);
5715  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5716  filt2);
5717  res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5718  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5719  filt2);
5720  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5721  filt2);
5722  res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
      /* Average with the last four existing rows of dst and store. */
5723  LD4(dst, stride, tp0, tp1, tp2, tp3);
5724  INSERT_D2_UB(tp0, tp1, dst0);
5725  INSERT_D2_UB(tp2, tp3, dst1);
5726  out0 = PCKEV_XORI128_UB(res0, res1);
5727  out1 = PCKEV_XORI128_UB(res2, res3);
5728  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5729  ST8x4_UB(dst0, dst1, dst, stride);
5730 }
5731 
5733  ptrdiff_t stride)
5734 {
5735  const int32_t filt_const0 = 0xfffb0001;
5736  const int32_t filt_const1 = 0x140014;
5737  const int32_t filt_const2 = 0x1fffb;
5738  uint32_t tp0, tp1, tp2, tp3;
5739  v16u8 res, dst0 = { 0 };
5740  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5741  v16i8 mask0, mask1, mask2;
5742  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5743  v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
5744  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5745  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
5746  v4i32 tmp0, tmp1;
5747 
5748  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
5749 
5750  filt0 = (v8i16) __msa_fill_w(filt_const0);
5751  filt1 = (v8i16) __msa_fill_w(filt_const1);
5752  filt2 = (v8i16) __msa_fill_w(filt_const2);
5753 
5754  src -= ((2 * stride) + 2);
5755 
5756