/*
 * APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)
 *
 * Horizontal filter over two source vectors, inp0 and inp1, producing one
 * packed byte vector in `out`.
 *
 * Data flow, as written below:
 *   - tmp0/tmp1 are derived from inp0/inp1 by VSHF_B2_UB using `mask`.
 *   - Pair 0 is the interleave of (inp1, inp0).
 *   - Pairs 1..3 are interleaves of __msa_sldi_b results: data0..data2 are
 *     taken from (inp0, tmp0) at offsets 15, 14, 13 and data3..data5 from
 *     (tmp1, inp1) at offsets 1, 2, 3.
 *   - sum0 = HADD(pair 0) multiplied lane-wise by coef0, then pair 2 is
 *     accumulated into it with coef2.
 *   - sum3 = HADD(pair 3), then pair 1 is accumulated into it with coef1.
 *   - The result is sum0 - sum3 (cast to v8i16), passed through
 *     SRARI_H2_SH(..., 5) and CLIP_SH2_0_255, and the right/left halves
 *     are packed with __msa_pckev_b.
 *
 * coef0 is cast to v8u16 and used with a plain vector multiply, whereas
 * coef1 and coef2 are handed to DPADD_UB2_UH; callers must supply them in
 * the matching vector types.
 *
 * NOTE(review): the VSHF/ILVRL/HADD/DPADD/SRARI/CLIP helpers are defined
 * elsewhere; the summary above takes their names at face value (shuffle,
 * interleave, pairwise add, dot-product accumulate, rounding shift,
 * clamp to 0..255) - confirm against their definitions.
 * NOTE(review): presumably the macro evaluates to `out` - confirm.
 */
24 #define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
26 v16u8 out, tmp0, tmp1; \
27 v16u8 data0, data1, data2, data3, data4, data5; \
29 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
30 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
32 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
33 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
34 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
35 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
36 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
37 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
38 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
39 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
40 sum0_r *= (v8u16) (coef0); \
41 sum0_l *= (v8u16) (coef0); \
42 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
43 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
44 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
45 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
46 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
47 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
48 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
49 res_r = (v8i16) (sum0_r - sum3_r); \
50 res_l = (v8i16) (sum0_l - sum3_l); \
51 SRARI_H2_SH(res_r, res_l, 5); \
52 CLIP_SH2_0_255(res_r, res_l); \
53 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0..mask3, coef0..coef2)
 *
 * Horizontal filter applied to two source vectors at once; the two
 * filtered results are packed together into `out`.
 *
 * Data flow, as written below:
 *   - Each of mask0..mask3 is applied to inp0 and to inp1 via VSHF_B2_UH,
 *     giving sum0..sum3 (from inp0) and sum4..sum7 (from inp1).
 *   - The mask0 results are combined with coef0 by DOTP_UB2_UH, and the
 *     mask2 results are accumulated on top with coef2.
 *   - The mask3 results go through HADD_UB2_UH, and the mask1 results are
 *     accumulated on top with coef1.
 *   - Per input: result = (mask0/mask2 sum) - (mask3/mask1 sum), then
 *     SRARI_H2_SH(..., 5), CLIP_SH2_0_255, and __msa_pckev_b(res1, res0).
 *
 * NOTE(review): the VSHF/HADD/DOTP/DPADD/SRARI/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 * NOTE(review): presumably the inp0 result ends up in the low 8 bytes of
 * `out` and the inp1 result in the high 8 bytes - confirm against the
 * PCKEV.B operand order.
 */
58 #define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
59 mask0, mask1, mask2, mask3, \
60 coef0, coef1, coef2) \
63 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
64 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
65 v8i16 res0_r, res1_r; \
67 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
68 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
69 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
70 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
71 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
72 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
73 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
74 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
75 res0_r = (v8i16) (sum0_r - sum3_r); \
76 res1_r = (v8i16) (sum4_r - sum7_r); \
77 SRARI_H2_SH(res0_r, res1_r, 5); \
78 CLIP_SH2_0_255(res0_r, res1_r); \
79 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0..mask3, coef0..coef2)
 *
 * Single-input form of the horizontal filter: only inp0 is read.
 *
 * Data flow, as written below:
 *   - inp0 is shuffled with mask0 and mask3 (one VSHF_B2_UH call) and with
 *     mask2 and mask1 (a second call), giving sum0, sum3, sum2, sum1.
 *   - sum0 = __msa_dotp_u_h(sum0, coef0); sum3 = __msa_hadd_u_h(sum3, sum3).
 *   - One DPADD_UB2_UH call accumulates sum2 with coef2 into sum0 and
 *     sum1 with coef1 into sum3.
 *   - result = sum0 - sum3, rounded with __msa_srari_h(..., 5), clamped by
 *     CLIP_SH_0_255, then packed with __msa_pckev_b using the same vector
 *     for both operands, so both halves of `out` carry the same bytes.
 *
 * NOTE(review): VSHF_B2_UH, DPADD_UB2_UH and CLIP_SH_0_255 are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
84 #define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
85 mask0, mask1, mask2, mask3, \
86 coef0, coef1, coef2) \
90 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
92 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
93 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
94 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
95 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
96 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
97 res0_r = (v8i16) (sum0_r - sum3_r); \
98 res0_r = __msa_srari_h(res0_r, 5); \
99 res0_r = CLIP_SH_0_255(res0_r); \
100 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW
 *
 * Single-input horizontal filter, "no round" variant: only inp0 is read.
 *
 * Data flow, as written below:
 *   - inp0 is shuffled with mask0 and mask3, then with mask2 and mask1,
 *     giving sum0, sum3, sum2, sum1.
 *   - sum0 = __msa_dotp_u_h(sum0, coef0); sum3 = __msa_hadd_u_h(sum3, sum3).
 *   - DPADD_UB2_UH accumulates sum2 with coef2 into sum0 and sum1 with
 *     coef1 into sum3.
 *   - result = sum0 - sum3, clamped by CLIP_SH_0_255 and packed with
 *     __msa_pckev_b using the same vector for both operands.
 *
 * TODO confirm: how the difference is biased and shifted before the clamp
 * in this no-round variant is not covered by this comment.
 * NOTE(review): VSHF_B2_UH, DPADD_UB2_UH and CLIP_SH_0_255 are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
105 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
106 mask2, mask3, coef0, \
111 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
113 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
114 sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
115 sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
116 VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
117 DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
118 res0_r = (v8i16) (sum0_r - sum3_r); \
121 res0_r = CLIP_SH_0_255(res0_r); \
122 out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, coef0, coef1, coef2)
 *
 * "No round" variant of the horizontal filter over two source vectors,
 * inp0 and inp1, producing one packed byte vector in `out`.
 *
 * Data flow, as written below:
 *   - tmp0/tmp1 are derived from inp0/inp1 by VSHF_B2_UB using `mask`.
 *   - Pair 0 is the interleave of (inp1, inp0); pairs 1..3 interleave
 *     __msa_sldi_b results taken from (inp0, tmp0) at offsets 15, 14, 13
 *     and from (tmp1, inp1) at offsets 1, 2, 3.
 *   - sum0 = HADD(pair 0) multiplied lane-wise by coef0, plus pair 2
 *     accumulated with coef2.
 *   - sum3 = HADD(pair 3), plus pair 1 accumulated with coef1.
 *   - result = sum0 - sum3 (cast to v8i16), clamped by CLIP_SH2_0_255 and
 *     packed with __msa_pckev_b(res_l, res_r).
 *
 * coef0 is cast to v8u16 and used with a plain vector multiply, whereas
 * coef1 and coef2 are handed to DPADD_UB2_UH.
 *
 * TODO confirm: how the difference is biased and shifted before the clamp
 * in this no-round variant is not covered by this comment.
 * NOTE(review): the VSHF/ILVRL/HADD/DPADD/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
127 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
128 coef0, coef1, coef2) \
130 v16u8 out, tmp0, tmp1; \
131 v16u8 data0, data1, data2, data3, data4, data5; \
132 v8i16 res_r, res_l; \
133 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
134 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
136 VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
137 ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
138 data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
139 data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
140 HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
141 ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
142 data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
143 data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
144 sum0_r *= (v8u16) (coef0); \
145 sum0_l *= (v8u16) (coef0); \
146 ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
147 data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
148 data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
149 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
150 ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
151 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
152 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
153 res_r = (v8i16) (sum0_r - sum3_r); \
154 res_l = (v8i16) (sum0_l - sum3_l); \
159 CLIP_SH2_0_255(res_r, res_l); \
160 out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0..mask3,
 *                                        coef0..coef2)
 *
 * "No round" variant of the horizontal filter applied to two source
 * vectors at once; the two results are packed together into `out`.
 *
 * Data flow, as written below:
 *   - Each of mask0..mask3 is applied to inp0 and to inp1 via VSHF_B2_UH,
 *     giving sum0..sum3 (from inp0) and sum4..sum7 (from inp1).
 *   - The mask0 results are combined with coef0 by DOTP_UB2_UH, and the
 *     mask2 results are accumulated on top with coef2.
 *   - The mask3 results go through HADD_UB2_UH, and the mask1 results are
 *     accumulated on top with coef1.
 *   - Per input: result = (mask0/mask2 sum) - (mask3/mask1 sum), clamped
 *     by CLIP_SH2_0_255 and packed with __msa_pckev_b(res1, res0).
 *
 * TODO confirm: how the difference is biased and shifted before the clamp
 * in this no-round variant is not covered by this comment.
 * NOTE(review): the VSHF/HADD/DOTP/DPADD/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
165 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
166 mask0, mask1, mask2, mask3, \
167 coef0, coef1, coef2) \
170 v8i16 res0_r, res1_r; \
171 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
172 v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
174 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
175 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
176 HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
177 DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
178 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
179 VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
180 DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
181 DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
182 res0_r = (v8i16) (sum0_r - sum3_r); \
183 res1_r = (v8i16) (sum4_r - sum7_r); \
188 CLIP_SH2_0_255(res0_r, res1_r); \
189 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
/*
 * APPLY_VERT_QPEL_FILTER(inp0..inp7, coef0, coef1, coef2)
 *
 * Vertical filter over eight source vectors, producing one packed byte
 * vector in `res`.
 *
 * Data flow, as written below:
 *   - Inputs are interleaved in four pairs with ILVRL_B2_UH:
 *     (inp4, inp0), (inp5, inp1), (inp6, inp2), (inp7, inp3).
 *   - sum0 = DOTP of the (inp4, inp0) pair with coef0, plus the
 *     (inp6, inp2) pair accumulated with coef2.
 *   - sum3 = HADD of the (inp7, inp3) pair, plus the (inp5, inp1) pair
 *     accumulated with coef1.
 *   - result = sum0 - sum3 (cast to v8i16) for the right and left halves,
 *     passed through SRARI_H2_SH(..., 5) and CLIP_SH2_0_255, then packed
 *     with __msa_pckev_b(res_l, res_r).
 *
 * NOTE(review): the ILVRL/DOTP/HADD/DPADD/SRARI/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names (interleave,
 * dot product, pairwise add, dot-product accumulate, rounding shift,
 * clamp to 0..255) - confirm against their definitions.
 * NOTE(review): presumably the macro evaluates to `res` - confirm.
 */
194 #define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
195 inp4, inp5, inp6, inp7, \
196 coef0, coef1, coef2) \
199 v8i16 res_r, res_l; \
200 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
201 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
203 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
204 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
205 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
206 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
207 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
208 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
209 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
210 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
211 res_r = (v8i16) (sum0_r - sum3_r); \
212 res_l = (v8i16) (sum0_l - sum3_l); \
213 SRARI_H2_SH(res_r, res_l, 5); \
214 CLIP_SH2_0_255(res_r, res_l); \
215 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_VERT_QPEL_FILTER_8BYTE(inp00..inp07, inp10..inp17,
 *                              coef0, coef1, coef2)
 *
 * Vertical filter evaluated for two independent sets of eight source
 * vectors (inp0x and inp1x); the two results are packed into `res`.
 *
 * Data flow, as written below:
 *   - For each set, inputs are interleaved in four pairs with ILVR_B4_UH:
 *     (x4, x0), (x5, x1), (x6, x2), (x7, x3).
 *   - sum00/sum10 = DOTP of the (x4, x0) pair with coef0, plus the
 *     (x6, x2) pair accumulated with coef2.
 *   - sum03/sum13 = HADD of the (x7, x3) pair, plus the (x5, x1) pair
 *     accumulated with coef1.
 *   - val0 = sum00 - sum03 and val1 = sum10 - sum13, passed through
 *     SRARI_H2_SH(..., 5) and CLIP_SH2_0_255, then packed with
 *     __msa_pckev_b(val1, val0).
 *
 * NOTE(review): only the ILVR (not ILVRL) form is used, so presumably just
 * the low 8 bytes of every input contribute - confirm against the helper.
 * NOTE(review): the ILVR/DOTP/HADD/DPADD/SRARI/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
220 #define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
221 inp04, inp05, inp06, inp07, \
222 inp10, inp11, inp12, inp13, \
223 inp14, inp15, inp16, inp17, \
224 coef0, coef1, coef2) \
228 v8u16 sum00, sum01, sum02, sum03; \
229 v8u16 sum10, sum11, sum12, sum13; \
231 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
232 sum00, sum10, sum03, sum13); \
233 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
234 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
235 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
236 sum02, sum12, sum01, sum11); \
237 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
238 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
239 val0 = (v8i16) (sum00 - sum03); \
240 val1 = (v8i16) (sum10 - sum13); \
241 SRARI_H2_SH(val0, val1, 5); \
242 CLIP_SH2_0_255(val0, val1); \
243 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
/*
 * APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0..inp7, coef0, coef1, coef2)
 *
 * "No round" variant of the vertical filter over eight source vectors,
 * producing one packed byte vector in `res`.
 *
 * Data flow, as written below:
 *   - Inputs are interleaved in four pairs with ILVRL_B2_UH:
 *     (inp4, inp0), (inp5, inp1), (inp6, inp2), (inp7, inp3).
 *   - sum0 = DOTP of the (inp4, inp0) pair with coef0, plus the
 *     (inp6, inp2) pair accumulated with coef2.
 *   - sum3 = HADD of the (inp7, inp3) pair, plus the (inp5, inp1) pair
 *     accumulated with coef1.
 *   - result = sum0 - sum3 (cast to v8i16) for the right and left halves,
 *     clamped by CLIP_SH2_0_255 and packed with
 *     __msa_pckev_b(res_l, res_r).
 *
 * TODO confirm: how the difference is biased and shifted before the clamp
 * in this no-round variant is not covered by this comment.
 * NOTE(review): the ILVRL/DOTP/HADD/DPADD/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
248 #define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
249 inp4, inp5, inp6, inp7, \
250 coef0, coef1, coef2) \
253 v8i16 res_r, res_l; \
254 v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
255 v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
257 ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
258 ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
259 DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
260 HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
261 ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
262 ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
263 DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
264 DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
265 res_r = (v8i16) (sum0_r - sum3_r); \
266 res_l = (v8i16) (sum0_l - sum3_l); \
271 CLIP_SH2_0_255(res_r, res_l); \
272 res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
/*
 * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00..inp07, inp10..inp17,
 *                                       coef0, coef1, coef2)
 *
 * "No round" variant of the vertical filter evaluated for two independent
 * sets of eight source vectors (inp0x and inp1x); the two results are
 * packed into `res`.
 *
 * Data flow, as written below:
 *   - For each set, inputs are interleaved in four pairs with ILVR_B4_UH:
 *     (x4, x0), (x5, x1), (x6, x2), (x7, x3).
 *   - sum00/sum10 = DOTP of the (x4, x0) pair with coef0, plus the
 *     (x6, x2) pair accumulated with coef2.
 *   - sum03/sum13 = HADD of the (x7, x3) pair, plus the (x5, x1) pair
 *     accumulated with coef1.
 *   - val0 = sum00 - sum03 and val1 = sum10 - sum13, clamped by
 *     CLIP_SH2_0_255 and packed with __msa_pckev_b(val1, val0).
 *
 * TODO confirm: how the difference is biased and shifted before the clamp
 * in this no-round variant is not covered by this comment.
 * NOTE(review): only the ILVR (not ILVRL) form is used, so presumably just
 * the low 8 bytes of every input contribute - confirm against the helper.
 * NOTE(review): the ILVR/DOTP/HADD/DPADD/CLIP helpers are defined
 * elsewhere; their behaviour is inferred from their names - confirm.
 */
277 #define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
278 inp04, inp05, inp06, inp07, \
279 inp10, inp11, inp12, inp13, \
280 inp14, inp15, inp16, inp17, \
281 coef0, coef1, coef2) \
285 v8u16 sum00, sum01, sum02, sum03; \
286 v8u16 sum10, sum11, sum12, sum13; \
288 ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
289 sum00, sum10, sum03, sum13); \
290 DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
291 HADD_UB2_UH(sum03, sum13, sum03, sum13); \
292 ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
293 sum02, sum12, sum01, sum11); \
294 DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
295 DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
296 val0 = (v8i16) (sum00 - sum03); \
297 val1 = (v8i16) (sum10 - sum13); \
302 CLIP_SH2_0_255(val0, val1); \
303 res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
315 v16u8 inp0, inp1, inp2, inp3;
317 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321 v16u8 const20 = (v16u8) __msa_ldi_b(20);
322 v16u8 const6 = (v16u8) __msa_ldi_b(6);
323 v16u8 const3 = (v16u8) __msa_ldi_b(3);
325 for (loop_count = (height >> 2); loop_count--;) {
326 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327 src += (4 * src_stride);
329 mask0, mask1, mask2, mask3,
330 const20, const6, const3);
332 mask0, mask1, mask2, mask3,
333 const20, const6, const3);
334 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
337 ST8x4_UB(res0, res1, dst, dst_stride);
338 dst += (4 * dst_stride);
349 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
351 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352 v16u8 const6 = (v16u8) __msa_ldi_b(6);
353 v16u8 const3 = (v16u8) __msa_ldi_b(3);
354 v8u16 const20 = (v8u16) __msa_ldi_h(20);
356 for (loop_count = (height >> 2); loop_count--;) {
357 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359 src += (4 * src_stride);
361 const20, const6, const3);
362 res = __msa_aver_u_b(inp0, res);
367 const20, const6, const3);
368 res = __msa_aver_u_b(inp2, res);
373 const20, const6, const3);
374 res = __msa_aver_u_b(inp4, res);
379 const20, const6, const3);
380 res = __msa_aver_u_b(inp6, res);
393 v16u8 inp0, inp1, inp2, inp3;
395 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399 v16u8 const20 = (v16u8) __msa_ldi_b(20);
400 v16u8 const6 = (v16u8) __msa_ldi_b(6);
401 v16u8 const3 = (v16u8) __msa_ldi_b(3);
403 for (loop_count = (height >> 2); loop_count--;) {
404 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405 src += (4 * src_stride);
407 mask0, mask1, mask2, mask3,
408 const20, const6, const3);
410 mask0, mask1, mask2, mask3,
411 const20, const6, const3);
412 ST8x4_UB(res0, res1, dst, dst_stride);
413 dst += (4 * dst_stride);
424 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
426 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427 v8u16 const20 = (v8u16) __msa_ldi_h(20);
428 v16u8 const6 = (v16u8) __msa_ldi_b(6);
429 v16u8 const3 = (v16u8) __msa_ldi_b(3);
431 for (loop_count = (height >> 2); loop_count--;) {
432 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434 src += (4 * src_stride);
436 const20, const6, const3);
441 const20, const6, const3);
446 const20, const6, const3);
451 const20, const6, const3);
464 v16u8 inp0, inp1, inp2, inp3;
466 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470 v16u8 const20 = (v16u8) __msa_ldi_b(20);
471 v16u8 const6 = (v16u8) __msa_ldi_b(6);
472 v16u8 const3 = (v16u8) __msa_ldi_b(3);
474 for (loop_count = (height >> 2); loop_count--;) {
475 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476 src += (4 * src_stride);
478 mask0, mask1, mask2, mask3,
479 const20, const6, const3);
481 mask0, mask1, mask2, mask3,
482 const20, const6, const3);
483 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
484 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
485 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
488 ST8x4_UB(res0, res1, dst, dst_stride);
489 dst += (4 * dst_stride);
500 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
502 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503 v8u16 const20 = (v8u16) __msa_ldi_h(20);
504 v16u8 const6 = (v16u8) __msa_ldi_b(6);
505 v16u8 const3 = (v16u8) __msa_ldi_b(3);
507 for (loop_count = (height >> 2); loop_count--;) {
508 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510 src += (4 * src_stride);
512 const20, const6, const3);
513 res = __msa_aver_u_b(res, inp1);
518 const20, const6, const3);
519 res = __msa_aver_u_b(res, inp3);
524 const20, const6, const3);
525 res = __msa_aver_u_b(res, inp5);
530 const20, const6, const3);
531 res = __msa_aver_u_b(res, inp7);
544 v16u8 inp0, inp1, inp2, inp3;
546 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550 v16u8 const20 = (v16u8) __msa_ldi_b(20);
551 v16u8 const6 = (v16u8) __msa_ldi_b(6);
552 v16u8 const3 = (v16u8) __msa_ldi_b(3);
554 for (loop_count = (height >> 2); loop_count--;) {
555 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556 src += (4 * src_stride);
558 mask2, mask3, const20,
561 mask2, mask3, const20,
563 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565 res0 = __msa_ave_u_b(inp0, res0);
566 res1 = __msa_ave_u_b(inp2, res1);
567 ST8x4_UB(res0, res1, dst, dst_stride);
568 dst += (4 * dst_stride);
579 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
581 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582 v8u16 const20 = (v8u16) __msa_ldi_h(20);
583 v16u8 const6 = (v16u8) __msa_ldi_b(6);
584 v16u8 const3 = (v16u8) __msa_ldi_b(3);
586 for (loop_count = (height >> 2); loop_count--;) {
587 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589 src += (4 * src_stride);
591 const20, const6, const3);
592 res = __msa_ave_u_b(inp0, res);
597 const20, const6, const3);
598 res = __msa_ave_u_b(inp2, res);
603 const20, const6, const3);
604 res = __msa_ave_u_b(inp4, res);
609 const20, const6, const3);
610 res = __msa_ave_u_b(inp6, res);
623 v16u8 inp0, inp1, inp2, inp3;
625 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629 v16u8 const20 = (v16u8) __msa_ldi_b(20);
630 v16u8 const6 = (v16u8) __msa_ldi_b(6);
631 v16u8 const3 = (v16u8) __msa_ldi_b(3);
633 for (loop_count = (height >> 2); loop_count--;) {
634 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635 src += (4 * src_stride);
637 mask2, mask3, const20,
640 mask2, mask3, const20,
642 ST8x4_UB(res0, res1, dst, dst_stride);
643 dst += (4 * dst_stride);
654 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
656 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657 v16u8 const6 = (v16u8) __msa_ldi_b(6);
658 v16u8 const3 = (v16u8) __msa_ldi_b(3);
659 v8u16 const20 = (v8u16) __msa_ldi_h(20);
661 for (loop_count = (height >> 2); loop_count--;) {
662 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664 src += (4 * src_stride);
666 const20, const6, const3);
671 const20, const6, const3);
676 const20, const6, const3);
681 const20, const6, const3);
694 v16u8 inp0, inp1, inp2, inp3;
696 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700 v16u8 const20 = (v16u8) __msa_ldi_b(20);
701 v16u8 const6 = (v16u8) __msa_ldi_b(6);
702 v16u8 const3 = (v16u8) __msa_ldi_b(3);
704 for (loop_count = (height >> 2); loop_count--;) {
705 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706 src += (4 * src_stride);
708 mask2, mask3, const20,
711 mask2, mask3, const20,
713 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
714 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
715 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717 res0 = __msa_ave_u_b(inp0, res0);
718 res1 = __msa_ave_u_b(inp2, res1);
719 ST8x4_UB(res0, res1, dst, dst_stride);
720 dst += (4 * dst_stride);
731 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
733 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734 v16u8 const6 = (v16u8) __msa_ldi_b(6);
735 v16u8 const3 = (v16u8) __msa_ldi_b(3);
736 v8u16 const20 = (v8u16) __msa_ldi_h(20);
738 for (loop_count = (height >> 2); loop_count--;) {
739 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741 src += (4 * src_stride);
743 const20, const6, const3);
744 res = __msa_ave_u_b(res, inp1);
749 const20, const6, const3);
750 res = __msa_ave_u_b(res, inp3);
755 const20, const6, const3);
756 res = __msa_ave_u_b(res, inp5);
761 const20, const6, const3);
762 res = __msa_ave_u_b(res, inp7);
775 v16u8 inp0, inp1, inp2, inp3;
776 v16u8 dst0, dst1, dst2, dst3;
778 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782 v16u8 const20 = (v16u8) __msa_ldi_b(20);
783 v16u8 const6 = (v16u8) __msa_ldi_b(6);
784 v16u8 const3 = (v16u8) __msa_ldi_b(3);
786 for (loop_count = (height >> 2); loop_count--;) {
787 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788 src += (4 * src_stride);
790 mask0, mask1, mask2, mask3,
791 const20, const6, const3);
793 mask0, mask1, mask2, mask3,
794 const20, const6, const3);
795 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
796 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
802 ST8x4_UB(res0, res1, dst, dst_stride);
803 dst += (4 * dst_stride);
814 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
817 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818 v16u8 const6 = (v16u8) __msa_ldi_b(6);
819 v16u8 const3 = (v16u8) __msa_ldi_b(3);
820 v8u16 const20 = (v8u16) __msa_ldi_h(20);
822 for (loop_count = (height >> 2); loop_count--;) {
823 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825 src += (4 * src_stride);
827 const20, const6, const3);
829 const20, const6, const3);
830 LD_UB2(dst, dst_stride, dst0, dst1);
833 ST_UB2(res0, res1, dst, dst_stride);
834 dst += (2 * dst_stride);
837 const20, const6, const3);
839 const20, const6, const3);
840 LD_UB2(dst, dst_stride, dst0, dst1);
843 ST_UB2(res0, res1, dst, dst_stride);
844 dst += (2 * dst_stride);
855 v16u8 inp0, inp1, inp2, inp3;
856 v16u8 dst0, dst1, dst2, dst3;
858 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862 v16u8 const20 = (v16u8) __msa_ldi_b(20);
863 v16u8 const6 = (v16u8) __msa_ldi_b(6);
864 v16u8 const3 = (v16u8) __msa_ldi_b(3);
866 for (loop_count = (height >> 2); loop_count--;) {
867 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868 src += (4 * src_stride);
870 mask0, mask1, mask2, mask3,
871 const20, const6, const3);
873 mask0, mask1, mask2, mask3,
874 const20, const6, const3);
875 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
879 ST8x4_UB(res0, res1, dst, dst_stride);
880 dst += (4 * dst_stride);
891 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
894 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895 v16u8 const6 = (v16u8) __msa_ldi_b(6);
896 v16u8 const3 = (v16u8) __msa_ldi_b(3);
897 v8u16 const20 = (v8u16) __msa_ldi_h(20);
899 for (loop_count = (height >> 2); loop_count--;) {
900 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902 src += (4 * src_stride);
904 const20, const6, const3);
906 const20, const6, const3);
907 LD_UB2(dst, dst_stride, dst0, dst1);
909 ST_UB2(res0, res1, dst, dst_stride);
910 dst += (2 * dst_stride);
913 const20, const6, const3);
915 const20, const6, const3);
916 LD_UB2(dst, dst_stride, dst0, dst1);
918 ST_UB2(res0, res1, dst, dst_stride);
919 dst += (2 * dst_stride);
930 v16u8 inp0, inp1, inp2, inp3;
931 v16u8 dst0, dst1, dst2, dst3;
933 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937 v16u8 const20 = (v16u8) __msa_ldi_b(20);
938 v16u8 const6 = (v16u8) __msa_ldi_b(6);
939 v16u8 const3 = (v16u8) __msa_ldi_b(3);
941 for (loop_count = (height >> 2); loop_count--;) {
942 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943 src += (4 * src_stride);
945 mask0, mask1, mask2, mask3,
946 const20, const6, const3);
948 mask0, mask1, mask2, mask3,
949 const20, const6, const3);
950 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
951 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
952 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
953 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
959 ST8x4_UB(res0, res1, dst, dst_stride);
960 dst += (4 * dst_stride);
971 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972 v16u8 res0, res1, dst0, dst1;
973 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974 v16u8 const6 = (v16u8) __msa_ldi_b(6);
975 v16u8 const3 = (v16u8) __msa_ldi_b(3);
976 v8u16 const20 = (v8u16) __msa_ldi_h(20);
978 for (loop_count = (height >> 2); loop_count--;) {
979 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981 src += (4 * src_stride);
983 const20, const6, const3);
985 const20, const6, const3);
986 LD_UB2(dst, dst_stride, dst0, dst1);
989 ST_UB2(res0, res1, dst, dst_stride);
990 dst += (2 * dst_stride);
992 const20, const6, const3);
994 const20, const6, const3);
995 LD_UB2(dst, dst_stride, dst0, dst1);
998 ST_UB2(res0, res1, dst, dst_stride);
999 dst += (2 * dst_stride);
1009 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010 v16u8 tmp0, tmp1, res0, res1;
1011 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1015 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016 src += (4 * src_stride);
1017 LD_UB2(src, src_stride, inp4, inp5);
1018 src += (2 * src_stride);
1020 inp1, inp2, inp3, inp4,
1021 inp1, inp0, inp0, inp1,
1022 inp2, inp3, inp4, inp5,
1023 const20, const6, const3);
1024 LD_UB2(src, src_stride, inp6, inp7);
1025 src += (2 * src_stride);
1027 inp3, inp4, inp5, inp6,
1028 inp3, inp2, inp1, inp0,
1029 inp4, inp5, inp6, inp7,
1030 const20, const6, const3);
1031 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1034 ST8x4_UB(res0, res1, dst, dst_stride);
1035 dst += (4 * dst_stride);
1039 inp5, inp6, inp7, inp8,
1040 inp5, inp4, inp3, inp2,
1041 inp6, inp7, inp8, inp8,
1042 const20, const6, const3);
1044 inp7, inp8, inp8, inp7,
1045 inp7, inp6, inp5, inp4,
1046 inp8, inp8, inp7, inp6,
1047 const20, const6, const3);
1048 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1049 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1051 ST8x4_UB(res0, res1, dst, dst_stride);
1052 dst += (4 * dst_stride);
1060 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1061 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1063 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1064 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1065 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1067 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1068 src += (5 * src_stride);
1070 inp1, inp2, inp3, inp4,
1071 const20, const6, const3);
1072 res0 = __msa_aver_u_b(res0, inp0);
1079 inp2, inp3, inp4, inp5,
1080 const20, const6, const3);
1081 res0 = __msa_aver_u_b(res0, inp1);
1088 inp3, inp4, inp5, inp6,
1089 const20, const6, const3);
1090 res0 = __msa_aver_u_b(res0, inp2);
1097 inp4, inp5, inp6, inp7,
1098 const20, const6, const3);
1099 res0 = __msa_aver_u_b(res0, inp3);
1103 LD_UB2(src, src_stride, inp8, inp9);
1104 src += (2 * src_stride);
1106 inp5, inp6, inp7, inp8,
1107 const20, const6, const3);
1108 res0 = __msa_aver_u_b(res0, inp4);
1113 inp6, inp7, inp8, inp9,
1114 const20, const6, const3);
1115 res0 = __msa_aver_u_b(res0, inp5);
1119 LD_UB2(src, src_stride, inp10, inp11);
1120 src += (2 * src_stride);
1122 inp7, inp8, inp9, inp10,
1123 const20, const6, const3);
1124 res0 = __msa_aver_u_b(res0, inp6);
1129 inp8, inp9, inp10, inp11,
1130 const20, const6, const3);
1131 res0 = __msa_aver_u_b(res0, inp7);
1135 LD_UB2(src, src_stride, inp12, inp13);
1136 src += (2 * src_stride);
1138 inp9, inp10, inp11, inp12,
1139 const20, const6, const3);
1140 res0 = __msa_aver_u_b(res0, inp8);
1145 inp10, inp11, inp12, inp13,
1146 const20, const6, const3);
1147 res0 = __msa_aver_u_b(res0, inp9);
1151 LD_UB2(src, src_stride, inp14, inp15);
1152 src += (2 * src_stride);
1154 inp11, inp12, inp13, inp14,
1155 const20, const6, const3);
1156 res0 = __msa_aver_u_b(res0, inp10);
1161 inp12, inp13, inp14, inp15,
1162 const20, const6, const3);
1163 res0 = __msa_aver_u_b(res0, inp11);
1169 inp13, inp14, inp15, inp16,
1170 const20, const6, const3);
1171 res0 = __msa_aver_u_b(res0, inp12);
1176 inp14, inp15, inp16, inp16,
1177 const20, const6, const3);
1178 res0 = __msa_aver_u_b(res0, inp13);
1183 inp15, inp16, inp16, inp15,
1184 const20, const6, const3);
1185 res0 = __msa_aver_u_b(res0, inp14);
1190 inp16, inp16, inp15, inp14,
1191 const20, const6, const3);
1192 res0 = __msa_aver_u_b(res0, inp15);
1201 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1203 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1204 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1205 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1207 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1208 src += (4 * src_stride);
1209 LD_UB2(src, src_stride, inp4, inp5);
1210 src += (2 * src_stride);
1212 inp1, inp2, inp3, inp4,
1213 inp1, inp0, inp0, inp1,
1214 inp2, inp3, inp4, inp5,
1215 const20, const6, const3);
1216 LD_UB2(src, src_stride, inp6, inp7);
1217 src += (2 * src_stride);
1219 inp3, inp4, inp5, inp6,
1220 inp3, inp2, inp1, inp0,
1221 inp4, inp5, inp6, inp7,
1222 const20, const6, const3);
1223 ST8x4_UB(res0, res1, dst, dst_stride);
1224 dst += (4 * dst_stride);
1228 inp5, inp6, inp7, inp8,
1229 inp5, inp4, inp3, inp2,
1230 inp6, inp7, inp8, inp8,
1231 const20, const6, const3);
1233 inp7, inp8, inp8, inp7,
1234 inp7, inp6, inp5, inp4,
1235 inp8, inp8, inp7, inp6,
1236 const20, const6, const3);
1237 ST8x4_UB(res0, res1, dst, dst_stride);
1238 dst += (4 * dst_stride);
1246 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1247 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1249 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1250 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1251 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1253 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1254 src += (4 * src_stride);
1258 inp1, inp2, inp3, inp4,
1259 const20, const6, const3);
1266 inp2, inp3, inp4, inp5,
1267 const20, const6, const3);
1274 inp3, inp4, inp5, inp6,
1275 const20, const6, const3);
1282 inp4, inp5, inp6, inp7,
1283 const20, const6, const3);
1290 inp5, inp6, inp7, inp8,
1291 const20, const6, const3);
1298 inp6, inp7, inp8, inp9,
1299 const20, const6, const3);
1306 inp7, inp8, inp9, inp10,
1307 const20, const6, const3);
1314 inp8, inp9, inp10, inp11,
1315 const20, const6, const3);
1322 inp9, inp10, inp11, inp12,
1323 const20, const6, const3);
1330 inp10, inp11, inp12, inp13,
1331 const20, const6, const3);
1338 inp11, inp12, inp13, inp14,
1339 const20, const6, const3);
1346 inp12, inp13, inp14, inp15,
1347 const20, const6, const3);
1353 inp13, inp14, inp15, inp16,
1354 const20, const6, const3);
1359 inp14, inp15, inp16, inp16,
1360 const20, const6, const3);
1365 inp15, inp16, inp16, inp15,
1366 const20, const6, const3);
1371 inp16, inp16, inp15, inp14,
1372 const20, const6, const3);
1382 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1383 v16u8 tmp0, tmp1, res0, res1;
1384 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1385 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1386 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1388 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1389 src += (4 * src_stride);
1390 LD_UB2(src, src_stride, inp4, inp5);
1391 src += (2 * src_stride);
1393 inp1, inp2, inp3, inp4,
1394 inp1, inp0, inp0, inp1,
1395 inp2, inp3, inp4, inp5,
1396 const20, const6, const3);
1398 LD_UB2(src, src_stride, inp6, inp7);
1399 src += (2 * src_stride);
1401 inp3, inp4, inp5, inp6,
1402 inp3, inp2, inp1, inp0,
1403 inp4, inp5, inp6, inp7,
1404 const20, const6, const3);
1405 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1406 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1408 ST8x4_UB(res0, res1, dst, dst_stride);
1409 dst += (4 * dst_stride);
1413 inp5, inp6, inp7, inp8,
1414 inp5, inp4, inp3, inp2,
1415 inp6, inp7, inp8, inp8,
1416 const20, const6, const3);
1418 inp7, inp8, inp8, inp7,
1419 inp7, inp6, inp5, inp4,
1420 inp8, inp8, inp7, inp6,
1421 const20, const6, const3);
1422 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1423 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1425 ST8x4_UB(res0, res1, dst, dst_stride);
1433 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1434 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1436 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1437 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1438 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1440 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1441 src += (4 * src_stride);
1445 inp1, inp2, inp3, inp4,
1446 const20, const6, const3);
1447 res0 = __msa_aver_u_b(res0, inp1);
1454 inp2, inp3, inp4, inp5,
1455 const20, const6, const3);
1456 res0 = __msa_aver_u_b(res0, inp2);
1463 inp3, inp4, inp5, inp6,
1464 const20, const6, const3);
1465 res0 = __msa_aver_u_b(res0, inp3);
1472 inp4, inp5, inp6, inp7,
1473 const20, const6, const3);
1474 res0 = __msa_aver_u_b(res0, inp4);
1481 inp5, inp6, inp7, inp8,
1482 const20, const6, const3);
1483 res0 = __msa_aver_u_b(res0, inp5);
1490 inp6, inp7, inp8, inp9,
1491 const20, const6, const3);
1492 res0 = __msa_aver_u_b(res0, inp6);
1499 inp7, inp8, inp9, inp10,
1500 const20, const6, const3);
1501 res0 = __msa_aver_u_b(res0, inp7);
1508 inp8, inp9, inp10, inp11,
1509 const20, const6, const3);
1510 res0 = __msa_aver_u_b(res0, inp8);
1517 inp9, inp10, inp11, inp12,
1518 const20, const6, const3);
1519 res0 = __msa_aver_u_b(res0, inp9);
1526 inp10, inp11, inp12, inp13,
1527 const20, const6, const3);
1528 res0 = __msa_aver_u_b(res0, inp10);
1535 inp11, inp12, inp13, inp14,
1536 const20, const6, const3);
1537 res0 = __msa_aver_u_b(res0, inp11);
1544 inp12, inp13, inp14, inp15,
1545 const20, const6, const3);
1546 res0 = __msa_aver_u_b(res0, inp12);
1552 inp13, inp14, inp15, inp16,
1553 const20, const6, const3);
1554 res0 = __msa_aver_u_b(res0, inp13);
1559 inp14, inp15, inp16, inp16,
1560 const20, const6, const3);
1561 res0 = __msa_aver_u_b(res0, inp14);
1566 inp15, inp16, inp16, inp15,
1567 const20, const6, const3);
1568 res0 = __msa_aver_u_b(res0, inp15);
1573 inp16, inp16, inp15, inp14,
1574 const20, const6, const3);
1575 res0 = __msa_aver_u_b(res0, inp16);
1584 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1585 v16u8 tmp0, tmp1, res0, res1;
1586 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1587 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1588 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1590 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1591 src += (4 * src_stride);
1592 LD_UB2(src, src_stride, inp4, inp5);
1593 src += (2 * src_stride);
1595 inp1, inp2, inp3, inp4,
1596 inp1, inp0, inp0, inp1,
1597 inp2, inp3, inp4, inp5,
1598 const20, const6, const3);
1599 LD_UB2(src, src_stride, inp6, inp7);
1600 src += (2 * src_stride);
1602 inp3, inp4, inp5, inp6,
1603 inp3, inp2, inp1, inp0,
1604 inp4, inp5, inp6, inp7,
1605 const20, const6, const3);
1606 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1607 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1608 res0 = __msa_ave_u_b(res0, tmp0);
1609 res1 = __msa_ave_u_b(res1, tmp1);
1610 ST8x4_UB(res0, res1, dst, dst_stride);
1611 dst += (4 * dst_stride);
1615 inp5, inp6, inp7, inp8,
1616 inp5, inp4, inp3, inp2,
1617 inp6, inp7, inp8, inp8,
1618 const20, const6, const3);
1620 inp7, inp8, inp8, inp7,
1621 inp7, inp6, inp5, inp4,
1622 inp8, inp8, inp7, inp6,
1623 const20, const6, const3);
1624 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1625 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1626 res0 = __msa_ave_u_b(res0, tmp0);
1627 res1 = __msa_ave_u_b(res1, tmp1);
1628 ST8x4_UB(res0, res1, dst, dst_stride);
1629 dst += (4 * dst_stride);
1637 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1638 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1640 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1641 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1642 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1644 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1645 src += (5 * src_stride);
1647 inp1, inp2, inp3, inp4,
1648 const20, const6, const3);
1649 res0 = __msa_ave_u_b(res0, inp0);
1656 inp2, inp3, inp4, inp5,
1657 const20, const6, const3);
1658 res0 = __msa_ave_u_b(res0, inp1);
1665 inp3, inp4, inp5, inp6,
1666 const20, const6, const3);
1667 res0 = __msa_ave_u_b(res0, inp2);
1674 inp4, inp5, inp6, inp7,
1675 const20, const6, const3);
1676 res0 = __msa_ave_u_b(res0, inp3);
1683 inp5, inp6, inp7, inp8,
1684 const20, const6, const3);
1685 res0 = __msa_ave_u_b(res0, inp4);
1692 inp6, inp7, inp8, inp9,
1693 const20, const6, const3);
1694 res0 = __msa_ave_u_b(res0, inp5);
1701 inp7, inp8, inp9, inp10,
1702 const20, const6, const3);
1703 res0 = __msa_ave_u_b(res0, inp6);
1710 inp8, inp9, inp10, inp11,
1711 const20, const6, const3);
1712 res0 = __msa_ave_u_b(res0, inp7);
1719 inp9, inp10, inp11, inp12,
1720 const20, const6, const3);
1721 res0 = __msa_ave_u_b(res0, inp8);
1728 inp10, inp11, inp12, inp13,
1729 const20, const6, const3);
1730 res0 = __msa_ave_u_b(res0, inp9);
1737 inp11, inp12, inp13, inp14,
1738 const20, const6, const3);
1739 res0 = __msa_ave_u_b(res0, inp10);
1746 inp12, inp13, inp14, inp15,
1747 const20, const6, const3);
1748 res0 = __msa_ave_u_b(res0, inp11);
1754 inp13, inp14, inp15, inp16,
1755 const20, const6, const3);
1756 res0 = __msa_ave_u_b(res0, inp12);
1761 inp14, inp15, inp16, inp16,
1762 const20, const6, const3);
1763 res0 = __msa_ave_u_b(res0, inp13);
1768 inp15, inp16, inp16, inp15,
1769 const20, const6, const3);
1770 res0 = __msa_ave_u_b(res0, inp14);
1775 inp16, inp16, inp15, inp14,
1776 const20, const6, const3);
1777 res0 = __msa_ave_u_b(res0, inp15);
1787 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1789 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1790 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1791 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1793 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1794 src += (4 * src_stride);
1795 LD_UB2(src, src_stride, inp4, inp5);
1796 src += (2 * src_stride);
1798 inp1, inp2, inp3, inp4,
1799 inp1, inp0, inp0, inp1,
1800 inp2, inp3, inp4, inp5,
1801 const20, const6, const3);
1802 LD_UB2(src, src_stride, inp6, inp7);
1803 src += (2 * src_stride);
1805 inp3, inp4, inp5, inp6,
1806 inp3, inp2, inp1, inp0,
1807 inp4, inp5, inp6, inp7,
1808 const20, const6, const3);
1809 ST8x4_UB(res0, res1, dst, dst_stride);
1810 dst += (4 * dst_stride);
1814 inp5, inp6, inp7, inp8,
1815 inp5, inp4, inp3, inp2,
1816 inp6, inp7, inp8, inp8,
1817 const20, const6, const3);
1819 inp7, inp8, inp8, inp7,
1820 inp7, inp6, inp5, inp4,
1821 inp8, inp8, inp7, inp6,
1822 const20, const6, const3);
1823 ST8x4_UB(res0, res1, dst, dst_stride);
1824 dst += (4 * dst_stride);
1832 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1833 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1835 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1836 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1837 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1839 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1840 src += (5 * src_stride);
1842 inp1, inp2, inp3, inp4,
1843 const20, const6, const3);
1850 inp2, inp3, inp4, inp5,
1851 const20, const6, const3);
1858 inp3, inp4, inp5, inp6,
1859 const20, const6, const3);
1866 inp4, inp5, inp6, inp7,
1867 const20, const6, const3);
1874 inp5, inp6, inp7, inp8,
1875 const20, const6, const3);
1882 inp6, inp7, inp8, inp9,
1883 const20, const6, const3);
1890 inp7, inp8, inp9, inp10,
1891 const20, const6, const3);
1898 inp8, inp9, inp10, inp11,
1899 const20, const6, const3);
1906 inp9, inp10, inp11, inp12,
1907 const20, const6, const3);
1914 inp10, inp11, inp12, inp13,
1915 const20, const6, const3);
1922 inp11, inp12, inp13, inp14,
1923 const20, const6, const3);
1930 inp12, inp13, inp14, inp15,
1931 const20, const6, const3);
1937 inp13, inp14, inp15, inp16,
1938 const20, const6, const3);
1943 inp14, inp15, inp16, inp16,
1944 const20, const6, const3);
1949 inp15, inp16, inp16, inp15,
1950 const20, const6, const3);
1955 inp16, inp16, inp15, inp14,
1956 const20, const6, const3);
1965 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1966 v16u8 tmp0, tmp1, res0, res1;
1967 v16u8 const20 = (v16u8) __msa_ldi_b(20);
1968 v16u8 const6 = (v16u8) __msa_ldi_b(6);
1969 v16u8 const3 = (v16u8) __msa_ldi_b(3);
1971 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1972 src += (4 * src_stride);
1973 LD_UB2(src, src_stride, inp4, inp5);
1974 src += (2 * src_stride);
1976 inp1, inp2, inp3, inp4,
1977 inp1, inp0, inp0, inp1,
1978 inp2, inp3, inp4, inp5,
1979 const20, const6, const3);
1980 LD_UB2(src, src_stride, inp6, inp7);
1981 src += (2 * src_stride);
1983 inp3, inp4, inp5, inp6,
1984 inp3, inp2, inp1, inp0,
1985 inp4, inp5, inp6, inp7,
1986 const20, const6, const3);
1987 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1988 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1989 res0 = __msa_ave_u_b(res0, tmp0);
1990 res1 = __msa_ave_u_b(res1, tmp1);
1991 ST8x4_UB(res0, res1, dst, dst_stride);
1992 dst += (4 * dst_stride);
1996 inp5, inp6, inp7, inp8,
1997 inp5, inp4, inp3, inp2,
1998 inp6, inp7, inp8, inp8,
1999 const20, const6, const3);
2001 inp7, inp8, inp8, inp7,
2002 inp7, inp6, inp5, inp4,
2003 inp8, inp8, inp7, inp6,
2004 const20, const6, const3);
2005 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2006 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2007 res0 = __msa_ave_u_b(res0, tmp0);
2008 res1 = __msa_ave_u_b(res1, tmp1);
2009 ST8x4_UB(res0, res1, dst, dst_stride);
2017 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2018 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2020 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2021 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2022 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2024 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2025 src += (5 * src_stride);
2027 inp1, inp2, inp3, inp4,
2028 const20, const6, const3);
2029 res0 = __msa_ave_u_b(res0, inp1);
2036 inp2, inp3, inp4, inp5,
2037 const20, const6, const3);
2038 res0 = __msa_ave_u_b(res0, inp2);
2045 inp3, inp4, inp5, inp6,
2046 const20, const6, const3);
2047 res0 = __msa_ave_u_b(res0, inp3);
2054 inp4, inp5, inp6, inp7,
2055 const20, const6, const3);
2056 res0 = __msa_ave_u_b(res0, inp4);
2063 inp5, inp6, inp7, inp8,
2064 const20, const6, const3);
2065 res0 = __msa_ave_u_b(res0, inp5);
2072 inp6, inp7, inp8, inp9,
2073 const20, const6, const3);
2074 res0 = __msa_ave_u_b(res0, inp6);
2081 inp7, inp8, inp9, inp10,
2082 const20, const6, const3);
2083 res0 = __msa_ave_u_b(res0, inp7);
2090 inp8, inp9, inp10, inp11,
2091 const20, const6, const3);
2092 res0 = __msa_ave_u_b(res0, inp8);
2099 inp9, inp10, inp11, inp12,
2100 const20, const6, const3);
2101 res0 = __msa_ave_u_b(res0, inp9);
2108 inp10, inp11, inp12, inp13,
2109 const20, const6, const3);
2110 res0 = __msa_ave_u_b(res0, inp10);
2117 inp11, inp12, inp13, inp14,
2118 const20, const6, const3);
2119 res0 = __msa_ave_u_b(res0, inp11);
2126 inp12, inp13, inp14, inp15,
2127 const20, const6, const3);
2128 res0 = __msa_ave_u_b(res0, inp12);
2134 inp13, inp14, inp15, inp16,
2135 const20, const6, const3);
2136 res0 = __msa_ave_u_b(res0, inp13);
2141 inp14, inp15, inp16, inp16,
2142 const20, const6, const3);
2143 res0 = __msa_ave_u_b(res0, inp14);
2148 inp15, inp16, inp16, inp15,
2149 const20, const6, const3);
2150 res0 = __msa_ave_u_b(res0, inp15);
2155 inp16, inp16, inp15, inp14,
2156 const20, const6, const3);
2157 res0 = __msa_ave_u_b(res0, inp16);
2166 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2167 v16u8 dst0, dst1, dst2, dst3;
2168 v16u8 tmp0, tmp1, res0, res1;
2169 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2170 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2171 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2173 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2174 src += (4 * src_stride);
2175 LD_UB2(src, src_stride, inp4, inp5);
2176 src += (2 * src_stride);
2178 inp1, inp2, inp3, inp4,
2179 inp1, inp0, inp0, inp1,
2180 inp2, inp3, inp4, inp5,
2181 const20, const6, const3);
2183 LD_UB2(src, src_stride, inp6, inp7);
2184 src += (2 * src_stride);
2186 inp3, inp4, inp5, inp6,
2187 inp3, inp2, inp1, inp0,
2188 inp4, inp5, inp6, inp7,
2189 const20, const6, const3);
2191 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2192 tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2193 tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2194 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2195 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2198 ST8x4_UB(res0, res1, dst, dst_stride);
2199 dst += (4 * dst_stride);
2203 inp5, inp6, inp7, inp8,
2204 inp5, inp4, inp3, inp2,
2205 inp6, inp7, inp8, inp8,
2206 const20, const6, const3);
2208 inp7, inp8, inp8, inp7,
2209 inp7, inp6, inp5, inp4,
2210 inp8, inp8, inp7, inp6,
2211 const20, const6, const3);
2213 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2214 tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2215 tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2216 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2217 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2220 ST8x4_UB(res0, res1, dst, dst_stride);
2228 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2229 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2230 v16u8 res0, res1, dst0, dst1;
2231 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2232 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2233 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2235 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2236 src += (5 * src_stride);
2238 inp1, inp2, inp3, inp4,
2239 const20, const6, const3);
2244 inp2, inp3, inp4, inp5,
2245 const20, const6, const3);
2247 LD_UB2(dst, dst_stride, dst0, dst1);
2250 ST_UB2(res0, res1, dst, dst_stride);
2251 dst += (2 * dst_stride);
2256 inp3, inp4, inp5, inp6,
2257 const20, const6, const3);
2262 inp4, inp5, inp6, inp7,
2263 const20, const6, const3);
2265 LD_UB2(dst, dst_stride, dst0, dst1);
2268 ST_UB2(res0, res1, dst, dst_stride);
2269 dst += (2 * dst_stride);
2271 LD_UB2(src, src_stride, inp8, inp9);
2272 src += (2 * src_stride);
2274 inp5, inp6, inp7, inp8,
2275 const20, const6, const3);
2277 inp6, inp7, inp8, inp9,
2278 const20, const6, const3);
2280 LD_UB2(dst, dst_stride, dst0, dst1);
2283 ST_UB2(res0, res1, dst, dst_stride);
2284 dst += (2 * dst_stride);
2286 LD_UB2(src, src_stride, inp10, inp11);
2287 src += (2 * src_stride);
2289 inp7, inp8, inp9, inp10,
2290 const20, const6, const3);
2292 inp8, inp9, inp10, inp11,
2293 const20, const6, const3);
2295 LD_UB2(dst, dst_stride, dst0, dst1);
2298 ST_UB2(res0, res1, dst, dst_stride);
2299 dst += (2 * dst_stride);
2301 LD_UB2(src, src_stride, inp12, inp13);
2302 src += (2 * src_stride);
2304 inp9, inp10, inp11, inp12,
2305 const20, const6, const3);
2307 inp10, inp11, inp12, inp13,
2308 const20, const6, const3);
2309 LD_UB2(dst, dst_stride, dst0, dst1);
2312 ST_UB2(res0, res1, dst, dst_stride);
2313 dst += (2 * dst_stride);
2315 LD_UB2(src, src_stride, inp14, inp15);
2316 src += (2 * src_stride);
2318 inp11, inp12, inp13, inp14,
2319 const20, const6, const3);
2321 inp12, inp13, inp14, inp15,
2322 const20, const6, const3);
2324 LD_UB2(dst, dst_stride, dst0, dst1);
2325 AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2327 ST_UB2(res0, res1, dst, dst_stride);
2328 dst += (2 * dst_stride);
2332 inp13, inp14, inp15, inp16,
2333 const20, const6, const3);
2335 inp14, inp15, inp16, inp16,
2336 const20, const6, const3);
2337 LD_UB2(dst, dst_stride, dst0, dst1);
2338 AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2340 ST_UB2(res0, res1, dst, dst_stride);
2341 dst += (2 * dst_stride);
2344 inp15, inp16, inp16, inp15,
2345 const20, const6, const3);
2347 inp16, inp16, inp15, inp14,
2348 const20, const6, const3);
2349 LD_UB2(dst, dst_stride, dst0, dst1);
2350 AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2352 ST_UB2(res0, res1, dst, dst_stride);
2360 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2361 v16u8 dst0, dst1, dst2, dst3;
2363 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2364 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2365 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2367 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2368 src += (4 * src_stride);
2369 LD_UB2(src, src_stride, inp4, inp5);
2370 src += (2 * src_stride);
2372 inp1, inp2, inp3, inp4,
2373 inp1, inp0, inp0, inp1,
2374 inp2, inp3, inp4, inp5,
2375 const20, const6, const3);
2376 LD_UB2(src, src_stride, inp6, inp7);
2377 src += (2 * src_stride);
2379 inp3, inp4, inp5, inp6,
2380 inp3, inp2, inp1, inp0,
2381 inp4, inp5, inp6, inp7,
2382 const20, const6, const3);
2383 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2384 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2385 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2387 ST8x4_UB(res0, res1, dst, dst_stride);
2388 dst += (4 * dst_stride);
2392 inp5, inp6, inp7, inp8,
2393 inp5, inp4, inp3, inp2,
2394 inp6, inp7, inp8, inp8,
2395 const20, const6, const3);
2397 inp7, inp8, inp8, inp7,
2398 inp7, inp6, inp5, inp4,
2399 inp8, inp8, inp7, inp6,
2400 const20, const6, const3);
2401 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2402 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2403 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2405 ST8x4_UB(res0, res1, dst, dst_stride);
2406 dst += (4 * dst_stride);
2414 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2415 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2416 v16u8 res0, res1, dst0, dst1;
2417 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2418 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2419 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2421 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2422 src += (5 * src_stride);
2424 inp1, inp2, inp3, inp4,
2425 const20, const6, const3);
2429 inp2, inp3, inp4, inp5,
2430 const20, const6, const3);
2431 LD_UB2(dst, dst_stride, dst0, dst1);
2433 ST_UB2(res0, res1, dst, dst_stride);
2434 dst += (2 * dst_stride);
2439 inp3, inp4, inp5, inp6,
2440 const20, const6, const3);
2444 inp4, inp5, inp6, inp7,
2445 const20, const6, const3);
2446 LD_UB2(dst, dst_stride, dst0, dst1);
2448 ST_UB2(res0, res1, dst, dst_stride);
2449 dst += (2 * dst_stride);
2454 inp5, inp6, inp7, inp8,
2455 const20, const6, const3);
2459 inp6, inp7, inp8, inp9,
2460 const20, const6, const3);
2461 LD_UB2(dst, dst_stride, dst0, dst1);
2463 ST_UB2(res0, res1, dst, dst_stride);
2464 dst += (2 * dst_stride);
2469 inp7, inp8, inp9, inp10,
2470 const20, const6, const3);
2474 inp8, inp9, inp10, inp11,
2475 const20, const6, const3);
2476 LD_UB2(dst, dst_stride, dst0, dst1);
2478 ST_UB2(res0, res1, dst, dst_stride);
2479 dst += (2 * dst_stride);
2484 inp9, inp10, inp11, inp12,
2485 const20, const6, const3);
2489 inp10, inp11, inp12, inp13,
2490 const20, const6, const3);
2491 LD_UB2(dst, dst_stride, dst0, dst1);
2493 ST_UB2(res0, res1, dst, dst_stride);
2494 dst += (2 * dst_stride);
2499 inp11, inp12, inp13, inp14,
2500 const20, const6, const3);
2504 inp12, inp13, inp14, inp15,
2505 const20, const6, const3);
2506 LD_UB2(dst, dst_stride, dst0, dst1);
2508 ST_UB2(res0, res1, dst, dst_stride);
2509 dst += (2 * dst_stride);
2513 inp13, inp14, inp15, inp16,
2514 const20, const6, const3);
2516 inp14, inp15, inp16, inp16,
2517 const20, const6, const3);
2518 LD_UB2(dst, dst_stride, dst0, dst1);
2520 ST_UB2(res0, res1, dst, dst_stride);
2521 dst += (2 * dst_stride);
2524 inp15, inp16, inp16, inp15,
2525 const20, const6, const3);
2527 inp16, inp16, inp15, inp14,
2528 const20, const6, const3);
2529 LD_UB2(dst, dst_stride, dst0, dst1);
2531 ST_UB2(res0, res1, dst, dst_stride);
2539 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2540 v16u8 dst0, dst1, dst2, dst3;
2541 v16u8 tmp0, tmp1, res0, res1;
2542 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2543 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2544 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2546 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2547 src += (4 * src_stride);
2548 LD_UB2(src, src_stride, inp4, inp5);
2549 src += (2 * src_stride);
2551 inp1, inp2, inp3, inp4,
2552 inp1, inp0, inp0, inp1,
2553 inp2, inp3, inp4, inp5,
2554 const20, const6, const3);
2555 LD_UB2(src, src_stride, inp6, inp7);
2556 src += (2 * src_stride);
2558 inp3, inp4, inp5, inp6,
2559 inp3, inp2, inp1, inp0,
2560 inp4, inp5, inp6, inp7,
2561 const20, const6, const3);
2562 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2563 tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2564 tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
2565 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2566 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2569 ST8x4_UB(res0, res1, dst, dst_stride);
2570 dst += (4 * dst_stride);
2574 inp5, inp6, inp7, inp8,
2575 inp5, inp4, inp3, inp2,
2576 inp6, inp7, inp8, inp8,
2577 const20, const6, const3);
2579 inp7, inp8, inp8, inp7,
2580 inp7, inp6, inp5, inp4,
2581 inp8, inp8, inp7, inp6,
2582 const20, const6, const3);
2583 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2584 tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2585 tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2586 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2587 dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2590 ST8x4_UB(res0, res1, dst, dst_stride);
2598 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2599 v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2600 v16u8 res0, res1, dst0, dst1;
2601 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2602 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2603 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2605 LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2606 src += (5 * src_stride);
2608 inp1, inp2, inp3, inp4,
2609 const20, const6, const3);
2613 inp2, inp3, inp4, inp5,
2614 const20, const6, const3);
2615 LD_UB2(dst, dst_stride, dst0, dst1);
2618 ST_UB2(res0, res1, dst, dst_stride);
2619 dst += (2 * dst_stride);
2624 inp3, inp4, inp5, inp6,
2625 const20, const6, const3);
2629 inp4, inp5, inp6, inp7,
2630 const20, const6, const3);
2631 LD_UB2(dst, dst_stride, dst0, dst1);
2634 ST_UB2(res0, res1, dst, dst_stride);
2635 dst += (2 * dst_stride);
2640 inp5, inp6, inp7, inp8,
2641 const20, const6, const3);
2645 inp6, inp7, inp8, inp9,
2646 const20, const6, const3);
2647 LD_UB2(dst, dst_stride, dst0, dst1);
2650 ST_UB2(res0, res1, dst, dst_stride);
2651 dst += (2 * dst_stride);
2656 inp7, inp8, inp9, inp10,
2657 const20, const6, const3);
2661 inp8, inp9, inp10, inp11,
2662 const20, const6, const3);
2663 LD_UB2(dst, dst_stride, dst0, dst1);
2666 ST_UB2(res0, res1, dst, dst_stride);
2667 dst += (2 * dst_stride);
2672 inp9, inp10, inp11, inp12,
2673 const20, const6, const3);
2677 inp10, inp11, inp12, inp13,
2678 const20, const6, const3);
2679 LD_UB2(dst, dst_stride, dst0, dst1);
2682 ST_UB2(res0, res1, dst, dst_stride);
2683 dst += (2 * dst_stride);
2688 inp11, inp12, inp13, inp14,
2689 const20, const6, const3);
2693 inp12, inp13, inp14, inp15,
2694 const20, const6, const3);
2695 LD_UB2(dst, dst_stride, dst0, dst1);
2696 AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2698 ST_UB2(res0, res1, dst, dst_stride);
2699 dst += (2 * dst_stride);
2703 inp13, inp14, inp15, inp16,
2704 const20, const6, const3);
2706 inp14, inp15, inp16, inp16,
2707 const20, const6, const3);
2708 LD_UB2(dst, dst_stride, dst0, dst1);
2709 AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2711 ST_UB2(res0, res1, dst, dst_stride);
2712 dst += (2 * dst_stride);
2715 inp15, inp16, inp16, inp15,
2716 const20, const6, const3);
2718 inp16, inp16, inp15, inp14,
2719 const20, const6, const3);
2720 LD_UB2(dst, dst_stride, dst0, dst1);
2721 AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2723 ST_UB2(res0, res1, dst, dst_stride);
2733 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2735 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2736 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2737 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2738 v8u16 const20 = (v8u16) __msa_ldi_h(20);
2740 for (loop_count = (height >> 2); loop_count--;) {
2741 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2742 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2743 src += (4 * src_stride);
2745 const20, const6, const3);
2746 res = __msa_ave_u_b(inp0, res);
2751 const20, const6, const3);
2752 res = __msa_ave_u_b(inp2, res);
2757 const20, const6, const3);
2758 res = __msa_ave_u_b(inp4, res);
2763 const20, const6, const3);
2764 res = __msa_ave_u_b(inp6, res);
2769 LD_UB2(src, 1, inp0, inp1);
2771 const20, const6, const3);
2772 res = __msa_ave_u_b(inp0, res);
2792 v16u8 inp0, inp1, inp2, inp3;
2793 v16u8 res0, res1, avg0, avg1;
2794 v16u8 horiz0, horiz1, horiz2, horiz3;
2795 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2796 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2797 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2798 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2799 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2800 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2801 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2802 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2804 LD_UB2(src, src_stride, inp0, inp1);
2805 src += (2 * src_stride);
2807 mask2, mask3, const20,
2809 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2810 horiz0 = __msa_ave_u_b(inp0, res0);
2811 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2812 LD_UB2(src, src_stride, inp2, inp3);
2813 src += (2 * src_stride);
2815 mask2, mask3, const20,
2817 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2818 horiz2 = __msa_ave_u_b(inp2, res1);
2819 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2820 LD_UB2(src, src_stride, inp0, inp1);
2821 src += (2 * src_stride);
2823 mask2, mask3, const20,
2825 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2826 horiz4 = __msa_ave_u_b(inp0, res0);
2827 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2829 horiz1, horiz2, horiz3, horiz4,
2830 horiz1, horiz0, horiz0, horiz1,
2831 horiz2, horiz3, horiz4, horiz5,
2832 const20, const6, const3);
2833 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2834 res0 = __msa_ave_u_b(avg0, res0);
2836 dst += (2 * dst_stride);
2838 LD_UB2(src, src_stride, inp2, inp3);
2839 src += (2 * src_stride);
2841 mask2, mask3, const20,
2843 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2844 horiz6 = __msa_ave_u_b(inp2, res1);
2845 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2848 mask2, mask3, const20,
2850 horiz8 = __msa_ave_u_b(inp0, res0);
2852 horiz3, horiz4, horiz5, horiz6,
2853 horiz3, horiz2, horiz1, horiz0,
2854 horiz4, horiz5, horiz6, horiz7,
2855 const20, const6, const3);
2856 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2857 res1 = __msa_ave_u_b(avg1, res1);
2859 horiz5, horiz6, horiz7, horiz8,
2860 horiz5, horiz4, horiz3, horiz2,
2861 horiz6, horiz7, horiz8, horiz8,
2862 const20, const6, const3);
2864 dst += 2 * dst_stride;
2866 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
2867 res0 = __msa_ave_u_b(avg0, res0);
2869 horiz7, horiz8, horiz8, horiz7,
2870 horiz7, horiz6, horiz5, horiz4,
2871 horiz8, horiz8, horiz7, horiz6,
2872 const20, const6, const3);
2874 dst += 2 * dst_stride;
2876 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
2877 res1 = __msa_ave_u_b(avg1, res1);
2888 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2890 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2891 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2892 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2893 v8u16 const20 = (v8u16) __msa_ldi_h(20);
2895 for (loop_count = (height >> 2); loop_count--;) {
2896 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2897 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2898 src += (4 * src_stride);
2900 const20, const6, const3);
2905 const20, const6, const3);
2910 const20, const6, const3);
2915 const20, const6, const3);
2920 LD_UB2(src, 1, inp0, inp1);
2922 const20, const6, const3);
2942 v16u8 inp0, inp1, inp2, inp3;
2943 v16u8 res0, res1, avg0, avg1;
2944 v16u8 horiz0, horiz1, horiz2, horiz3;
2945 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2946 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2947 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2948 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2949 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2950 v16u8 const20 = (v16u8) __msa_ldi_b(20);
2951 v16u8 const6 = (v16u8) __msa_ldi_b(6);
2952 v16u8 const3 = (v16u8) __msa_ldi_b(3);
2954 LD_UB2(src, src_stride, inp0, inp1);
2955 src += (2 * src_stride);
2957 mask2, mask3, const20,
2959 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2961 LD_UB2(src, src_stride, inp2, inp3);
2962 src += (2 * src_stride);
2964 mask2, mask3, const20,
2966 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2967 LD_UB2(src, src_stride, inp0, inp1);
2968 src += (2 * src_stride);
2970 mask2, mask3, const20,
2972 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2974 horiz1, horiz2, horiz3, horiz4,
2975 horiz1, horiz0, horiz0, horiz1,
2976 horiz2, horiz3, horiz4, horiz5,
2977 const20, const6, const3);
2978 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2979 res0 = __msa_ave_u_b(avg0, res0);
2981 dst += (2 * dst_stride);
2983 LD_UB2(src, src_stride, inp2, inp3);
2984 src += (2 * src_stride);
2986 mask2, mask3, const20,
2988 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2991 mask2, mask3, const20,
2994 horiz3, horiz4, horiz5, horiz6,
2995 horiz3, horiz2, horiz1, horiz0,
2996 horiz4, horiz5, horiz6, horiz7,
2997 const20, const6, const3);
2998 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2999 res1 = __msa_ave_u_b(avg1, res1);
3000 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3001 res0 = __msa_ave_u_b(avg0, res0);
3003 dst += (2 * dst_stride);
3006 horiz5, horiz6, horiz7, horiz8,
3007 horiz5, horiz4, horiz3, horiz2,
3008 horiz6, horiz7, horiz8, horiz8,
3009 const20, const6, const3);
3010 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3011 res0 = __msa_ave_u_b(avg0, res0);
3013 dst += (2 * dst_stride);
3016 horiz7, horiz8, horiz8, horiz7,
3017 horiz7, horiz6, horiz5, horiz4,
3018 horiz8, horiz8, horiz7, horiz6,
3019 const20, const6, const3);
3020 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3021 res1 = __msa_ave_u_b(avg1, res1);
3032 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3034 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3035 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3036 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3037 v8u16 const20 = (v8u16) __msa_ldi_h(20);
3039 for (loop_count = (height >> 2); loop_count--;) {
3040 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3041 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3042 src += (4 * src_stride);
3044 const20, const6, const3);
3045 res = __msa_ave_u_b(res, inp1);
3050 const20, const6, const3);
3051 res = __msa_ave_u_b(res, inp3);
3056 const20, const6, const3);
3057 res = __msa_ave_u_b(res, inp5);
3062 const20, const6, const3);
3063 res = __msa_ave_u_b(res, inp7);
3068 LD_UB2(src, 1, inp0, inp1);
3070 const20, const6, const3);
3071 res = __msa_ave_u_b(inp1, res);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3091 v16u8 inp0, inp1, inp2, inp3;
3092 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3093 v16u8 horiz0, horiz1, horiz2, horiz3;
3094 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3095 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3096 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3097 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3098 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3099 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3100 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3101 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3103 LD_UB2(src, src_stride, inp0, inp1);
3104 src += (2 * src_stride);
/* Middle of a horizontal-filter macro call; its head and tail are not shown. */
3106 mask2, mask3, const20,
/*
 * SLDI_B2_UB is defined outside this excerpt; with a count of 1 it presumably
 * moves each row by one byte so lane i holds source byte i + 1 - TODO confirm.
 */
3108 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack both rows: doubleword 1 of inp0 is replaced by doubleword 0 of inp1. */
3110 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
/* Truncating unsigned byte average, (a + b) >> 1, of source bytes and res0. */
3111 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3112 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
3113 LD_UB2(src, src_stride, inp2, inp3);
3114 src += (2 * src_stride);
3116 mask2, mask3, const20,
3118 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3120 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3121 horiz2 = __msa_ave_u_b(inp2, res1);
3122 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
3123 LD_UB2(src, src_stride, inp0, inp1);
3124 src += (2 * src_stride);
3126 mask2, mask3, const20,
3128 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3130 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3131 horiz4 = __msa_ave_u_b(inp0, res0);
3132 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3134 horiz1, horiz2, horiz3, horiz4,
3135 horiz1, horiz0, horiz0, horiz1,
3136 horiz2, horiz3, horiz4, horiz5,
3137 const20, const6, const3);
/* avg0 = low doubleword of horiz0 (low half) and of horiz1 (high half). */
3138 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
/* Truncating average of rows 0/1 with res0. */
3139 res0 = __msa_ave_u_b(avg0, res0);
/* Advance dst by two rows; the store that precedes this is not shown. */
3141 dst += (2 * dst_stride);
/* Rows 6 and 7. */
3143 LD_UB2(src, src_stride, inp2, inp3);
3144 src += (2 * src_stride);
3146 mask2, mask3, const20,
3148 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3150 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3151 horiz6 = __msa_ave_u_b(inp2, res1);
3152 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Row 8 (its load is not shown): single row, so no packing is needed. */
3155 mask2, mask3, const20,
/* Rotate inp0 by one byte: lane i receives the previous lane i + 1. */
3157 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3158 horiz8 = __msa_ave_u_b(inp0, res0);
/* Vertical filter producing res1; averaged with rows 2/3. */
3160 horiz3, horiz4, horiz5, horiz6,
3161 horiz3, horiz2, horiz1, horiz0,
3162 horiz4, horiz5, horiz6, horiz7,
3163 const20, const6, const3);
3164 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3165 res1 = __msa_ave_u_b(avg1, res1);
3167 dst += (2 * dst_stride);
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
3170 horiz5, horiz6, horiz7, horiz8,
3171 horiz5, horiz4, horiz3, horiz2,
3172 horiz6, horiz7, horiz8, horiz8,
3173 const20, const6, const3);
/* Averaged with rows 4/5. */
3174 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3175 res0 = __msa_ave_u_b(avg0, res0);
3177 dst += (2 * dst_stride);
/* Last vertical filter call; averaged with rows 6/7. */
3180 horiz7, horiz8, horiz8, horiz7,
3181 horiz7, horiz6, horiz5, horiz4,
3182 horiz8, horiz8, horiz7, horiz6,
3183 const20, const6, const3);
3184 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3185 res1 = __msa_ave_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores, some declarations and the first line of every filter-macro call;
 * the comments describe only the statements that are visible here.
 */
3205 v16u8 inp0, inp1, inp2, inp3;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3207 v16u8 horiz0, horiz1, horiz2, horiz3;
3208 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3209 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3210 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3211 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3212 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3213 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3214 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3215 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3217 LD_UB2(src, src_stride, inp0, inp1);
3218 src += (2 * src_stride);
/* Middle of a horizontal-filter macro call; its head and tail are not shown. */
3220 mask2, mask3, const20,
/* Pack the low doublewords: inp0's into the low half, inp1's into the high. */
3222 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
/* Truncating unsigned byte average, (a + b) >> 1, of source bytes and res0. */
3223 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3224 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
3225 LD_UB2(src, src_stride, inp2, inp3);
3226 src += (2 * src_stride);
3228 mask2, mask3, const20,
3230 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3231 horiz2 = __msa_ave_u_b(inp2, res1);
3232 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
3233 LD_UB2(src, src_stride, inp0, inp1);
3234 src += (2 * src_stride);
3236 mask2, mask3, const20,
3238 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3239 horiz4 = __msa_ave_u_b(inp0, res0);
3240 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3242 horiz1, horiz2, horiz3, horiz4,
3243 horiz1, horiz0, horiz0, horiz1,
3244 horiz2, horiz3, horiz4, horiz5,
3245 const20, const6, const3);
/* Rows 6 and 7. */
3247 LD_UB2(src, src_stride, inp2, inp3);
3248 src += (2 * src_stride);
/* Advance dst by two rows; the store that precedes this is not shown. */
3250 dst += 2 * dst_stride;
3253 mask2, mask3, const20,
3255 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3256 horiz6 = __msa_ave_u_b(inp2, res1);
3257 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Row 8 (its load is not shown): averaged directly, no packing step. */
3260 mask2, mask3, const20,
3262 horiz8 = __msa_ave_u_b(inp0, res0);
/* Vertical filter calls for the remaining output rows; no averaging with
 * the horizN rows follows them in this fragment. */
3264 horiz3, horiz4, horiz5, horiz6,
3265 horiz3, horiz2, horiz1, horiz0,
3266 horiz4, horiz5, horiz6, horiz7,
3267 const20, const6, const3);
/* horiz8 repeats at the bottom edge of the argument lists below. */
3269 horiz5, horiz6, horiz7, horiz8,
3270 horiz5, horiz4, horiz3, horiz2,
3271 horiz6, horiz7, horiz8, horiz8,
3272 const20, const6, const3);
/* Two consecutive two-row advances; the stores between them are not shown. */
3274 dst += 2 * dst_stride;
3277 dst += (2 * dst_stride);
3280 horiz7, horiz8, horiz8, horiz7,
3281 horiz7, horiz6, horiz5, horiz4,
3282 horiz8, horiz8, horiz7, horiz6,
3283 const20, const6, const3);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores, some declarations and the first line of every filter-macro call;
 * the comments describe only the statements that are visible here.
 */
3303 v16u8 inp0, inp1, inp2, inp3;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3305 v16u8 horiz0, horiz1, horiz2, horiz3;
3306 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3307 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3308 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3309 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3310 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3311 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3312 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3313 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3315 LD_UB2(src, src_stride, inp0, inp1);
3316 src += (2 * src_stride);
/*
 * Middle of a horizontal-filter macro call (head and tail not shown);
 * presumably it assigns horiz0 directly, since no averaging step follows.
 */
3318 mask2, mask3, const20,
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3320 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3. */
3321 LD_UB2(src, src_stride, inp2, inp3);
3322 src += (2 * src_stride);
3324 mask2, mask3, const20,
3326 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5. */
3327 LD_UB2(src, src_stride, inp0, inp1);
3328 src += (2 * src_stride);
3330 mask2, mask3, const20,
3332 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3334 horiz1, horiz2, horiz3, horiz4,
3335 horiz1, horiz0, horiz0, horiz1,
3336 horiz2, horiz3, horiz4, horiz5,
3337 const20, const6, const3);
/* Rows 6 and 7. */
3338 LD_UB2(src, src_stride, inp2, inp3);
3339 src += (2 * src_stride);
/* Advance dst by two rows; the store that precedes this is not shown. */
3341 dst += 2 * dst_stride;
3344 mask2, mask3, const20,
3346 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Row 8: only this fragment of its horizontal-filter call is visible. */
3349 mask2, mask3, const20,
/* Vertical filter calls for the remaining output rows. */
3352 horiz3, horiz4, horiz5, horiz6,
3353 horiz3, horiz2, horiz1, horiz0,
3354 horiz4, horiz5, horiz6, horiz7,
3355 const20, const6, const3);
/* horiz8 repeats at the bottom edge of the argument lists below. */
3357 horiz5, horiz6, horiz7, horiz8,
3358 horiz5, horiz4, horiz3, horiz2,
3359 horiz6, horiz7, horiz8, horiz8,
3360 const20, const6, const3);
3362 dst += 2 * dst_stride;
3366 horiz7, horiz8, horiz8, horiz7,
3367 horiz7, horiz6, horiz5, horiz4,
3368 horiz8, horiz8, horiz7, horiz6,
3369 const20, const6, const3);
3371 dst += 2 * dst_stride;
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores, some declarations and the first line of every filter-macro call;
 * the comments describe only the statements that are visible here.
 */
3391 v16u8 inp0, inp1, inp2, inp3;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3393 v16u8 horiz0, horiz1, horiz2, horiz3;
3394 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3395 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3396 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3397 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3398 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3399 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3400 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3401 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3403 LD_UB2(src, src_stride, inp0, inp1);
3404 src += (2 * src_stride);
/* Middle of a horizontal-filter macro call; its head and tail are not shown. */
3406 mask2, mask3, const20,
/*
 * SLDI_B2_UB is defined outside this excerpt; with a count of 1 it presumably
 * moves each row by one byte so lane i holds source byte i + 1 - TODO confirm.
 */
3408 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack both rows: doubleword 1 of inp0 is replaced by doubleword 0 of inp1. */
3410 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
/* Truncating unsigned byte average, (a + b) >> 1, of source bytes and res0. */
3411 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3412 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
3413 LD_UB2(src, src_stride, inp2, inp3);
3414 src += (2 * src_stride);
3416 mask2, mask3, const20,
3418 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3420 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3421 horiz2 = __msa_ave_u_b(inp2, res1);
3422 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
3423 LD_UB2(src, src_stride, inp0, inp1);
3424 src += (2 * src_stride);
3426 mask2, mask3, const20,
3428 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3430 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3431 horiz4 = __msa_ave_u_b(inp0, res0);
3432 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3434 horiz1, horiz2, horiz3, horiz4,
3435 horiz1, horiz0, horiz0, horiz1,
3436 horiz2, horiz3, horiz4, horiz5,
3437 const20, const6, const3);
/* Rows 6 and 7. */
3438 LD_UB2(src, src_stride, inp2, inp3);
3439 src += (2 * src_stride);
/* Advance dst by two rows; the store that precedes this is not shown. */
3441 dst += 2 * dst_stride;
3444 mask2, mask3, const20,
3446 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3448 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3449 horiz6 = __msa_ave_u_b(inp2, res1);
3450 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Row 8 (its load is not shown): single row, so no packing is needed. */
3453 mask2, mask3, const20,
/* Rotate inp0 by one byte: lane i receives the previous lane i + 1. */
3455 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3456 horiz8 = __msa_ave_u_b(inp0, res0);
/* Vertical filter calls for the remaining output rows; no averaging with
 * the horizN rows follows them in this fragment. */
3458 horiz3, horiz4, horiz5, horiz6,
3459 horiz3, horiz2, horiz1, horiz0,
3460 horiz4, horiz5, horiz6, horiz7,
3461 const20, const6, const3);
/* horiz8 repeats at the bottom edge of the argument lists below. */
3463 horiz5, horiz6, horiz7, horiz8,
3464 horiz5, horiz4, horiz3, horiz2,
3465 horiz6, horiz7, horiz8, horiz8,
3466 const20, const6, const3);
3468 dst += 2 * dst_stride;
3471 horiz7, horiz8, horiz8, horiz7,
3472 horiz7, horiz6, horiz5, horiz4,
3473 horiz8, horiz8, horiz7, horiz6,
3474 const20, const6, const3);
3476 dst += 2 * dst_stride;
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3496 v16u8 inp0, inp1, inp2, inp3;
3497 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3498 v16u8 horiz0, horiz1, horiz2, horiz3;
3499 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3500 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3501 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3502 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3503 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3504 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3505 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3506 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3508 LD_UB2(src, src_stride, inp0, inp1);
3509 src += (2 * src_stride);
/* Middle of a horizontal-filter macro call; its head and tail are not shown. */
3511 mask2, mask3, const20,
/* Pack the low doublewords: inp0's into the low half, inp1's into the high. */
3513 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
/* Truncating unsigned byte average, (a + b) >> 1, of source bytes and res0. */
3514 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3515 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
3516 LD_UB2(src, src_stride, inp2, inp3);
3517 src += (2 * src_stride);
3519 mask2, mask3, const20,
3521 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3522 horiz2 = __msa_ave_u_b(inp2, res1);
3523 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
3524 LD_UB2(src, src_stride, inp0, inp1);
3525 src += (2 * src_stride);
3527 mask2, mask3, const20,
3529 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3530 horiz4 = __msa_ave_u_b(inp0, res0);
3531 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3533 horiz1, horiz2, horiz3, horiz4,
3534 horiz1, horiz0, horiz0, horiz1,
3535 horiz2, horiz3, horiz4, horiz5,
3536 const20, const6, const3);
/* avg0 = low doubleword of horiz1 (low half) and of horiz2 (high half),
 * i.e. the rows one below the two rows being produced. */
3537 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3538 res0 = __msa_ave_u_b(avg0, res0);
/* Advance dst by two rows; the store that precedes this is not shown. */
3540 dst += (2 * dst_stride);
/* Rows 6 and 7. */
3542 LD_UB2(src, src_stride, inp2, inp3);
3543 src += (2 * src_stride);
3545 mask2, mask3, const20,
3547 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3548 horiz6 = __msa_ave_u_b(inp2, res1);
3549 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Row 8 (its load is not shown): averaged directly, no packing step. */
3552 mask2, mask3, const20,
3554 horiz8 = __msa_ave_u_b(inp0, res0);
/* Vertical filter producing res1; averaged with rows 3/4. */
3556 horiz3, horiz4, horiz5, horiz6,
3557 horiz3, horiz2, horiz1, horiz0,
3558 horiz4, horiz5, horiz6, horiz7,
3559 const20, const6, const3);
3560 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3561 res1 = __msa_ave_u_b(avg1, res1);
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
3563 horiz5, horiz6, horiz7, horiz8,
3564 horiz5, horiz4, horiz3, horiz2,
3565 horiz6, horiz7, horiz8, horiz8,
3566 const20, const6, const3);
3568 dst += 2 * dst_stride;
/* Averaged with rows 5/6. */
3570 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3571 res0 = __msa_ave_u_b(avg0, res0);
/* Last vertical filter call; averaged with rows 7/8. */
3574 horiz7, horiz8, horiz8, horiz7,
3575 horiz7, horiz6, horiz5, horiz4,
3576 horiz8, horiz8, horiz7, horiz6,
3577 const20, const6, const3);
3579 dst += 2 * dst_stride;
3581 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3582 res1 = __msa_ave_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3602 v16u8 inp0, inp1, inp2, inp3;
3603 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3604 v16u8 horiz0, horiz1, horiz2, horiz3;
3605 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3606 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3607 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3608 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3609 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3610 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3611 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3612 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3614 LD_UB2(src, src_stride, inp0, inp1);
3615 src += (2 * src_stride);
/*
 * Middle of a horizontal-filter macro call (head and tail not shown);
 * presumably it assigns horiz0 directly, since no averaging step follows.
 */
3617 mask2, mask3, const20,
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3619 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3. */
3620 LD_UB2(src, src_stride, inp2, inp3);
3621 src += (2 * src_stride);
3623 mask2, mask3, const20,
3625 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5. */
3626 LD_UB2(src, src_stride, inp0, inp1);
3627 src += (2 * src_stride);
3629 mask2, mask3, const20,
3631 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3633 horiz1, horiz2, horiz3, horiz4,
3634 horiz1, horiz0, horiz0, horiz1,
3635 horiz2, horiz3, horiz4, horiz5,
3636 const20, const6, const3);
/* avg0 = low doubleword of horiz1 (low half) and of horiz2 (high half). */
3637 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
/* Truncating unsigned byte average, (a + b) >> 1, of rows 1/2 with res0. */
3638 res0 = __msa_ave_u_b(avg0, res0);
/* Rows 6 and 7. */
3639 LD_UB2(src, src_stride, inp2, inp3);
3640 src += (2 * src_stride);
/* Advance dst by two rows; the store that precedes this is not shown. */
3642 dst += 2 * dst_stride;
3645 mask2, mask3, const20,
3647 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter producing res1; averaged with rows 3/4. */
3649 horiz3, horiz4, horiz5, horiz6,
3650 horiz3, horiz2, horiz1, horiz0,
3651 horiz4, horiz5, horiz6, horiz7,
3652 const20, const6, const3);
3653 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3654 res1 = __msa_ave_u_b(avg1, res1);
/* Row 8: only this fragment of its horizontal-filter call is visible. */
3657 mask2, mask3, const20,
3660 dst += 2 * dst_stride;
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
3663 horiz5, horiz6, horiz7, horiz8,
3664 horiz5, horiz4, horiz3, horiz2,
3665 horiz6, horiz7, horiz8, horiz8,
3666 const20, const6, const3);
/* Averaged with rows 5/6. */
3667 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3668 res0 = __msa_ave_u_b(avg0, res0);
/* Last vertical filter call; averaged with rows 7/8. */
3670 horiz7, horiz8, horiz8, horiz7,
3671 horiz7, horiz6, horiz5, horiz4,
3672 horiz8, horiz8, horiz7, horiz6,
3673 const20, const6, const3);
3675 dst += 2 * dst_stride;
3677 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3678 res1 = __msa_ave_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, most
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3698 v16u8 inp0, inp1, inp2, inp3;
3699 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3700 v16u8 horiz0, horiz1, horiz2, horiz3;
3701 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3702 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3703 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3704 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3705 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3706 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3707 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3708 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
3710 LD_UB2(src, src_stride, inp0, inp1);
3711 src += (2 * src_stride);
/* Middle of a horizontal-filter macro call; its head and tail are not shown. */
3713 mask2, mask3, const20,
/*
 * SLDI_B2_UB is defined outside this excerpt; with a count of 1 it presumably
 * moves each row by one byte so lane i holds source byte i + 1 - TODO confirm.
 */
3715 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack both rows: doubleword 1 of inp0 is replaced by doubleword 0 of inp1. */
3717 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
/* Truncating unsigned byte average, (a + b) >> 1, of source bytes and res0. */
3718 horiz0 = __msa_ave_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3719 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
3720 LD_UB2(src, src_stride, inp2, inp3);
3721 src += (2 * src_stride);
3723 mask2, mask3, const20,
3725 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3727 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3728 horiz2 = __msa_ave_u_b(inp2, res1);
3729 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
3730 LD_UB2(src, src_stride, inp0, inp1);
3731 src += (2 * src_stride);
3733 mask2, mask3, const20,
3736 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3737 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3738 horiz4 = __msa_ave_u_b(inp0, res0);
3739 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3741 horiz1, horiz2, horiz3, horiz4,
3742 horiz1, horiz0, horiz0, horiz1,
3743 horiz2, horiz3, horiz4, horiz5,
3744 const20, const6, const3);
/* avg0 = low doubleword of horiz1 (low half) and of horiz2 (high half),
 * i.e. the rows one below the two rows being produced. */
3745 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3746 res0 = __msa_ave_u_b(avg0, res0);
/* Advance dst by two rows; the store that precedes this is not shown. */
3748 dst += (2 * dst_stride);
/* Rows 6 and 7. */
3750 LD_UB2(src, src_stride, inp2, inp3);
3751 src += (2 * src_stride);
3753 mask2, mask3, const20,
3755 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3757 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3758 horiz6 = __msa_ave_u_b(inp2, res1);
3759 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter producing res1; averaged with rows 3/4. */
3761 horiz3, horiz4, horiz5, horiz6,
3762 horiz3, horiz2, horiz1, horiz0,
3763 horiz4, horiz5, horiz6, horiz7,
3764 const20, const6, const3);
3765 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3766 res1 = __msa_ave_u_b(avg1, res1);
3768 dst += (2 * dst_stride);
/* Row 8 (its load is not shown): single row, so no packing is needed. */
3772 mask2, mask3, const20,
/* Rotate inp0 by one byte: lane i receives the previous lane i + 1. */
3774 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3775 horiz8 = __msa_ave_u_b(inp0, res0);
/* Last two vertical filter calls; horiz8 repeats at the bottom edge. */
3777 horiz5, horiz6, horiz7, horiz8,
3778 horiz5, horiz4, horiz3, horiz2,
3779 horiz6, horiz7, horiz8, horiz8,
3780 const20, const6, const3);
3782 horiz7, horiz8, horiz8, horiz7,
3783 horiz7, horiz6, horiz5, horiz4,
3784 horiz8, horiz8, horiz7, horiz6,
3785 const20, const6, const3);
/* Average with rows 5/6 and rows 7/8 respectively. */
3786 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3787 res0 = __msa_ave_u_b(avg0, res0);
3788 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3789 res1 = __msa_ave_u_b(avg1, res1);
/* Store res0 and res1 to dst; ST8x4_UB is defined outside this excerpt and
 * presumably writes four 8-byte rows - TODO confirm. */
3790 ST8x4_UB(res0, res1, dst, dst_stride);
/*
 * NOTE(review): this excerpt omits the function signature, the braces and
 * the first line of every filter-macro call; the comments describe only the
 * statements that are visible here.
 */
3800 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Shuffle control listing the 16 byte lanes in reverse order (15 .. 0). */
3802 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Filter taps: 6 and 3 replicated per byte, 20 replicated per halfword. */
3803 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3804 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3805 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; src advances by four rows per iteration. */
3807 for (loop_count = (height >> 2); loop_count--;) {
/* inp0/2/4/6 are loaded starting at src, inp1/3/5/7 starting at src + 1. */
3808 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3809 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3810 src += (4 * src_stride);
/*
 * Each "const20, const6, const3);" line is the tail of a filter-macro call
 * whose head is not shown (presumably it assigns res). res is then combined
 * with the matching src load by __msa_aver_u_b, the rounding unsigned byte
 * average (a + b + 1) >> 1.
 */
3812 const20, const6, const3);
3813 res = __msa_aver_u_b(inp0, res);
3818 const20, const6, const3);
3819 res = __msa_aver_u_b(inp2, res);
3824 const20, const6, const3);
3825 res = __msa_aver_u_b(inp4, res);
3830 const20, const6, const3);
3831 res = __msa_aver_u_b(inp6, res);
/*
 * Same load pair (src and src + 1) for a single row; presumably this handles
 * one extra row after the loop - TODO confirm, the loop's closing brace and
 * the filter call for this row are not part of this excerpt.
 */
3836 LD_UB2(src, 1, inp0, inp1);
3838 res = __msa_aver_u_b(inp0, res);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3858 v16u8 inp0, inp1, inp2, inp3;
3859 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
3860 v16u8 horiz0, horiz1, horiz2, horiz3;
3861 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
3862 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3863 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3864 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3865 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
3866 v16u8 const20 = (v16u8) __msa_ldi_b(20);
3867 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3868 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0..3 are loaded together here. */
3870 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3871 src += (4 * src_stride);
/* Tails of two horizontal-filter macro calls (heads not shown); presumably
 * they assign res0 and res1. */
3873 const20, const6, const3);
3875 const20, const6, const3);
/* Pack the low doublewords: inp0's into the low half, inp1's into the high. */
3876 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
/* Rounding unsigned byte average, (a + b + 1) >> 1, of source and res0. */
3877 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
3878 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3879 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3880 horiz2 = __msa_aver_u_b(inp2, res1);
3881 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5. */
3882 LD_UB2(src, src_stride, inp0, inp1);
3883 src += (2 * src_stride);
3885 const20, const6, const3);
3886 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3887 horiz4 = __msa_aver_u_b(inp0, res0);
3888 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
3890 horiz1, horiz2, horiz3, horiz4,
3891 horiz1, horiz0, horiz0, horiz1,
3892 horiz2, horiz3, horiz4, horiz5,
3893 const20, const6, const3);
/* avg0 = low doubleword of horiz0 (low half) and of horiz1 (high half). */
3894 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
/* Rounding average of rows 0/1 with res0. */
3895 res0 = __msa_aver_u_b(avg0, res0);
/* Advance dst by two rows; the store that precedes this is not shown. */
3897 dst += (2 * dst_stride);
/* Rows 6 and 7. */
3899 LD_UB2(src, src_stride, inp2, inp3);
3900 src += (2 * src_stride);
3902 const20, const6, const3);
3903 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3904 horiz6 = __msa_aver_u_b(inp2, res1);
3905 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter producing res1; averaged with rows 2/3. */
3907 horiz3, horiz4, horiz5, horiz6,
3908 horiz3, horiz2, horiz1, horiz0,
3909 horiz4, horiz5, horiz6, horiz7,
3910 const20, const6, const3);
3911 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3912 res1 = __msa_aver_u_b(avg1, res1);
/* Row 8 (its load is not shown): averaged directly, no packing step. */
3916 const20, const6, const3);
3917 horiz8 = __msa_aver_u_b(inp0, res0);
3919 dst += 2 * dst_stride;
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
3922 horiz5, horiz6, horiz7, horiz8,
3923 horiz5, horiz4, horiz3, horiz2,
3924 horiz6, horiz7, horiz8, horiz8,
3925 const20, const6, const3);
/* Averaged with rows 4/5. */
3926 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3927 res0 = __msa_aver_u_b(avg0, res0);
/* Last vertical filter call; averaged with rows 6/7. */
3929 horiz7, horiz8, horiz8, horiz7,
3930 horiz7, horiz6, horiz5, horiz4,
3931 horiz8, horiz8, horiz7, horiz6,
3932 const20, const6, const3);
3934 dst += 2 * dst_stride;
3935 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3936 res1 = __msa_aver_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
3947 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Shuffle control listing the 16 byte lanes in reverse order (15 .. 0). */
3949 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Filter taps: 6 and 3 replicated per byte, 20 replicated per halfword. */
3950 v16u8 const6 = (v16u8) __msa_ldi_b(6);
3951 v16u8 const3 = (v16u8) __msa_ldi_b(3);
3952 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; src advances by four rows per iteration. */
3954 for (loop_count = (height >> 2); loop_count--;) {
/* inp0/2/4/6 are loaded starting at src, inp1/3/5/7 starting at src + 1. */
3955 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3956 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3957 src += (4 * src_stride);
/*
 * Tails of four filter-macro calls (heads not shown), one per loaded row
 * pair; no averaging statement is visible between them in this fragment.
 */
3959 const20, const6, const3);
3964 const20, const6, const3);
3969 const20, const6, const3);
3974 const20, const6, const3);
/*
 * Same load pair (src and src + 1) for a single row; presumably this handles
 * one extra row after the loop - TODO confirm, the loop's closing brace is
 * not part of this excerpt.
 */
3979 LD_UB2(src, 1, inp0, inp1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
4000 v16u8 inp0, inp1, inp2, inp3;
4001 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
4002 v16u8 horiz0, horiz1, horiz2, horiz3;
4003 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
4004 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4005 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4006 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4007 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
4008 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4009 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4010 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
4012 LD_UB2(src, src_stride, inp0, inp1);
4013 src += (2 * src_stride);
/*
 * Remaining arguments of a horizontal-filter macro call (head not shown);
 * presumably it assigns horiz0 directly, since no averaging step follows.
 */
4015 mask0, mask1, mask2, mask3,
4016 const20, const6, const3);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
4017 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3. */
4018 LD_UB2(src, src_stride, inp2, inp3);
4019 src += (2 * src_stride);
4021 mask0, mask1, mask2, mask3,
4022 const20, const6, const3);
4023 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5. */
4024 LD_UB2(src, src_stride, inp0, inp1);
4025 src += (2 * src_stride);
4027 mask0, mask1, mask2, mask3,
4028 const20, const6, const3);
4029 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
4031 horiz1, horiz2, horiz3, horiz4,
4032 horiz1, horiz0, horiz0, horiz1,
4033 horiz2, horiz3, horiz4, horiz5,
4034 const20, const6, const3);
/* avg0 = low doubleword of horiz0 (low half) and of horiz1 (high half). */
4035 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
/* Rounding unsigned byte average, (a + b + 1) >> 1, of rows 0/1 with res0. */
4036 res0 = __msa_aver_u_b(avg0, res0);
/* Advance dst by two rows; the store that precedes this is not shown. */
4038 dst += (2 * dst_stride);
/* Rows 6 and 7. */
4040 LD_UB2(src, src_stride, inp2, inp3);
4041 src += (2 * src_stride);
4043 mask0, mask1, mask2, mask3,
4044 const20, const6, const3);
4045 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter producing res1. */
4047 horiz3, horiz4, horiz5, horiz6,
4048 horiz3, horiz2, horiz1, horiz0,
4049 horiz4, horiz5, horiz6, horiz7,
4050 const20, const6, const3);
/* Row 8: tail of its horizontal-filter call (load and head not shown). */
4053 mask0, mask1, mask2, mask3,
4054 const20, const6, const3);
/* res1 averaged with rows 2/3. */
4055 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4056 res1 = __msa_aver_u_b(avg1, res1);
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
4058 horiz5, horiz6, horiz7, horiz8,
4059 horiz5, horiz4, horiz3, horiz2,
4060 horiz6, horiz7, horiz8, horiz8,
4061 const20, const6, const3);
4063 dst += 2 * dst_stride;
/* Averaged with rows 4/5. */
4065 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4066 res0 = __msa_aver_u_b(avg0, res0);
/* Last vertical filter call; averaged with rows 6/7. */
4068 horiz7, horiz8, horiz8, horiz7,
4069 horiz7, horiz6, horiz5, horiz4,
4070 horiz8, horiz8, horiz7, horiz6,
4071 const20, const6, const3);
4073 dst += 2 * dst_stride;
4074 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4075 res1 = __msa_aver_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces and
 * the first line of every filter-macro call; the comments describe only the
 * statements that are visible here.
 */
4086 v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
/* Shuffle control listing the 16 byte lanes in reverse order (15 .. 0). */
4088 v16u8
mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Filter taps: 6 and 3 replicated per byte, 20 replicated per halfword. */
4089 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4090 v16u8 const3 = (v16u8) __msa_ldi_b(3);
4091 v8u16 const20 = (v8u16) __msa_ldi_h(20);
/* height / 4 iterations; src advances by four rows per iteration. */
4093 for (loop_count = (height >> 2); loop_count--;) {
/* inp0/2/4/6 are loaded starting at src, inp1/3/5/7 starting at src + 1. */
4094 LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4095 LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4096 src += (4 * src_stride);
/*
 * Each "const20, const6, const3);" line is the tail of a filter-macro call
 * whose head is not shown (presumably it assigns res). res is then combined
 * with the matching src + 1 load by __msa_aver_u_b, the rounding unsigned
 * byte average (a + b + 1) >> 1.
 */
4098 const20, const6, const3);
4099 res = __msa_aver_u_b(res, inp1);
4104 const20, const6, const3);
4105 res = __msa_aver_u_b(res, inp3);
4110 const20, const6, const3);
4111 res = __msa_aver_u_b(res, inp5);
4116 const20, const6, const3);
4117 res = __msa_aver_u_b(res, inp7);
/*
 * Same load pair (src and src + 1) for a single row; presumably this handles
 * one extra row after the loop - TODO confirm, the loop's closing brace and
 * the filter call for this row are not part of this excerpt.
 */
4122 LD_UB2(src, 1, inp0, inp1);
4124 res = __msa_aver_u_b(inp1, res);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores and the first line of every filter-macro call; the comments describe
 * only the statements that are visible here.
 */
4144 v16u8 inp0, inp1, inp2, inp3;
4145 v16u8 res0, res1, avg0, avg1;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
4146 v16u8 horiz0, horiz1, horiz2, horiz3;
4147 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
4148 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4149 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4150 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4151 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
4152 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4153 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4154 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0..3 are loaded together here. */
4156 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4157 src += (4 * src_stride);
/* Tails of two horizontal-filter macro calls (heads not shown); presumably
 * they assign res0 and res1. */
4159 const20, const6, const3);
4161 const20, const6, const3);
/*
 * SLDI_B2_UB is defined outside this excerpt; with a count of 1 it presumably
 * moves each row by one byte so lane i holds source byte i + 1 - TODO confirm.
 */
4162 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack both rows: doubleword 1 of inp0 is replaced by doubleword 0 of inp1. */
4164 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
/* Rounding unsigned byte average, (a + b + 1) >> 1, of source and res0. */
4165 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
4166 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4167 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4169 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4170 horiz2 = __msa_aver_u_b(inp2, res1);
4171 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4..7, same sequence. */
4172 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4173 src += (4 * src_stride);
4175 const20, const6, const3);
4177 const20, const6, const3);
4178 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4180 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4181 horiz4 = __msa_aver_u_b(inp0, res0);
4182 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4183 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4185 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4186 horiz6 = __msa_aver_u_b(inp2, res1);
4187 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
4189 horiz1, horiz2, horiz3, horiz4,
4190 horiz1, horiz0, horiz0, horiz1,
4191 horiz2, horiz3, horiz4, horiz5,
4192 const20, const6, const3);
/* avg0 = low doubleword of horiz0 (low half) and of horiz1 (high half). */
4193 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
/* Rounding average of rows 0/1 with res0. */
4194 res0 = __msa_aver_u_b(avg0, res0);
/* Vertical filter producing res1. */
4196 horiz3, horiz4, horiz5, horiz6,
4197 horiz3, horiz2, horiz1, horiz0,
4198 horiz4, horiz5, horiz6, horiz7,
4199 const20, const6, const3);
/* Advance dst by two rows; the store that precedes this is not shown. */
4201 dst += 2 * dst_stride;
/* Row 8: tail of its horizontal-filter call (load and head not shown). */
4205 const20, const6, const3);
/* res1 averaged with rows 2/3. */
4206 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4207 res1 = __msa_aver_u_b(avg1, res1);
/* Rotate inp0 by one byte: lane i receives the previous lane i + 1. */
4208 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4209 horiz8 = __msa_aver_u_b(inp0, res0);
/* Vertical filter producing res0; horiz8 repeats at the bottom edge. */
4211 horiz5, horiz6, horiz7, horiz8,
4212 horiz5, horiz4, horiz3, horiz2,
4213 horiz6, horiz7, horiz8, horiz8,
4214 const20, const6, const3);
4216 dst += 2 * dst_stride;
/* Averaged with rows 4/5. */
4218 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4219 res0 = __msa_aver_u_b(avg0, res0);
/* Last vertical filter call; averaged with rows 6/7. */
4221 horiz7, horiz8, horiz8, horiz7,
4222 horiz7, horiz6, horiz5, horiz4,
4223 horiz8, horiz8, horiz7, horiz6,
4224 const20, const6, const3);
4226 dst += 2 * dst_stride;
4228 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4229 res1 = __msa_aver_u_b(avg1, res1);
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores, some declarations and the first line of every filter-macro call;
 * the comments describe only the statements that are visible here.
 */
4249 v16u8 inp0, inp1, inp2, inp3;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
4251 v16u8 horiz0, horiz1, horiz2, horiz3;
4252 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
4253 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4254 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4255 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4256 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
4257 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4258 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4259 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
4261 LD_UB2(src, src_stride, inp0, inp1);
4262 src += (2 * src_stride);
/* Tail of a horizontal-filter macro call (head not shown); presumably it
 * assigns res0. */
4264 const20, const6, const3);
/* Pack the low doublewords: inp0's into the low half, inp1's into the high. */
4265 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
/* Rounding unsigned byte average, (a + b + 1) >> 1, of source and res0. */
4266 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
4267 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3, same sequence. */
4269 LD_UB2(src, src_stride, inp2, inp3);
4270 src += (2 * src_stride);
4272 const20, const6, const3);
4273 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4274 horiz2 = __msa_aver_u_b(inp2, res1);
4275 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5, same sequence. */
4276 LD_UB2(src, src_stride, inp0, inp1);
4277 src += (2 * src_stride);
4279 const20, const6, const3);
4280 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4281 horiz4 = __msa_aver_u_b(inp0, res0);
4282 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
4284 horiz1, horiz2, horiz3, horiz4,
4285 horiz1, horiz0, horiz0, horiz1,
4286 horiz2, horiz3, horiz4, horiz5,
4287 const20, const6, const3);
/* Advance dst by two rows; the store that precedes this is not shown. */
4289 dst += (2 * dst_stride);
/* Rows 6 and 7. */
4291 LD_UB2(src, src_stride, inp2, inp3);
4292 src += (2 * src_stride);
4294 const20, const6, const3);
4295 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4296 horiz6 = __msa_aver_u_b(inp2, res1);
4297 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter calls for the remaining output rows; no averaging with
 * the horizN rows follows them in this fragment. */
4299 horiz3, horiz4, horiz5, horiz6,
4300 horiz3, horiz2, horiz1, horiz0,
4301 horiz4, horiz5, horiz6, horiz7,
4302 const20, const6, const3);
/* Row 8 (its load is not shown): averaged directly, no packing step. */
4305 const20, const6, const3);
4306 horiz8 = __msa_aver_u_b(inp0, res0);
/* horiz8 repeats at the bottom edge of the argument lists below. */
4308 horiz5, horiz6, horiz7, horiz8,
4309 horiz5, horiz4, horiz3, horiz2,
4310 horiz6, horiz7, horiz8, horiz8,
4311 const20, const6, const3);
4313 dst += 2 * dst_stride;
4316 horiz7, horiz8, horiz8, horiz7,
4317 horiz7, horiz6, horiz5, horiz4,
4318 horiz8, horiz8, horiz7, horiz6,
4319 const20, const6, const3);
4321 dst += 2 * dst_stride;
/*
 * NOTE(review): this excerpt omits the function signature, the braces, the
 * stores, some declarations and the first line of every filter-macro call;
 * the comments describe only the statements that are visible here.
 */
4339 v16u8 inp0, inp1, inp2, inp3;
/* horizN holds filtered row N; nine rows (0..8) are prepared in total. */
4341 v16u8 horiz0, horiz1, horiz2, horiz3;
4342 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/*
 * Shuffle controls: maskK lists, for each of the eight output pixels i, the
 * byte pair (i - K, i + K + 1); indices that would fall outside 0..8 are
 * reflected back (-1 -> 0, -2 -> 1, -3 -> 2, 9 -> 8, 10 -> 7, 11 -> 6).
 */
4343 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4344 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4345 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4346 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter taps 20, 6 and 3, each replicated into every byte lane. */
4347 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4348 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4349 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Rows 0 and 1. */
4351 LD_UB2(src, src_stride, inp0, inp1);
4352 src += (2 * src_stride);
/*
 * Remaining arguments of a horizontal-filter macro call (head not shown);
 * presumably it assigns horiz0 directly, since no averaging step follows.
 */
4354 mask0, mask1, mask2, mask3,
4355 const20, const6, const3);
/* horiz1 = upper doubleword of horiz0 broadcast to both halves. */
4356 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Rows 2 and 3. */
4357 LD_UB2(src, src_stride, inp2, inp3);
4358 src += (2 * src_stride);
4360 mask0, mask1, mask2, mask3,
4361 const20, const6, const3);
4362 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Rows 4 and 5. */
4363 LD_UB2(src, src_stride, inp0, inp1);
4364 src += (2 * src_stride);
4366 mask0, mask1, mask2, mask3,
4367 const20, const6, const3);
4368 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/*
 * Remaining arguments of a vertical-filter macro call (head not shown). Row
 * names repeat at the top edge (horiz0, horiz0, horiz1), which presumably
 * mirrors rows above the block - TODO confirm against the macro definition.
 */
4370 horiz1, horiz2, horiz3, horiz4,
4371 horiz1, horiz0, horiz0, horiz1,
4372 horiz2, horiz3, horiz4, horiz5,
4373 const20, const6, const3);
/* Advance dst by two rows; the store that precedes this is not shown. */
4375 dst += (2 * dst_stride);
/* Rows 6 and 7. */
4377 LD_UB2(src, src_stride, inp2, inp3);
4378 src += (2 * src_stride);
4380 mask0, mask1, mask2, mask3,
4381 const20, const6, const3);
4382 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Vertical filter calls for the remaining output rows. */
4384 horiz3, horiz4, horiz5, horiz6,
4385 horiz3, horiz2, horiz1, horiz0,
4386 horiz4, horiz5, horiz6, horiz7,
4387 const20, const6, const3);
/* Row 8: tail of its horizontal-filter call (load and head not shown). */
4390 mask0, mask1, mask2, mask3,
4391 const20, const6, const3);
4393 dst += 2 * dst_stride;
/* horiz8 repeats at the bottom edge of the argument lists below. */
4396 horiz5, horiz6, horiz7, horiz8,
4397 horiz5, horiz4, horiz3, horiz2,
4398 horiz6, horiz7, horiz8, horiz8,
4399 const20, const6, const3);
4401 horiz7, horiz8, horiz8, horiz7,
4402 horiz7, horiz6, horiz5, horiz4,
4403 horiz8, horiz8, horiz7, horiz6,
4404 const20, const6, const3);
4406 dst += 2 * dst_stride;
/* Working vectors of this routine. Its signature, the declaration of
 * res0/res1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
4426 v16u8 inp0, inp1, inp2, inp3;
4428 v16u8 horiz0, horiz1, horiz2, horiz3;
4429 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
4430 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4431 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4432 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4433 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4434 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4435 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4436 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load four vectors from src (presumably four rows, src_stride apart) and
 * step src past them. */
4438 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4439 src += (4 * src_stride);
4442 const20, const6, const3);
4444 const20, const6, const3);
/* Presumably slides inp0 and inp1 by one byte each (helper macro defined
 * elsewhere) - TODO confirm operand order. */
4445 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Copy the low 64-bit element of inp1 into the upper element of inp0, then
 * take the rounded unsigned byte average with res0. */
4447 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4448 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
4449 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Same slide / pack / average sequence for inp2, inp3 and res1. */
4450 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4452 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4453 horiz2 = __msa_aver_u_b(inp2, res1);
4454 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Next four source vectors; produces horiz4..horiz7 the same way. */
4455 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4456 src += (4 * src_stride);
4458 const20, const6, const3);
4460 const20, const6, const3);
4461 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4463 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4464 horiz4 = __msa_aver_u_b(inp0, res0);
4465 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4466 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4468 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4469 horiz6 = __msa_aver_u_b(inp2, res1);
4470 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4473 const20, const6, const3);
/* horiz8: inp0 slid by one byte, averaged with res0. The statement that
 * sets inp0 for this step is not visible in this excerpt. */
4474 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4475 horiz8 = __msa_aver_u_b(inp0, res0);
4477 horiz1, horiz2, horiz3, horiz4,
4478 horiz1, horiz0, horiz0, horiz1,
4479 horiz2, horiz3, horiz4, horiz5,
4480 const20, const6, const3);
/* Step dst forward by two dst_stride steps. */
4482 dst += (2 * dst_stride);
4485 horiz3, horiz4, horiz5, horiz6,
4486 horiz3, horiz2, horiz1, horiz0,
4487 horiz4, horiz5, horiz6, horiz7,
4488 const20, const6, const3);
4490 dst += (2 * dst_stride);
4493 horiz5, horiz6, horiz7, horiz8,
4494 horiz5, horiz4, horiz3, horiz2,
4495 horiz6, horiz7, horiz8, horiz8,
4496 const20, const6, const3);
4498 dst += (2 * dst_stride);
4501 horiz7, horiz8, horiz8, horiz7,
4502 horiz7, horiz6, horiz5, horiz4,
4503 horiz8, horiz8, horiz7, horiz6,
4504 const20, const6, const3);
/* Working vectors of this routine. Its signature and the first lines of the
 * multi-line filter calls are not part of this excerpt. res0/res1 and
 * avg0/avg1 are the operand pairs of the averaging steps below. */
4524 v16u8 inp0, inp1, inp2, inp3;
4525 v16u8 res0, res1, avg0, avg1;
4526 v16u8 horiz0, horiz1, horiz2, horiz3;
4527 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
4528 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4529 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4530 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4531 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4532 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4533 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4534 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load four vectors from src (presumably four rows, src_stride apart) and
 * step src past them. */
4536 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4537 src += (4 * src_stride);
4540 const20, const6, const3);
4542 const20, const6, const3);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
4543 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4544 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
4545 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4546 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4547 horiz2 = __msa_aver_u_b(inp2, res1);
4548 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4549 LD_UB2(src, src_stride, inp0, inp1);
4550 src += (2 * src_stride);
4553 const20, const6, const3);
4554 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4555 horiz4 = __msa_aver_u_b(inp0, res0);
4556 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4558 horiz1, horiz2, horiz3, horiz4,
4559 horiz1, horiz0, horiz0, horiz1,
4560 horiz2, horiz3, horiz4, horiz5,
4561 const20, const6, const3);
/* avg0 = low half of horiz1 with the low half of horiz2 inserted as its
 * upper element; res0 becomes the rounded average of avg0 and res0. */
4562 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4563 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
4565 dst += (2 * dst_stride);
4567 LD_UB2(src, src_stride, inp2, inp3);
4568 src += (2 * src_stride);
4570 const20, const6, const3);
4571 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4572 horiz6 = __msa_aver_u_b(inp2, res1);
4573 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4576 const20, const6, const3);
/* The statement that sets inp0 for this step is not visible here. */
4577 horiz8 = __msa_aver_u_b(inp0, res0);
4579 horiz3, horiz4, horiz5, horiz6,
4580 horiz3, horiz2, horiz1, horiz0,
4581 horiz4, horiz5, horiz6, horiz7,
4582 const20, const6, const3);
/* Same blend as above, using horiz3/horiz4 and res1. */
4583 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4584 res1 = __msa_aver_u_b(avg1, res1);
4586 horiz5, horiz6, horiz7, horiz8,
4587 horiz5, horiz4, horiz3, horiz2,
4588 horiz6, horiz7, horiz8, horiz8,
4589 const20, const6, const3);
4591 dst += 2 * dst_stride;
4593 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4594 res0 = __msa_aver_u_b(avg0, res0);
4596 horiz7, horiz8, horiz8, horiz7,
4597 horiz7, horiz6, horiz5, horiz4,
4598 horiz8, horiz8, horiz7, horiz6,
4599 const20, const6, const3);
4601 dst += 2 * dst_stride;
4603 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4604 res1 = __msa_aver_u_b(avg1, res1);
4606 dst += (2 * dst_stride);
/* Working vectors of this routine. Its signature and the first lines of the
 * multi-line filter calls are not part of this excerpt. res0/res1 and
 * avg0/avg1 are the operand pairs of the averaging steps below. */
4625 v16u8 inp0, inp1, inp2, inp3;
4626 v16u8 res0, res1, avg0, avg1;
4627 v16u8 horiz0, horiz1, horiz2, horiz3;
4628 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8) passed as the mask arguments of the
 * calls below. */
4629 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4630 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4631 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4632 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4633 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4634 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4635 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
4637 LD_UB2(src, src_stride, inp0, inp1);
4638 src += (2 * src_stride);
4640 mask0, mask1, mask2, mask3,
4641 const20, const6, const3);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves; the
 * same even/odd split is repeated for horiz2/3, 4/5 and 6/7. */
4642 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4643 LD_UB2(src, src_stride, inp2, inp3);
4644 src += (2 * src_stride);
4646 mask0, mask1, mask2, mask3,
4647 const20, const6, const3);
4648 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4649 LD_UB2(src, src_stride, inp0, inp1);
4650 src += (2 * src_stride);
4652 mask0, mask1, mask2, mask3,
4653 const20, const6, const3);
/* NOTE(review): the next two statements are identical; the second
 * assignment to horiz5 is redundant and could be dropped. */
4654 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4655 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4657 horiz1, horiz2, horiz3, horiz4,
4658 horiz1, horiz0, horiz0, horiz1,
4659 horiz2, horiz3, horiz4, horiz5,
4660 const20, const6, const3);
/* avg0 = low half of horiz1 with the low half of horiz2 inserted as its
 * upper element; res0 becomes the rounded average of avg0 and res0. */
4661 avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4662 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
4664 dst += (2 * dst_stride);
4666 LD_UB2(src, src_stride, inp2, inp3);
4667 src += (2 * src_stride);
4669 mask0, mask1, mask2, mask3,
4670 const20, const6, const3);
4671 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4673 horiz3, horiz4, horiz5, horiz6,
4674 horiz3, horiz2, horiz1, horiz0,
4675 horiz4, horiz5, horiz6, horiz7,
4676 const20, const6, const3);
4679 mask0, mask1, mask2, mask3,
4680 const20, const6, const3);
/* Same blend as above, using horiz3/horiz4 and res1. */
4681 avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4682 res1 = __msa_aver_u_b(avg1, res1);
4684 horiz5, horiz6, horiz7, horiz8,
4685 horiz5, horiz4, horiz3, horiz2,
4686 horiz6, horiz7, horiz8, horiz8,
4687 const20, const6, const3);
4689 dst += 2 * dst_stride;
4690 avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4691 res0 = __msa_aver_u_b(avg0, res0);
4694 horiz7, horiz8, horiz8, horiz7,
4695 horiz7, horiz6, horiz5, horiz4,
4696 horiz8, horiz8, horiz7, horiz6,
4697 const20, const6, const3);
4699 dst += 2 * dst_stride;
4700 avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4701 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature and the first lines of the
 * multi-line filter calls are not part of this excerpt. res0/res1 and
 * avg0/avg1 are the operand pairs of the averaging steps below. */
4720 v16u8 inp0, inp1, inp2, inp3;
4721 v16u8 res0, res1, avg0, avg1;
4722 v16u8 horiz0, horiz1, horiz2, horiz3;
4723 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); passed as mask arguments in the one
 * call tail visible below, presumably in the others too - TODO confirm. */
4724 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4725 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4726 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4727 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4728 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4729 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4730 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load four vectors from src (presumably four rows, src_stride apart) and
 * step src past them. */
4732 LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4733 src += (4 * src_stride);
4735 mask0, mask1, mask2, mask3,
4736 const20, const6, const3);
/* Presumably slides inp0 and inp1 by one byte each (helper macro defined
 * elsewhere) - TODO confirm operand order. */
4737 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Copy the low 64-bit element of inp1 into the upper element of inp0, then
 * take the rounded unsigned byte average with res0. */
4739 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4740 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
4741 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4743 const20, const6, const3);
4744 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4746 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4747 horiz2 = __msa_aver_u_b(inp2, res1);
4748 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4749 LD_UB2(src, src_stride, inp0, inp1);
4750 src += (2 * src_stride);
4752 const20, const6, const3);
4753 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4755 inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4756 horiz4 = __msa_aver_u_b(inp0, res0);
4757 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4759 horiz1, horiz2, horiz3, horiz4,
4760 horiz1, horiz0, horiz0, horiz1,
4761 horiz2, horiz3, horiz4, horiz5,
4762 const20, const6, const3);
/* avg0 packs the low 64-bit halves of horiz1 and horiz2; res0 becomes the
 * rounded average of avg0 and res0. */
4763 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4764 res0 = __msa_aver_u_b(avg0, res0);
4765 LD_UB2(src, src_stride, inp2, inp3);
4766 src += (2 * src_stride);
/* Step dst forward by two dst_stride steps. */
4768 dst += 2 * dst_stride;
4771 const20, const6, const3);
4772 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4774 inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4775 horiz6 = __msa_aver_u_b(inp2, res1);
4776 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4778 horiz3, horiz4, horiz5, horiz6,
4779 horiz3, horiz2, horiz1, horiz0,
4780 horiz4, horiz5, horiz6, horiz7,
4781 const20, const6, const3);
/* Same blend as above, using horiz3/horiz4 and res1. */
4782 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4783 res1 = __msa_aver_u_b(avg1, res1);
4786 const20, const6, const3);
/* horiz8: inp0 slid by one byte, averaged with res0. The statement that
 * sets inp0 for this step is not visible in this excerpt. */
4787 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4788 horiz8 = __msa_aver_u_b(inp0, res0);
4790 horiz5, horiz6, horiz7, horiz8,
4791 horiz5, horiz4, horiz3, horiz2,
4792 horiz6, horiz7, horiz8, horiz8,
4793 const20, const6, const3);
4795 dst += 2 * dst_stride;
4797 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4798 res0 = __msa_aver_u_b(avg0, res0);
4800 horiz7, horiz8, horiz8, horiz7,
4801 horiz7, horiz6, horiz5, horiz4,
4802 horiz8, horiz8, horiz7, horiz6,
4803 const20, const6, const3);
4805 dst += 2 * dst_stride;
4807 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4808 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
4828 v16u8 inp0, inp1, inp2, inp3;
4829 v16u8 res0, res1, avg0, avg1;
4830 v16u8 horiz0, horiz1, horiz2, horiz3;
4831 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
4833 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4834 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4835 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4836 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4837 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4838 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4839 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
4841 LD_UB2(src, src_stride, inp0, inp1);
4842 src += (2 * src_stride);
4844 const20, const6, const3);
4845 LD_UB2(src, src_stride, inp2, inp3);
4846 src += (2 * src_stride);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
4847 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4848 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
4849 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4851 const20, const6, const3);
4852 LD_UB2(src, src_stride, inp0, inp1);
4853 src += (2 * src_stride);
4854 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4855 horiz2 = __msa_aver_u_b(inp2, res1);
4856 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4858 const20, const6, const3);
4859 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4860 horiz4 = __msa_aver_u_b(inp0, res0);
4861 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Load two vectors of existing data from dst, and pack the low halves of
 * horiz0 and horiz1 into avg0 for the first blend below. */
4862 LD_UB2(dst, dst_stride, dst0, dst1);
4863 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4865 horiz1, horiz2, horiz3, horiz4,
4866 horiz1, horiz0, horiz0, horiz1,
4867 horiz2, horiz3, horiz4, horiz5,
4868 const20, const6, const3);
/* Two successive rounded averages: first with the packed horiz vectors,
 * then with the packed low halves of dst0/dst1. */
4869 res0 = __msa_aver_u_b(avg0, res0);
4870 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4871 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
4873 dst += (2 * dst_stride);
4875 LD_UB2(src, src_stride, inp2, inp3);
4876 src += (2 * src_stride);
4878 const20, const6, const3);
4879 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4880 horiz6 = __msa_aver_u_b(inp2, res1);
4881 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4882 LD_UB2(dst, dst_stride, dst0, dst1);
4883 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4885 horiz3, horiz4, horiz5, horiz6,
4886 horiz3, horiz2, horiz1, horiz0,
4887 horiz4, horiz5, horiz6, horiz7,
4888 const20, const6, const3);
4889 res1 = __msa_aver_u_b(avg1, res1);
4890 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4891 res1 = __msa_aver_u_b(avg1, res1);
4893 dst += (2 * dst_stride);
4897 const20, const6, const3);
/* The statement that sets inp0 for this step is not visible here. */
4898 horiz8 = __msa_aver_u_b(inp0, res0);
4899 LD_UB2(dst, dst_stride, dst0, dst1);
4900 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4902 horiz5, horiz6, horiz7, horiz8,
4903 horiz5, horiz4, horiz3, horiz2,
4904 horiz6, horiz7, horiz8, horiz8,
4905 const20, const6, const3);
4906 res0 = __msa_aver_u_b(avg0, res0);
4907 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4908 res0 = __msa_aver_u_b(avg0, res0);
4910 dst += (2 * dst_stride);
4912 LD_UB2(dst, dst_stride, dst0, dst1);
4913 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4915 horiz7, horiz8, horiz8, horiz7,
4916 horiz7, horiz6, horiz5, horiz4,
4917 horiz8, horiz8, horiz7, horiz6,
4918 const20, const6, const3);
4919 res1 = __msa_aver_u_b(avg1, res1);
4920 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4921 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
4941 v16u8 inp0, inp1, inp2, inp3;
4942 v16u8 res0, res1, avg0, avg1;
4943 v16u8 horiz0, horiz1, horiz2, horiz3;
4944 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8) passed as the mask arguments of the
 * calls below. */
4946 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4947 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4948 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4949 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
4950 v16u8 const20 = (v16u8) __msa_ldi_b(20);
4951 v16u8 const6 = (v16u8) __msa_ldi_b(6);
4952 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
4954 LD_UB2(src, src_stride, inp0, inp1);
4955 src += (2 * src_stride);
4957 mask0, mask1, mask2, mask3,
4958 const20, const6, const3);
4959 LD_UB2(src, src_stride, inp2, inp3);
4960 src += (2 * src_stride);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves; the
 * same even/odd split is repeated for horiz2/3, 4/5 and 6/7. */
4961 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4963 mask0, mask1, mask2, mask3,
4964 const20, const6, const3);
4965 LD_UB2(src, src_stride, inp0, inp1);
4966 src += (2 * src_stride);
4967 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4969 mask0, mask1, mask2, mask3,
4970 const20, const6, const3);
4971 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Load two vectors of existing data from dst, and pack the low halves of
 * horiz0 and horiz1 into avg0 for the first blend below. */
4972 LD_UB2(dst, dst_stride, dst0, dst1);
4973 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4975 horiz1, horiz2, horiz3, horiz4,
4976 horiz1, horiz0, horiz0, horiz1,
4977 horiz2, horiz3, horiz4, horiz5,
4978 const20, const6, const3);
/* Two successive rounded averages: first with the packed horiz vectors,
 * then with the packed low halves of dst0/dst1. */
4979 res0 = __msa_aver_u_b(avg0, res0);
4980 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4981 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
4983 dst += (2 * dst_stride);
4985 LD_UB2(src, src_stride, inp2, inp3);
4986 src += (2 * src_stride);
4988 mask0, mask1, mask2, mask3,
4989 const20, const6, const3);
4990 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4991 LD_UB2(dst, dst_stride, dst0, dst1);
4992 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4994 horiz3, horiz4, horiz5, horiz6,
4995 horiz3, horiz2, horiz1, horiz0,
4996 horiz4, horiz5, horiz6, horiz7,
4997 const20, const6, const3);
4998 res1 = __msa_aver_u_b(avg1, res1);
4999 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5000 res1 = __msa_aver_u_b(avg1, res1);
5002 dst += (2 * dst_stride);
5006 mask0, mask1, mask2, mask3,
5007 const20, const6, const3);
5008 LD_UB2(dst, dst_stride, dst0, dst1);
5009 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5011 horiz5, horiz6, horiz7, horiz8,
5012 horiz5, horiz4, horiz3, horiz2,
5013 horiz6, horiz7, horiz8, horiz8,
5014 const20, const6, const3);
5015 res0 = __msa_aver_u_b(avg0, res0);
5016 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5017 res0 = __msa_aver_u_b(avg0, res0);
5019 dst += (2 * dst_stride);
5021 LD_UB2(dst, dst_stride, dst0, dst1);
5022 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5024 horiz7, horiz8, horiz8, horiz7,
5025 horiz7, horiz6, horiz5, horiz4,
5026 horiz8, horiz8, horiz7, horiz6,
5027 const20, const6, const3);
5028 res1 = __msa_aver_u_b(avg1, res1);
5029 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5030 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
5050 v16u8 inp0, inp1, inp2, inp3;
5051 v16u8 res0, res1, avg0, avg1;
5052 v16u8 horiz0, horiz1, horiz2, horiz3;
5053 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
5055 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5056 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5057 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5058 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
5059 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5060 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5061 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
5063 LD_UB2(src, src_stride, inp0, inp1);
5064 src += (2 * src_stride);
5066 const20, const6, const3);
5068 LD_UB2(src, src_stride, inp2, inp3);
5069 src += (2 * src_stride);
/* Presumably slides inp0 and inp1 by one byte each (helper macro defined
 * elsewhere) - TODO confirm operand order. */
5070 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
5072 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5073 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
5074 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5076 const20, const6, const3);
5077 LD_UB2(src, src_stride, inp0, inp1);
5078 src += (2 * src_stride);
5079 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5081 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5082 horiz2 = __msa_aver_u_b(inp2, res1);
5083 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5085 const20, const6, const3);
5087 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5089 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5090 horiz4 = __msa_aver_u_b(inp0, res0);
5091 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Load two vectors of existing data from dst, and pack the low halves of
 * horiz0 and horiz1 into avg0 for the first blend below. */
5092 LD_UB2(dst, dst_stride, dst0, dst1);
5093 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5095 horiz1, horiz2, horiz3, horiz4,
5096 horiz1, horiz0, horiz0, horiz1,
5097 horiz2, horiz3, horiz4, horiz5,
5098 const20, const6, const3);
/* Two successive rounded averages: first with the packed horiz vectors,
 * then with the packed low halves of dst0/dst1. */
5099 res0 = __msa_aver_u_b(avg0, res0);
5100 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5101 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
5103 dst += (2 * dst_stride);
5105 LD_UB2(src, src_stride, inp2, inp3);
5106 src += (2 * src_stride);
5108 const20, const6, const3);
5110 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5112 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5113 horiz6 = __msa_aver_u_b(inp2, res1);
5114 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5115 LD_UB2(dst, dst_stride, dst0, dst1);
5116 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5118 horiz3, horiz4, horiz5, horiz6,
5119 horiz3, horiz2, horiz1, horiz0,
5120 horiz4, horiz5, horiz6, horiz7,
5121 const20, const6, const3);
5122 res1 = __msa_aver_u_b(avg1, res1);
5123 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5124 res1 = __msa_aver_u_b(avg1, res1);
5126 dst += (2 * dst_stride);
5130 const20, const6, const3);
/* horiz8: inp0 slid by one byte, averaged with res0. The statement that
 * sets inp0 for this step is not visible in this excerpt. */
5131 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5132 horiz8 = __msa_aver_u_b(inp0, res0);
5133 LD_UB2(dst, dst_stride, dst0, dst1);
5134 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5136 horiz5, horiz6, horiz7, horiz8,
5137 horiz5, horiz4, horiz3, horiz2,
5138 horiz6, horiz7, horiz8, horiz8,
5139 const20, const6, const3);
5140 res0 = __msa_aver_u_b(avg0, res0);
5141 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5142 res0 = __msa_aver_u_b(avg0, res0);
5144 dst += (2 * dst_stride);
5146 LD_UB2(dst, dst_stride, dst0, dst1);
5147 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5149 horiz7, horiz8, horiz8, horiz7,
5150 horiz7, horiz6, horiz5, horiz4,
5151 horiz8, horiz8, horiz7, horiz6,
5152 const20, const6, const3);
5153 res1 = __msa_aver_u_b(avg1, res1);
5154 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5155 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
5175 v16u8 inp0, inp1, inp2, inp3;
5176 v16u8 res0, res1, avg0, avg1;
5177 v16u8 horiz0, horiz1, horiz2, horiz3;
5178 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
5180 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5181 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5182 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5183 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
5184 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5185 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5186 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
5188 LD_UB2(src, src_stride, inp0, inp1);
5189 src += (2 * src_stride);
5191 const20, const6, const3);
5192 LD_UB2(src, src_stride, inp2, inp3);
5193 src += (2 * src_stride);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
5194 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5195 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
5196 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5198 const20, const6, const3);
5199 LD_UB2(src, src_stride, inp0, inp1);
5200 src += (2 * src_stride);
5201 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5202 horiz2 = __msa_aver_u_b(inp2, res1);
5203 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5205 const20, const6, const3);
5206 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5207 horiz4 = __msa_aver_u_b(inp0, res0);
5208 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Load two vectors of existing data from dst for the blend below. */
5209 LD_UB2(dst, dst_stride, dst0, dst1);
5211 horiz1, horiz2, horiz3, horiz4,
5212 horiz1, horiz0, horiz0, horiz1,
5213 horiz2, horiz3, horiz4, horiz5,
5214 const20, const6, const3);
/* Pack the low halves of dst0/dst1 and take the rounded average with
 * res0. */
5215 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5216 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
5218 dst += (2 * dst_stride);
5220 LD_UB2(src, src_stride, inp2, inp3);
5221 src += (2 * src_stride);
5223 const20, const6, const3);
5224 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5225 horiz6 = __msa_aver_u_b(inp2, res1);
5226 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5227 LD_UB2(dst, dst_stride, dst0, dst1);
5229 horiz3, horiz4, horiz5, horiz6,
5230 horiz3, horiz2, horiz1, horiz0,
5231 horiz4, horiz5, horiz6, horiz7,
5232 const20, const6, const3);
5233 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5234 res1 = __msa_aver_u_b(avg1, res1);
5236 dst += (2 * dst_stride);
5240 const20, const6, const3);
/* The statement that sets inp0 for this step is not visible here. */
5241 horiz8 = __msa_aver_u_b(inp0, res0);
5242 LD_UB2(dst, dst_stride, dst0, dst1);
5244 horiz5, horiz6, horiz7, horiz8,
5245 horiz5, horiz4, horiz3, horiz2,
5246 horiz6, horiz7, horiz8, horiz8,
5247 const20, const6, const3);
5248 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5249 res0 = __msa_aver_u_b(avg0, res0);
5251 dst += (2 * dst_stride);
5253 LD_UB2(dst, dst_stride, dst0, dst1);
5255 horiz7, horiz8, horiz8, horiz7,
5256 horiz7, horiz6, horiz5, horiz4,
5257 horiz8, horiz8, horiz7, horiz6,
5258 const20, const6, const3);
5259 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5260 res1 = __msa_aver_u_b(avg1, res1);
5262 dst += (2 * dst_stride);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
5278 v16u8 inp0, inp1, inp2, inp3;
5279 v16u8 res0, res1, avg0, avg1;
5280 v16u8 horiz0, horiz1, horiz2, horiz3;
5281 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8) passed as the mask arguments of the
 * calls below. */
5283 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5284 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5285 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5286 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
5287 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5288 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5289 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
5291 LD_UB2(src, src_stride, inp0, inp1);
5292 src += (2 * src_stride);
5294 mask0, mask1, mask2, mask3,
5295 const20, const6, const3);
5296 LD_UB2(src, src_stride, inp2, inp3);
5297 src += (2 * src_stride);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves; the
 * same even/odd split is repeated for horiz2/3, 4/5 and 6/7. */
5298 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5300 mask0, mask1, mask2, mask3,
5301 const20, const6, const3);
5302 LD_UB2(src, src_stride, inp0, inp1);
5303 src += (2 * src_stride);
5304 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5306 mask0, mask1, mask2, mask3,
5307 const20, const6, const3);
5308 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5309 LD_UB2(src, src_stride, inp2, inp3);
5310 src += (2 * src_stride);
5312 mask0, mask1, mask2, mask3,
5313 const20, const6, const3);
5314 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5317 mask0, mask1, mask2, mask3,
5318 const20, const6, const3);
/* Load two vectors of existing data from dst for the blend below. */
5319 LD_UB2(dst, dst_stride, dst0, dst1);
5321 horiz1, horiz2, horiz3, horiz4,
5322 horiz1, horiz0, horiz0, horiz1,
5323 horiz2, horiz3, horiz4, horiz5,
5324 const20, const6, const3);
/* Pack the low halves of dst0/dst1 and take the rounded average with
 * res0. */
5325 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5326 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
5328 dst += (2 * dst_stride);
5330 LD_UB2(dst, dst_stride, dst0, dst1);
5332 horiz3, horiz4, horiz5, horiz6,
5333 horiz3, horiz2, horiz1, horiz0,
5334 horiz4, horiz5, horiz6, horiz7,
5335 const20, const6, const3);
5336 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5337 res1 = __msa_aver_u_b(avg1, res1);
5339 dst += (2 * dst_stride);
5341 LD_UB2(dst, dst_stride, dst0, dst1);
5343 horiz5, horiz6, horiz7, horiz8,
5344 horiz5, horiz4, horiz3, horiz2,
5345 horiz6, horiz7, horiz8, horiz8,
5346 const20, const6, const3);
5347 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5348 res0 = __msa_aver_u_b(avg0, res0);
5350 dst += (2 * dst_stride);
5352 LD_UB2(dst, dst_stride, dst0, dst1);
5354 horiz7, horiz8, horiz8, horiz7,
5355 horiz7, horiz6, horiz5, horiz4,
5356 horiz8, horiz8, horiz7, horiz6,
5357 const20, const6, const3);
5358 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5359 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
5379 v16u8 inp0, inp1, inp2, inp3;
5380 v16u8 res0, res1, avg0, avg1;
5381 v16u8 horiz0, horiz1, horiz2, horiz3;
5382 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
5384 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5385 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5386 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5387 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
5388 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5389 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5390 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
5392 LD_UB2(src, src_stride, inp0, inp1);
5393 src += (2 * src_stride);
5395 const20, const6, const3);
5396 LD_UB2(src, src_stride, inp2, inp3);
5397 src += (2 * src_stride);
/* Presumably slides inp0 and inp1 by one byte each (helper macro defined
 * elsewhere) - TODO confirm operand order. */
5398 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
5400 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5401 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
5402 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5404 const20, const6, const3);
5405 LD_UB2(src, src_stride, inp0, inp1);
5406 src += (2 * src_stride);
5407 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5409 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5410 horiz2 = __msa_aver_u_b(inp2, res1);
5411 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5413 const20, const6, const3);
5415 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5417 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5418 horiz4 = __msa_aver_u_b(inp0, res0);
5419 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Load two vectors of existing data from dst for the blend below. */
5420 LD_UB2(dst, dst_stride, dst0, dst1);
5422 horiz1, horiz2, horiz3, horiz4,
5423 horiz1, horiz0, horiz0, horiz1,
5424 horiz2, horiz3, horiz4, horiz5,
5425 const20, const6, const3);
/* Pack the low halves of dst0/dst1 and take the rounded average with
 * res0. */
5426 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
5429 dst += (2 * dst_stride);
5431 LD_UB2(src, src_stride, inp2, inp3);
5432 src += (2 * src_stride);
5434 const20, const6, const3);
5436 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5438 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5439 horiz6 = __msa_aver_u_b(inp2, res1);
5440 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5441 LD_UB2(dst, dst_stride, dst0, dst1);
5443 horiz3, horiz4, horiz5, horiz6,
5444 horiz3, horiz2, horiz1, horiz0,
5445 horiz4, horiz5, horiz6, horiz7,
5446 const20, const6, const3);
5447 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5448 res1 = __msa_aver_u_b(avg1, res1);
5450 dst += (2 * dst_stride);
5454 const20, const6, const3);
/* horiz8: inp0 slid by one byte, averaged with res0. The statement that
 * sets inp0 for this step is not visible in this excerpt. */
5455 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5456 horiz8 = __msa_aver_u_b(inp0, res0);
5457 LD_UB2(dst, dst_stride, dst0, dst1);
5459 horiz5, horiz6, horiz7, horiz8,
5460 horiz5, horiz4, horiz3, horiz2,
5461 horiz6, horiz7, horiz8, horiz8,
5462 const20, const6, const3);
5463 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5464 res0 = __msa_aver_u_b(avg0, res0);
5466 dst += (2 * dst_stride);
5468 LD_UB2(dst, dst_stride, dst0, dst1);
5470 horiz7, horiz8, horiz8, horiz7,
5471 horiz7, horiz6, horiz5, horiz4,
5472 horiz8, horiz8, horiz7, horiz6,
5473 const20, const6, const3);
5474 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5475 res1 = __msa_aver_u_b(avg1, res1);
/* Working vectors of this routine. Its signature, the declaration of
 * dst0/dst1 and the first lines of the multi-line filter calls are not part
 * of this excerpt. */
5495 v16u8 inp0, inp1, inp2, inp3;
5496 v16u8 res0, res1, avg0, avg1;
5497 v16u8 horiz0, horiz1, horiz2, horiz3;
5498 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables (entries 0..8); presumably the shuffle masks of the
 * horizontal filter calls whose heads are not visible here - TODO confirm. */
5500 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5501 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5502 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5503 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Byte vectors with every lane set to 20, 6 and 3; trailing arguments of
 * every filter call below. */
5504 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5505 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5506 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* Load two vectors from src (presumably two rows, src_stride apart) and
 * step src past them. */
5508 LD_UB2(src, src_stride, inp0, inp1);
5509 src += (2 * src_stride);
5512 const20, const6, const3);
/* Pack the low 64-bit halves of inp0 and inp1 into one vector and take the
 * rounded unsigned byte average with res0. */
5513 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5514 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 carries the upper 64-bit element of horiz0 in both halves. */
5515 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5516 LD_UB2(src, src_stride, inp2, inp3);
5517 src += (2 * src_stride);
5519 const20, const6, const3);
5520 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5521 horiz2 = __msa_aver_u_b(inp2, res1);
5522 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Load two vectors of existing data from dst for the blend further down,
 * then the next two source vectors. */
5523 LD_UB2(dst, dst_stride, dst0, dst1);
5524 LD_UB2(src, src_stride, inp0, inp1);
5525 src += (2 * src_stride);
5527 const20, const6, const3);
5528 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5529 horiz4 = __msa_aver_u_b(inp0, res0);
5530 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5532 horiz1, horiz2, horiz3, horiz4,
5533 horiz1, horiz0, horiz0, horiz1,
5534 horiz2, horiz3, horiz4, horiz5,
5535 const20, const6, const3);
/* Two successive rounded averages: first with the packed low halves of
 * horiz1/horiz2, then with the packed low halves of dst0/dst1. */
5536 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5537 res0 = __msa_aver_u_b(avg0, res0);
5538 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5539 res0 = __msa_aver_u_b(avg0, res0);
/* Step dst forward by two dst_stride steps. */
5541 dst += (2 * dst_stride);
5543 LD_UB2(dst, dst_stride, dst0, dst1);
5544 LD_UB2(src, src_stride, inp2, inp3);
5545 src += (2 * src_stride);
5547 const20, const6, const3);
5548 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5549 horiz6 = __msa_aver_u_b(inp2, res1);
5550 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5552 horiz3, horiz4, horiz5, horiz6,
5553 horiz3, horiz2, horiz1, horiz0,
5554 horiz4, horiz5, horiz6, horiz7,
5555 const20, const6, const3);
5556 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5557 res1 = __msa_aver_u_b(avg1, res1);
5558 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5559 res1 = __msa_aver_u_b(avg1, res1);
5561 dst += (2 * dst_stride);
5565 const20, const6, const3);
/* The statement that sets inp0 for this step is not visible here. */
5566 horiz8 = __msa_aver_u_b(inp0, res0);
5568 horiz5, horiz6, horiz7, horiz8,
5569 horiz5, horiz4, horiz3, horiz2,
5570 horiz6, horiz7, horiz8, horiz8,
5571 const20, const6, const3);
5573 horiz7, horiz8, horiz8, horiz7,
5574 horiz7, horiz6, horiz5, horiz4,
5575 horiz8, horiz8, horiz7, horiz6,
5576 const20, const6, const3);
/* Unlike the earlier steps, both remaining filter calls are issued before
 * either result is blended. */
5577 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5578 res0 = __msa_aver_u_b(avg0, res0);
5579 LD_UB2(dst, dst_stride, dst0, dst1);
5580 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5581 res0 = __msa_aver_u_b(avg0, res0);
5583 dst += (2 * dst_stride);
5585 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5586 res1 = __msa_aver_u_b(avg1, res1);
5587 LD_UB2(dst, dst_stride, dst0, dst1);
5588 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5589 res1 = __msa_aver_u_b(avg1, res1);
/*
 * Fragment of a routine body. The signature, braces, the declaration of
 * dst0/dst1 and the heads of the macro calls are not visible in this extract
 * (the embedded source numbers skip, e.g. 5623 -> 5625).
 *
 * Visible data flow: source rows are loaded two at a time; horiz1/3/5/7 are
 * taken from doubleword element 1 of horiz0/2/4/6; res0/res1 are averaged
 * with a pair of horizN rows and then with two rows loaded from dst.
 *
 * NOTE(review): horiz0, horiz2, horiz4, horiz6, horiz8, res0 and res1 are
 * read but never visibly assigned; the "mask0, ..., const3);" tails match the
 * APPLY_HORIZ_QPEL_FILTER_8BYTE parameter order shown at the top of the file,
 * so the assignments are presumably on the missing call-head lines. No store
 * to dst is visible either -- confirm against the complete file.
 */
5609 v16u8 inp0, inp1, inp2, inp3;
5610 v16u8 res0, res1, avg0, avg1;
5611 v16u8 horiz0, horiz1, horiz2, horiz3;
5612 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables passed as the mask arguments of the calls below. */
5614 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5615 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5616 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5617 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Vectors built from the immediates 20, 6 and 3; always passed together as
 * the last three arguments of the calls below. */
5618 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5619 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5620 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First pair of source rows; src then advances by two rows. */
5622 LD_UB2(src, src_stride, inp0, inp1);
5623 src += (2 * src_stride);
/* Tail of a call whose head (presumably assigning horiz0) is not visible. */
5625 mask0, mask1, mask2, mask3,
5626 const20, const6, const3);
/* horiz1 is built from doubleword element 1 of horiz0. */
5627 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Second pair of rows: horiz2 (head missing) and horiz3. */
5628 LD_UB2(src, src_stride, inp2, inp3);
5629 src += (2 * src_stride);
5631 mask0, mask1, mask2, mask3,
5632 const20, const6, const3);
5633 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Two dst rows are fetched here and blended into res0 further down. */
5634 LD_UB2(dst, dst_stride, dst0, dst1);
/* Third pair of rows: horiz4 (head missing) and horiz5. */
5635 LD_UB2(src, src_stride, inp0, inp1);
5636 src += (2 * src_stride);
5638 mask0, mask1, mask2, mask3,
5639 const20, const6, const3);
5640 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* NOTE(review): argument list of a call whose head is not visible; it takes
 * horizN rows plus the 20/6/3 constants and presumably yields res0. */
5642 horiz1, horiz2, horiz3, horiz4,
5643 horiz1, horiz0, horiz0, horiz1,
5644 horiz2, horiz3, horiz4, horiz5,
5645 const20, const6, const3);
/* Average res0 with the horiz1/horiz2 pair, then with the two dst rows. */
5646 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5647 res0 = __msa_aver_u_b(avg0, res0);
5648 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5649 res0 = __msa_aver_u_b(avg0, res0);
/* dst advances two rows; the store of res0 is presumably on a missing line. */
5651 dst += (2 * dst_stride);
/* Next two dst rows and fourth pair of source rows (horiz6, horiz7). */
5653 LD_UB2(dst, dst_stride, dst0, dst1);
5654 LD_UB2(src, src_stride, inp2, inp3);
5655 src += (2 * src_stride);
5657 mask0, mask1, mask2, mask3,
5658 const20, const6, const3);
5659 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Headless argument list again; presumably yields res1. */
5661 horiz3, horiz4, horiz5, horiz6,
5662 horiz3, horiz2, horiz1, horiz0,
5663 horiz4, horiz5, horiz6, horiz7,
5664 const20, const6, const3);
/* Average res1 with the horiz3/horiz4 pair, then with the dst rows. */
5665 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5666 res1 = __msa_aver_u_b(avg1, res1);
5667 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5668 res1 = __msa_aver_u_b(avg1, res1);
5670 dst += (2 * dst_stride);
/* NOTE(review): source lines 5671-5673 are absent; the load feeding this
 * call and the assignment of horiz8 are not visible here. */
5674 mask0, mask1, mask2, mask3,
5675 const20, const6, const3);
/* Two headless argument lists follow (first arguments are on missing lines);
 * horiz8 and horiz7 are repeated where later rows would otherwise appear. */
5677 horiz6, horiz7, horiz8, horiz5, horiz4,
5678 horiz3, horiz2, horiz6, horiz7, horiz8,
5679 horiz8, const20, const6, const3);
5681 horiz8, horiz8, horiz7, horiz7, horiz6,
5682 horiz5, horiz4, horiz8, horiz8, horiz7,
5683 horiz6, const20, const6, const3);
/* Average res0 with the horiz5/horiz6 pair and with freshly loaded dst rows. */
5684 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5685 res0 = __msa_aver_u_b(avg0, res0);
5686 LD_UB2(dst, dst_stride, dst0, dst1);
5687 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5688 res0 = __msa_aver_u_b(avg0, res0);
5690 dst += (2 * dst_stride);
/* Average res1 with the horiz7/horiz8 pair and with the last two dst rows. */
5692 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5693 res1 = __msa_aver_u_b(avg1, res1);
5694 LD_UB2(dst, dst_stride, dst0, dst1);
5695 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5696 res1 = __msa_aver_u_b(avg1, res1);
/*
 * Fragment of a routine body. The signature, braces, the declaration of
 * dst0/dst1 and the heads of the macro calls are not visible in this extract
 * (the embedded source numbers skip, e.g. 5730 -> 5732).
 *
 * Visible data flow: source rows are loaded two at a time, passed through
 * SLDI_B2_UB with a final argument of 1, averaged with res0/res1 into
 * horiz0..horiz8, and res0/res1 are then averaged with a pair of horizN rows
 * and with two rows loaded from dst.
 *
 * NOTE(review): each orphaned "const20, const6, const3);" line looks like the
 * tail of a filter macro call whose head is missing (compare the
 * APPLY_HORIZ_QPEL_FILTER_8BYTE parameter list at the top of the file), and
 * no store to dst is visible -- confirm against the complete file.
 */
5716 v16u8 inp0, inp1, inp2, inp3;
5717 v16u8 res0, res1, avg0, avg1;
5718 v16u8 horiz0, horiz1, horiz2, horiz3;
5719 v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte-index tables; their consumers are on lines not visible here
 * (presumably the mask0..mask3 arguments of the filter macro). */
5721 v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5722 v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5723 v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5724 v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Vectors built from the immediates 20, 6 and 3; always passed together as
 * the last three arguments of the calls below. */
5725 v16u8 const20 = (v16u8) __msa_ldi_b(20);
5726 v16u8 const6 = (v16u8) __msa_ldi_b(6);
5727 v16u8 const3 = (v16u8) __msa_ldi_b(3);
/* First pair of source rows; src then advances by two rows. */
5729 LD_UB2(src, src_stride, inp0, inp1);
5730 src += (2 * src_stride);
/* Tail of a call whose head (and the assignment of res0) is not visible. */
5732 const20, const6, const3);
/* The next pair is loaded before the first pair is post-processed. */
5733 LD_UB2(src, src_stride, inp2, inp3);
5734 src += (2 * src_stride);
/* Slide inp0/inp1 by one byte in place (presumably so the average below uses
 * the neighbouring pixel -- confirm against the SLDI_B2_UB definition). */
5735 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
/* Combine doublewords of the two shifted rows and average with res0. */
5737 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5738 horiz0 = __msa_aver_u_b(inp0, res0);
/* horiz1 is built from doubleword element 1 of horiz0. */
5739 horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
/* Same sequence for the second pair (horiz2, horiz3). */
5741 const20, const6, const3);
5742 LD_UB2(src, src_stride, inp0, inp1);
5743 src += (2 * src_stride);
5744 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5746 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5747 horiz2 = __msa_aver_u_b(inp2, res1);
5748 horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
/* Third pair (horiz4, horiz5). */
5750 const20, const6, const3);
5751 SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5753 inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5754 horiz4 = __msa_aver_u_b(inp0, res0);
5755 horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* Fetch two dst rows and build the horiz1/horiz2 pair before the headless
 * call below, which presumably yields res0. */
5756 LD_UB2(dst, dst_stride, dst0, dst1);
5757 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5759 horiz2, horiz3, horiz4, horiz1, horiz0,
5760 horiz0, horiz1, horiz2, horiz3, horiz4,
5761 horiz5, const20, const6, const3);
/* Average res0 with the horizN pair, then with the two dst rows. */
5762 res0 = __msa_aver_u_b(avg0, res0);
5763 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5764 res0 = __msa_aver_u_b(avg0, res0);
/* dst advances two rows; the store of res0 is presumably on a missing line. */
5766 dst += (2 * dst_stride);
/* Fourth pair of source rows (horiz6, horiz7). */
5768 LD_UB2(src, src_stride, inp2, inp3);
5769 src += (2 * src_stride);
5771 const20, const6, const3);
5772 SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5774 inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5775 horiz6 = __msa_aver_u_b(inp2, res1);
5776 horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* Next two dst rows and the horiz3/horiz4 pair; the headless call
 * presumably yields res1. */
5777 LD_UB2(dst, dst_stride, dst0, dst1);
5778 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5780 horiz4, horiz5, horiz6, horiz3, horiz2,
5781 horiz1, horiz0, horiz4, horiz5, horiz6,
5782 horiz7, const20, const6, const3);
5783 res1 = __msa_aver_u_b(avg1, res1);
5784 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5785 res1 = __msa_aver_u_b(avg1, res1);
5787 dst += (2 * dst_stride);
/* NOTE(review): source lines 5788-5790 are absent; the load that refreshes
 * inp0 and the head of this call are not visible here. */
5791 const20, const6, const3);
/* Slide inp0 by one byte using itself as both operands, then average with
 * res0 to form horiz8. */
5792 inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5793 horiz8 = __msa_aver_u_b(inp0, res0);
/* horiz5/horiz6 pair; horiz8 is repeated in the headless argument list. */
5794 LD_UB2(dst, dst_stride, dst0, dst1);
5795 avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5797 horiz6, horiz7, horiz8, horiz5, horiz4,
5798 horiz3, horiz2, horiz6, horiz7, horiz8,
5799 horiz8, const20, const6, const3);
5800 res0 = __msa_aver_u_b(avg0, res0);
5801 avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5802 res0 = __msa_aver_u_b(avg0, res0);
5804 dst += (2 * dst_stride);
/* Last two dst rows and the horiz7/horiz8 pair. */
5806 LD_UB2(dst, dst_stride, dst0, dst1);
5807 avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5809 horiz8, horiz8, horiz7, horiz7, horiz6,
5810 horiz5, horiz4, horiz8, horiz8, horiz7,
5811 horiz6, const20, const6, const3);
5812 res1 = __msa_aver_u_b(avg1, res1);
5813 avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5814 res1 = __msa_aver_u_b(avg1, res1);
/* Runs its body four times: loop_cnt starts at 4 and is tested before each
 * post-decrement.
 * NOTE(review): the loop body and the enclosing function are not visible in
 * this extract. */
5824 for (loop_cnt = 4; loop_cnt--;) {
/*
 * Fragment of a routine body that moves sixteen vectors from src to dst.
 * NOTE(review): the signature and braces are not visible, the head of the
 * second load is missing (only its "src8 ... src15);" tail remains), and the
 * final ST_UB8 call is cut off after its first line.
 */
5840 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
5841 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
/* Load eight rows into src0..src7, then advance src by eight rows. */
5843 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5844 src += (8 * src_stride);
/* Tail of a call on a missing line, presumably loading the next eight rows. */
5846 src8, src9, src10, src11, src12, src13, src14, src15);
/* Store the first eight vectors, advance dst by eight rows, then begin
 * storing the remaining eight (rest of the call is not visible). */
5848 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5849 dst += (8 * dst_stride);
5850 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
/*
 * Fragment of a routine body that averages rows of src into dst, four rows
 * per iteration, writing 64 bits per row.
 * NOTE(review): the signature, the declarations of cnt and src0..src3, and
 * the closing braces are not visible in this extract.
 */
5859 uint64_t out0, out1, out2, out3;
5861 v16u8 dst0, dst1, dst2, dst3;
/* height / 4 iterations; any remainder rows (height % 4) are not processed
 * by this loop. */
5863 for (cnt = (height / 4); cnt--;) {
/* Load four src rows (advancing src) and the four matching dst rows. */
5864 LD_UB4(src, src_stride, src0, src1, src2, src3);
5865 src += (4 * src_stride);
5866 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Pairwise average of srcN with dstN, written back into dstN (presumably a
 * byte-wise unsigned average -- confirm against the AVER_UB4_UB macro). */
5868 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5869 dst0, dst1, dst2, dst3);
/* Extract doubleword element 0 of each averaged row for the 64-bit stores. */
5871 out0 = __msa_copy_u_d((v2i64) dst0, 0);
5872 out1 = __msa_copy_u_d((v2i64) dst1, 0);
5873 out2 = __msa_copy_u_d((v2i64) dst2, 0);
5874 out3 = __msa_copy_u_d((v2i64) dst3, 0);
/* Write the four 64-bit values to dst and advance dst by four rows. */
5875 SD4(out0, out1, out2, out3, dst, dst_stride);
5876 dst += (4 * dst_stride);
/*
 * Fragment of a routine body that averages rows of src into dst, eight rows
 * per iteration, storing whole vectors.
 * NOTE(review): the signature, the declaration of cnt and the closing braces
 * are not visible in this extract.
 */
5885 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
5886 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* height / 8 iterations; any remainder rows (height % 8) are not processed
 * by this loop. */
5888 for (cnt = (height / 8); cnt--;) {
/* Load eight src rows (advancing src) and the eight matching dst rows. */
5889 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5890 src += (8 * src_stride);
5891 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
/* Pairwise average of srcN with dstN, written back into dstN (presumably a
 * byte-wise unsigned average -- confirm against the AVER_UB4_UB macro). */
5893 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5894 dst0, dst1, dst2, dst3);
5895 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5896 dst4, dst5, dst6, dst7);
/* Store the eight averaged rows and advance dst by eight rows. */
5897 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5898 dst += (8 * dst_stride);