/* VP9 4-tap loop filter applied to the low (right) 8 byte lanes only.
 * Pixels p1..q1 are mapped to signed range (xor 0x80), the standard VP9
 * filter4 delta clamp(p1 - q1) masked by hev plus 3 * (q0 - p0) is
 * computed in 16-bit precision, saturated to 8 bits, shifted down by 3,
 * applied to p0/q0 (and, where hev is clear, to p1/q1), then mapped back
 * to unsigned range.  mask_in/hev_in are the per-pixel filter/high-edge
 * selection masks; hev_in is clobbered (inverted) by this macro. */
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
    v8i16 q0_sub_p0_r, filt_r, cnst3h;                                   \
                                                                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
    filt = filt & (v16i8) hev_in;                                        \
    q0_sub_p0 = q0_m - p0_m;                                             \
    filt_sign = __msa_clti_s_b(filt, 0);                                 \
                                                                         \
    cnst3h = __msa_ldi_h(3);                                             \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
    filt_r += q0_sub_p0_r;                                               \
    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
                                                                         \
    /* pack the 16-bit result back to 8-bit lanes */                     \
    filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r);                \
                                                                         \
    filt = filt & (v16i8) mask_in;                                       \
    cnst4b = __msa_ldi_b(4);                                             \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    cnst3b = __msa_ldi_b(3);                                             \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    /* outer taps only move where hev is clear */                        \
    filt = __msa_srari_b(filt1, 1);                                      \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}
/* VP9 4-tap loop filter applied to all 16 byte lanes: same computation as
 * VP9_LPF_FILTER4_8W but the 16-bit intermediate is evaluated for both the
 * right (low 8 lanes) and left (high 8 lanes) halves before being packed
 * back together.  hev_in is clobbered (inverted) by this macro. */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;              \
                                                                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
                                                                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q0_sub_p0 = q0_m - p0_m;                                             \
    filt_sign = __msa_clti_s_b(filt, 0);                                 \
                                                                         \
    /* right (low) half in 16-bit precision */                           \
    cnst3h = __msa_ldi_h(3);                                             \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
    filt_r += q0_sub_p0_r;                                               \
    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
                                                                         \
    /* left (high) half in 16-bit precision */                           \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);   \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                      \
    filt_l += q0_sub_p0_l;                                               \
    filt_l = __msa_sat_s_h(filt_l, 7);                                   \
                                                                         \
    /* combine left and right parts */                                   \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);                \
    filt = filt & (v16i8) mask_in;                                       \
                                                                         \
    cnst4b = __msa_ldi_b(4);                                             \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    cnst3b = __msa_ldi_b(3);                                             \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    /* outer taps only move where hev is clear */                        \
    filt = __msa_srari_b(filt1, 1);                                      \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}
/* Computes the per-pixel "flat" mask used to select the 8-tap filter:
 * flat is set where every |pX - p0| and |qX - q0| (X = 2, 3) is <= 1.
 * flat_out enters holding max(|p1-p0|, |q1-q0|) from LPF_MASK_HEV and
 * leaves as the final mask.  NOTE(review): this macro also reads `mask`
 * from the enclosing scope (the LPF_MASK_HEV result) — callers must have
 * a variable of that exact name in scope. */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
    v16u8 zero_in = { 0 };                                             \
                                                                       \
    tmp = __msa_ori_b(zero_in, 1);                                     \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
                                                                       \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
                                                                       \
    /* flat = (max difference <= 1) restricted to the filter mask */   \
    flat_out = (tmp < (v16u8) flat_out);                               \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    flat_out = flat_out & (mask);                                      \
}
/* Computes the "flat2" mask used to select the 16-wide filter:
 * flat2 is set where every |pX - p0| and |qX - q0| (X = 4..7) is <= 1,
 * restricted to positions where flat_in (from VP9_FLAT4) is already set. */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,   \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)           \
{                                                                    \
    v16u8 tmp, zero_in = { 0 };                                      \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;        \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;        \
                                                                     \
    tmp = __msa_ori_b(zero_in, 1);                                   \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                      \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                      \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                      \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                      \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                      \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                      \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                      \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                      \
                                                                     \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);           \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);             \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);               \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);           \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);               \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);           \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);               \
                                                                     \
    /* flat2 = (max difference <= 1) restricted to flat_in */        \
    flat2_out = (tmp < (v16u8) flat2_out);                           \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                       \
    flat2_out = flat2_out & flat_in;                                 \
}
/* VP9 8-tap smoothing filter: each output pixel is a (sum + 4) >> 3
 * rounded average over a sliding 8-pixel window of p3..q3, computed in
 * 16-bit precision (inputs are zero-extended v8u16 halves).  Outputs are
 * v8i16 and still need packing back to 8-bit by the caller. */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                    \
                    q0_in, q1_in, q2_in, q3_in,                    \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,      \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)      \
{                                                                  \
    v8u16 tmp0, tmp1, tmp2;                                        \
                                                                   \
    tmp2 = p2_in + p1_in + p0_in;                                  \
    tmp0 = p3_in << 1;                                             \
                                                                   \
    /* p2 = (2*p3 + p2 + p1 + p0 + q0 + p3 + p2 + 4) >> 3 */       \
    tmp0 = tmp0 + tmp2 + q0_in;                                    \
    tmp1 = tmp0 + p3_in + p2_in;                                   \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);         \
                                                                   \
    tmp1 = tmp0 + p1_in + q1_in;                                   \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);         \
                                                                   \
    tmp1 = q2_in + q1_in + q0_in;                                  \
    tmp2 = tmp2 + tmp1;                                            \
    tmp0 = tmp2 + (p0_in);                                         \
    tmp0 = tmp0 + (p3_in);                                         \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);         \
                                                                   \
    tmp0 = q2_in + q3_in;                                          \
    tmp0 = p0_in + tmp1 + tmp0;                                    \
    tmp1 = q3_in + q3_in;                                          \
    tmp1 = tmp1 + tmp0;                                            \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);         \
                                                                   \
    tmp0 = tmp2 + q3_in;                                           \
    tmp1 = tmp0 + q0_in;                                           \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);         \
                                                                   \
    tmp1 = tmp0 - p2_in;                                           \
    tmp0 = q1_in + q3_in;                                          \
    tmp1 = tmp0 + tmp1;                                            \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);         \
}
/* Computes the VP9 loop-filter selection masks:
 *  - hev_out:  set where |p1-p0| or |q1-q0| exceeds thresh (high edge
 *              variance — only the sharp 4-tap path is used there);
 *  - mask_out: set where 2*|p0-q0| + |p1-q1|/2 <= b_limit AND every
 *              neighbouring-pixel difference is <= limit (filter at all);
 *  - flat_out: left holding max(|p1-p0|, |q1-q0|) for VP9_FLAT4. */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute subtraction of pixel values */                     \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                   \
    /* calculation of hev */                                       \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = thresh_in < (v16u8) flat_out;                        \
                                                                   \
    /* calculation of mask: 2*|p0-q0| + |p1-q1|/2 vs b_limit */    \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                   \
    mask_out = b_limit_in < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                   \
    mask_out = limit_in < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}
267 uint64_t p1_d, p0_d, q0_d, q1_d;
269 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0, p1_out, p0_out, q0_out, q1_out;
272 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
274 thresh = (v16u8) __msa_fill_b(thresh_ptr);
275 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
276 limit = (v16u8) __msa_fill_b(limit_ptr);
278 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
283 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
284 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
285 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
286 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
287 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
296 v16u8
mask,
hev,
flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
297 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
300 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
302 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
303 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
304 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
306 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
307 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
308 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
310 limit0 = (v16u8) __msa_fill_b(limit_ptr);
311 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
312 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
314 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
318 ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
326 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
328 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
329 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
330 v8i16 p2_filter8, p1_filter8, p0_filter8;
331 v8i16 q0_filter8, q1_filter8, q2_filter8;
332 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
336 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
338 thresh = (v16u8) __msa_fill_b(thresh_ptr);
339 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
340 limit = (v16u8) __msa_fill_b(limit_ptr);
342 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
348 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
351 if (__msa_test_bz_v(flat)) {
352 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
353 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
354 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
355 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
356 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
358 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
359 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
361 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
362 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
365 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
366 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
368 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
371 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
372 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
373 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
374 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
375 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
376 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
378 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
379 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
380 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
381 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
382 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
383 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
387 SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
400 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
401 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
402 v16u8
flat,
mask,
hev, tmp, thresh, b_limit, limit;
403 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
404 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
405 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
406 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
407 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
408 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
412 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
414 thresh = (v16u8) __msa_fill_b(thresh_ptr);
415 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
416 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
418 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
419 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
420 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
422 limit = (v16u8) __msa_fill_b(limit_ptr);
423 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
424 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
427 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
434 if (__msa_test_bz_v(flat)) {
435 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
437 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
438 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
440 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
441 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
443 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
445 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
447 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
448 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
451 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
452 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
453 p0_filt8_r, q0_filt8_r);
454 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
455 q1_filt8_r, q2_filt8_r);
458 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
459 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
460 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
461 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
462 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
463 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
467 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
469 ST_UB2(q1_out, q2_out, src, pitch);
479 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
480 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
481 v16u8
flat,
mask,
hev, tmp, thresh, b_limit, limit;
482 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
483 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
484 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
488 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
490 thresh = (v16u8) __msa_fill_b(thresh_ptr);
491 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
492 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
494 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
495 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
496 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
498 limit = (v16u8) __msa_fill_b(limit_ptr);
499 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
500 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
503 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
512 if (__msa_test_bz_v(flat)) {
513 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
515 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
516 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
518 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
519 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
522 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
523 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
524 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
525 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
526 q1_filt8_r, q2_filt8_r);
529 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
530 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
531 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
532 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
533 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
534 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
538 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
540 ST_UB2(q1_out, q2_out, src, pitch);
550 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
551 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
552 v16u8
flat,
mask,
hev, tmp, thresh, b_limit, limit;
553 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
554 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
555 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
559 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
561 thresh = (v16u8) __msa_fill_b(thresh_ptr);
562 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
563 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
565 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
566 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
567 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
569 limit = (v16u8) __msa_fill_b(limit_ptr);
570 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
571 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
574 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
580 flat = (v16u8) __msa_insve_d((v2i64)
flat, 0, (v2i64) zero);
583 if (__msa_test_bz_v(flat)) {
584 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
586 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
588 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
590 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
591 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
594 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
595 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
596 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
597 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
598 q1_filt8_l, q2_filt8_l);
601 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
602 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
603 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
604 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
605 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
606 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
610 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
612 ST_UB2(q1_out, q2_out, src, pitch);
623 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
624 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
626 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
627 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
628 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
629 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
630 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
631 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
635 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
637 thresh = (v16u8) __msa_fill_b(thresh_ptr);
638 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
639 limit = (v16u8) __msa_fill_b(limit_ptr);
642 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
649 if (__msa_test_bz_v(flat)) {
650 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
654 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
655 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
657 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
658 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
660 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
662 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
664 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
665 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
668 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
669 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
670 p0_filt8_r, q0_filt8_r);
671 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
675 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
676 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
677 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
678 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
679 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
680 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
682 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
683 filter48 += (4 * 16);
684 ST_UB2(q1_out, q2_out, filter48, 16);
685 filter48 += (2 * 16);
686 ST_UB(flat, filter48);
694 v16u8
flat, flat2, filter8;
696 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
697 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
698 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
699 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
700 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
701 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
702 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
703 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
704 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
705 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
708 flat =
LD_UB(filter48 + 96);
710 LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
711 LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
712 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
715 if (__msa_test_bz_v(flat2)) {
716 LD_UB4(filter48, 16, p2, p1, p0, q0);
717 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
720 ST_UB4(p2, p1, p0, q0, src, pitch);
722 ST_UB2(q1, q2, src, pitch);
726 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
727 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
728 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
730 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
732 tmp0_r = p7_r_in << 3;
736 tmp1_r = p6_r_in + p5_r_in;
743 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
745 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
747 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
749 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q0);
751 tmp0_l = p7_l_in << 3;
755 tmp1_l = p6_l_in + p5_l_in;
762 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
764 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
765 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
770 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
771 tmp0_r = p5_r_in - p6_r_in;
775 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
777 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q1);
778 tmp0_l = p5_l_in - p6_l_in;
782 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
784 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
785 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
790 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
791 tmp0_r = p4_r_in - p5_r_in;
795 r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
797 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
798 tmp0_l = p4_l_in - p5_l_in;
802 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
804 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
805 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
810 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
811 tmp0_r = p3_r_in - p4_r_in;
815 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
817 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
818 tmp0_l = p3_l_in - p4_l_in;
822 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
824 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
825 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
830 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
831 filter8 =
LD_UB(filter48);
832 tmp0_r = p2_r_in - p3_r_in;
836 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
838 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
839 tmp0_l = p2_l_in - p3_l_in;
843 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
845 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
846 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
851 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
852 filter8 =
LD_UB(filter48 + 16);
853 tmp0_r = p1_r_in - p2_r_in;
857 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
859 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
860 tmp0_l = p1_l_in - p2_l_in;
864 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
866 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
867 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
872 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
873 filter8 =
LD_UB(filter48 + 32);
874 tmp0_r = p0_r_in - p1_r_in;
878 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
880 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
881 tmp0_l = p0_l_in - p1_l_in;
885 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
887 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
888 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
893 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
894 filter8 =
LD_UB(filter48 + 48);
895 tmp0_r = q7_r_in - p0_r_in;
899 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
901 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
902 tmp0_l = q7_l_in - p0_l_in;
906 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
908 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
909 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
914 filter8 =
LD_UB(filter48 + 64);
915 tmp0_r = q7_r_in - q0_r_in;
919 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
921 tmp0_l = q7_l_in - q0_l_in;
925 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
927 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
928 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
933 filter8 =
LD_UB(filter48 + 80);
934 tmp0_r = q7_r_in - q1_r_in;
938 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
940 tmp0_l = q7_l_in - q1_l_in;
944 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
946 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
947 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
952 tmp0_r = q7_r_in - q2_r_in;
956 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
958 tmp0_l = q7_l_in - q2_l_in;
962 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
964 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
965 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
970 tmp0_r = q7_r_in - q3_r_in;
974 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
976 tmp0_l = q7_l_in - q3_l_in;
980 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
982 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
983 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
988 tmp0_r = q7_r_in - q4_r_in;
992 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
994 tmp0_l = q7_l_in - q4_l_in;
998 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
1000 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1001 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
1006 tmp0_r = q7_r_in - q5_r_in;
1010 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1012 tmp0_l = q7_l_in - q5_l_in;
1016 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
1018 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1019 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
1033 b_limit_ptr, limit_ptr, thresh_ptr);
1035 if (0 == early_exit) {
1045 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
1046 uint64_t dword0, dword1;
1047 v16u8 flat2,
mask,
hev,
flat, thresh, b_limit, limit;
1048 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0, p7, p6, p5, p4, q4, q5, q6, q7;
1049 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1050 v16u8 p0_filter16, p1_filter16;
1051 v8i16 p2_filter8, p1_filter8, p0_filter8;
1052 v8i16 q0_filter8, q1_filter8, q2_filter8;
1053 v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
1054 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
1056 v8u16 tmp0, tmp1, tmp2;
1059 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1061 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1062 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1063 limit = (v16u8) __msa_fill_b(limit_ptr);
1065 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1067 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1071 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1074 if (__msa_test_bz_v(flat)) {
1075 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1076 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1077 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1078 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1079 SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1082 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1083 q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1085 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1086 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1087 q1_filter8, q2_filter8);
1090 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1091 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1093 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1097 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1098 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1099 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1100 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1101 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1102 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1105 LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1106 LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1108 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1111 if (__msa_test_bz_v(flat2)) {
1112 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1113 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1114 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1115 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1116 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1117 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1119 SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1120 SD(q1_d, src + pitch);
1121 SD(q2_d, src + 2 * pitch);
1124 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1125 zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1126 q4_r, q5_r, q6_r, q7_r);
1136 tmp1 = p6_r + p5_r + p4_r + p3_r;
1137 tmp1 += (p2_r + p1_r + p0_r);
1139 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1140 tmp0 = p5_r - p6_r + q1_r - p7_r;
1142 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1144 p0_filter16, p1_filter16);
1145 p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1146 p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1147 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1148 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155 tmp0 = p4_r - p5_r + q2_r - p7_r;
1156 tmp2 = p3_r - p4_r + q3_r - p7_r;
1158 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1160 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1162 p0_filter16, p1_filter16);
1163 p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1164 p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1165 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1166 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173 tmp0 = p2_r - p3_r + q4_r - p7_r;
1174 tmp2 = p1_r - p2_r + q5_r - p7_r;
1176 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1178 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1180 p0_filter16, p1_filter16);
1181 p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1182 p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1183 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1184 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191 tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1192 tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1194 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1196 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1198 p0_filter16, p1_filter16);
1199 p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1200 p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1201 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1202 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1209 tmp0 = q7_r - q0_r + q1_r - p6_r;
1210 tmp2 = q7_r - q1_r + q2_r - p5_r;
1212 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1214 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1216 p0_filter16, p1_filter16);
1217 p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1218 p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1219 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1220 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1227 tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1228 tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1230 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1232 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1234 p0_filter16, p1_filter16);
1235 p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1236 p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1237 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1238 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1245 tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1246 tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1248 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1250 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1252 p0_filter16, p1_filter16);
1253 p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1254 p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1255 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1256 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1270 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1271 v8i16 vec0, vec1, vec2, vec3;
1273 LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1275 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1276 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1277 limit = (v16u8) __msa_fill_b(limit_ptr);
1280 p3, p2, p1, p0, q0, q1, q2, q3);
1281 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1288 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1290 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1299 v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1300 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1301 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1302 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1303 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1305 LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1306 LD_UB8(src - 4 + (8 * pitch), pitch,
1307 row8, row9, row10, row11, row12, row13, row14, row15);
1310 row8, row9, row10, row11, row12, row13, row14, row15,
1311 p3, p2, p1, p0, q0, q1, q2, q3);
1313 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1314 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1315 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1317 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1318 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1319 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1321 limit0 = (v16u8) __msa_fill_b(limit_ptr);
1322 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1323 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1325 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1345 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1346 v16u8 p1_out, p0_out, q0_out, q1_out;
1348 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1349 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1350 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1352 v8i16 vec0, vec1, vec2, vec3, vec4;
1355 LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1358 p3, p2, p1, p0, q0, q1, q2, q3);
1360 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1361 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1362 limit = (v16u8) __msa_fill_b(limit_ptr);
1365 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1368 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1373 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1376 if (__msa_test_bz_v(flat)) {
1378 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1382 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1384 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1386 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1387 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1389 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1390 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1392 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1393 p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1394 p0_filt8_r, q0_filt8_r);
1395 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1399 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1400 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1401 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1402 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1403 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1404 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1409 vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1412 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1415 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1426 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1427 v16u8 p1_out, p0_out, q0_out, q1_out;
1429 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1430 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1431 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1432 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1433 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1434 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1435 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1437 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1441 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1442 temp_src += (8 * pitch);
1443 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1447 q3, q2, q1, q0, row12, row13, row14, row15,
1448 p3, p2, p1, p0, q0, q1, q2, q3);
1450 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1451 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1452 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1454 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1455 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1456 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1458 limit = (v16u8) __msa_fill_b(limit_ptr);
1459 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1460 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1463 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1466 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1472 if (__msa_test_bz_v(flat)) {
1473 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1475 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1483 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1484 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1486 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1487 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1489 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1491 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1495 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1496 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1499 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1500 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1501 p0_filt8_r, q0_filt8_r);
1502 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1506 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1507 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1508 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1509 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1510 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1511 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1520 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1523 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1526 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1529 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1540 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1541 v16u8 p1_out, p0_out, q0_out, q1_out;
1543 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1544 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1545 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1546 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1548 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1552 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1553 temp_src += (8 * pitch);
1554 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1558 q3, q2, q1, q0, row12, row13, row14, row15,
1559 p3, p2, p1, p0, q0, q1, q2, q3);
1561 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1562 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1563 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1565 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1566 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1567 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1569 limit = (v16u8) __msa_fill_b(limit_ptr);
1570 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1571 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1574 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1577 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1582 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1585 if (__msa_test_bz_v(flat)) {
1586 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1588 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1596 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1597 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1599 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1600 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1603 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1604 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1605 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1606 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1607 q1_filt8_r, q2_filt8_r);
1610 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1611 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1612 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1613 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1614 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1615 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1624 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1627 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1630 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1633 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1644 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1645 v16u8 p1_out, p0_out, q0_out, q1_out;
1647 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1648 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1649 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1650 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1652 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1656 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1657 temp_src += (8 * pitch);
1658 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1662 q3, q2, q1, q0, row12, row13, row14, row15,
1663 p3, p2, p1, p0, q0, q1, q2, q3);
1665 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1666 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1667 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1669 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1670 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1671 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1673 limit = (v16u8) __msa_fill_b(limit_ptr);
1674 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1675 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1678 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1681 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1686 flat = (v16u8) __msa_insve_d((v2i64)
flat, 0, (v2i64) zero);
1689 if (__msa_test_bz_v(flat)) {
1690 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1692 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1700 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1702 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1705 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1706 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1709 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1710 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1711 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1712 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1713 q1_filt8_l, q2_filt8_l);
1716 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1717 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1718 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1719 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1720 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1721 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1730 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1733 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1736 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1739 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1747 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1748 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1749 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1752 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1755 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1757 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1758 tmp0, tmp1, tmp2, tmp3);
1759 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1760 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1765 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766 output += (8 * out_pitch);
1767 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1773 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1774 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1776 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1777 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1778 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1779 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1780 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1786 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1787 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1788 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1790 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1792 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1793 input += (8 * in_pitch);
1795 row8, row9, row10, row11, row12, row13, row14, row15);
1798 row8, row9, row10, row11, row12, row13, row14, row15,
1799 p7, p6, p5, p4, p3, p2, p1, p0);
1803 q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1804 q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1805 q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1806 q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1807 q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1808 q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1809 q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1810 q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1813 tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1814 tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1817 tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1818 tmp7 = (v8i16) __msa_ilvod_b((v16i8)
q0, (v16i8) q1);
1821 q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1822 q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1824 tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1825 tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1826 q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1827 q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1830 q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1831 q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1833 tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1834 tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1835 q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1836 q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1838 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1839 output += (8 * out_pitch);
1840 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1849 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1850 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1852 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1853 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1854 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1856 v8i16 vec0, vec1, vec2, vec3;
1859 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1861 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1862 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1863 limit = (v16u8) __msa_fill_b(limit_ptr);
1866 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1869 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1874 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1877 if (__msa_test_bz_v(flat)) {
1878 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1880 ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
1883 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1884 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1886 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1887 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1890 p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1891 p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1892 p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1893 q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1894 q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1895 q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1898 p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1899 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1900 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1901 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1902 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1903 q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1905 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1906 filter48 += (4 * 16);
1907 ST_UB2(q1_out, q2_out, filter48, 16);
1908 filter48 += (2 * 16);
1909 ST_UB(flat, filter48);
1919 v16u8 filter8,
flat, flat2;
1920 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1921 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1922 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1923 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1924 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1925 v8u16 tmp0_r, tmp1_r;
1928 flat =
LD_UB(filter48 + 6 * 16);
1930 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1931 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1933 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1936 if (__msa_test_bz_v(flat2)) {
1937 v8i16 vec0, vec1, vec2, vec3, vec4;
1939 LD_UB4(filter48, 16, p2, p1, p0, q0);
1940 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1944 vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1947 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1948 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1949 src_org += (4 * pitch);
1950 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1951 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1957 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1958 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1959 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1960 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
1962 tmp0_r = p7_r_in << 3;
1966 tmp1_r = p6_r_in + p5_r_in;
1974 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1975 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1976 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1981 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
1982 tmp0_r = p5_r_in - p6_r_in;
1986 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1987 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1988 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1993 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1994 tmp0_r = p4_r_in - p5_r_in;
1998 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1999 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2000 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2005 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2006 tmp0_r = p3_r_in - p4_r_in;
2010 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2011 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2012 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2017 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2018 filter8 =
LD_UB(filter48);
2019 tmp0_r = p2_r_in - p3_r_in;
2023 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2024 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2025 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2030 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2031 filter8 =
LD_UB(filter48 + 16);
2032 tmp0_r = p1_r_in - p2_r_in;
2036 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2037 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2038 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2043 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2044 filter8 =
LD_UB(filter48 + 32);
2045 tmp0_r = p0_r_in - p1_r_in;
2049 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2050 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2051 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2056 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2057 filter8 =
LD_UB(filter48 + 48);
2058 tmp0_r = q7_r_in - p0_r_in;
2062 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2063 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2064 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2069 filter8 =
LD_UB(filter48 + 64);
2070 tmp0_r = q7_r_in - q0_r_in;
2074 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2075 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2076 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2081 filter8 =
LD_UB(filter48 + 80);
2082 tmp0_r = q7_r_in - q1_r_in;
2086 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2087 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2088 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2093 tmp0_r = q7_r_in - q2_r_in;
2097 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2098 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2099 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2104 tmp0_r = q7_r_in - q3_r_in;
2108 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2109 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2110 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2115 tmp0_r = q7_r_in - q4_r_in;
2119 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2120 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2121 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2126 tmp0_r = q7_r_in - q5_r_in;
2130 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2131 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2132 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2146 uint8_t *filter48 = &transposed_input[16 * 16];
2151 &filter48[0], src, pitch,
2152 b_limit_ptr, limit_ptr, thresh_ptr);
2154 if (0 == early_exit) {
2158 if (0 == early_exit) {
2170 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
2171 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2173 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2174 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2175 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2176 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2177 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2178 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2180 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2183 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2185 thresh = (v16u8) __msa_fill_b(thresh_ptr);
2186 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2187 limit = (v16u8) __msa_fill_b(limit_ptr);
2190 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2193 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2199 if (__msa_test_bz_v(flat)) {
2200 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2202 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2206 ST4x8_UB(vec2, vec3, src_org, pitch);
2207 src_org += 8 * pitch;
2208 ST4x8_UB(vec4, vec5, src_org, pitch);
2212 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2213 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2215 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2216 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2217 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2219 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2221 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2222 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2225 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2226 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2227 p0_filt8_r, q0_filt8_r);
2228 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2232 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2233 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2234 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2235 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2236 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2237 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2239 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2240 filter48 += (4 * 16);
2241 ST_UB2(q1_out, q2_out, filter48, 16);
2242 filter48 += (2 * 16);
2243 ST_UB(flat, filter48);
2252 v16u8
flat, flat2, filter8;
2254 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
2255 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2256 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2257 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2258 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2259 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2260 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2261 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2262 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2263 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2266 flat =
LD_UB(filter48 + 6 * 16);
2268 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2269 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2271 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2274 if (__msa_test_bz_v(flat2)) {
2275 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2277 LD_UB4(filter48, 16, p2, p1, p0, q0);
2278 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2287 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
2288 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
2289 src_org += (4 * pitch);
2290 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
2291 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
2292 src_org += (4 * pitch);
2293 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
2294 ST2x4_UB(vec5, 0, (src_org + 4), pitch);
2295 src_org += (4 * pitch);
2296 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
2297 ST2x4_UB(vec5, 4, (src_org + 4), pitch);
2303 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2304 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2305 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2306 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
2308 tmp0_r = p7_r_in << 3;
2312 tmp1_r = p6_r_in + p5_r_in;
2319 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2321 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2323 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2325 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q0);
2327 tmp0_l = p7_l_in << 3;
2331 tmp1_l = p6_l_in + p5_l_in;
2338 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2340 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2341 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2346 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
2347 tmp0_r = p5_r_in - p6_r_in;
2351 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q1);
2353 tmp0_l = p5_l_in - p6_l_in;
2357 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2358 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2364 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2365 tmp0_r = p4_r_in - p5_r_in;
2369 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2370 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2371 tmp0_l = p4_l_in - p5_l_in;
2375 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2376 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2377 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2382 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2383 tmp0_r = p3_r_in - p4_r_in;
2387 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2388 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2389 tmp0_l = p3_l_in - p4_l_in;
2393 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2394 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2395 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2400 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2401 filter8 =
LD_UB(filter48);
2402 tmp0_r = p2_r_in - p3_r_in;
2406 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2407 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2408 tmp0_l = p2_l_in - p3_l_in;
2412 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415 ST_UB(filter8, src);
2419 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2420 filter8 =
LD_UB(filter48 + 16);
2421 tmp0_r = p1_r_in - p2_r_in;
2425 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2426 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2427 tmp0_l = p1_l_in - p2_l_in;
2431 l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2432 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2433 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2434 ST_UB(filter8, src);
2438 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2439 filter8 =
LD_UB(filter48 + 32);
2440 tmp0_r = p0_r_in - p1_r_in;
2444 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2445 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2446 tmp0_l = p0_l_in - p1_l_in;
2450 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2451 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2452 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2453 ST_UB(filter8, src);
2457 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2458 filter8 =
LD_UB(filter48 + 48);
2459 tmp0_r = q7_r_in - p0_r_in;
2463 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2464 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2465 tmp0_l = q7_l_in - p0_l_in;
2469 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2470 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2471 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2472 ST_UB(filter8, src);
2476 filter8 =
LD_UB(filter48 + 64);
2477 tmp0_r = q7_r_in - q0_r_in;
2481 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2482 tmp0_l = q7_l_in - q0_l_in;
2486 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2487 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2488 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2489 ST_UB(filter8, src);
2493 filter8 =
LD_UB(filter48 + 80);
2494 tmp0_r = q7_r_in - q1_r_in;
2498 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2499 tmp0_l = q7_l_in - q1_l_in;
2503 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2504 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2505 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2506 ST_UB(filter8, src);
2510 tmp0_r = q7_r_in - q2_r_in;
2514 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2515 tmp0_l = q7_l_in - q2_l_in;
2519 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2520 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2521 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2526 tmp0_r = q7_r_in - q3_r_in;
2530 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2531 tmp0_l = q7_l_in - q3_l_in;
2535 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2536 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2537 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2542 tmp0_r = q7_r_in - q4_r_in;
2546 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2547 tmp0_l = q7_l_in - q4_l_in;
2551 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2552 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2553 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2558 tmp0_r = q7_r_in - q5_r_in;
2562 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2563 tmp0_l = q7_l_in - q5_l_in;
2567 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2568 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2569 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2583 uint8_t *filter48 = &transposed_input[16 * 16];
2588 &filter48[0], src, pitch,
2589 b_limit_ptr, limit_ptr, thresh_ptr);
2591 if (0 == early_exit) {
2595 if (0 == early_exit) {
void ff_loop_filter_h_8_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
void ff_loop_filter_v_16_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,q0_in, q1_in, q2_in, q3_in,p2_filt8_out, p1_filt8_out, p0_filt8_out,q0_filt8_out, q1_filt8_out, q2_filt8_out)
void ff_loop_filter_v_8_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_h_48_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static const uint8_t q1[256]
void ff_loop_filter_v_84_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_4_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9, in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7)
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)
#define SLDI_B4_0_UB(...)
static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, uint8_t *filter48)
static const uint16_t mask[17]
void ff_loop_filter_h_16_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, int32_t pitch_org, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static const uint8_t q0[256]
void ff_loop_filter_v_16_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,q5_in, q6_in, q7_in, flat_in, flat2_out)
void ff_loop_filter_h_16_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_88_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_h_88_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define ST2x4_UB(in, stidx, pdst, stride)
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,q0_in, q1_in, q2_in, q3_in,limit_in, b_limit_in, thresh_in,hev_out, mask_out, flat_out)
static void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48)
static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, uint8_t *filter48)
void ff_loop_filter_h_4_8_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define TRANSPOSE8x8_UB_UB(...)
static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
void ff_loop_filter_v_44_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define SD4(in0, in1, in2, in3, pdst, stride)
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define ST4x8_UB(in0, in1, pdst, stride)
void ff_loop_filter_h_84_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define ALLOC_ALIGNED(align)
void ff_loop_filter_h_44_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_48_16_msa(uint8_t *src, int32_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,p1_out, p0_out, q0_out, q1_out)
#define ST8x1_UB(in, pdst)
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,p1_out, p0_out, q0_out, q1_out)