#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
                 _in3, _in4, _in5, _in6, _in7) \
{ \
    _in0 = __lsx_vld(_src, 0); \
    _in1 = __lsx_vldx(_src, _stride); \
    _in2 = __lsx_vldx(_src, _stride2); \
    _in3 = __lsx_vldx(_src, _stride3); \
    _src += _stride4; \
    _in4 = __lsx_vld(_src, 0); \
    _in5 = __lsx_vldx(_src, _stride); \
    _in6 = __lsx_vldx(_src, _stride2); \
    _in7 = __lsx_vldx(_src, _stride3); \
}
#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, \
                 _dst, _stride, _stride2, _stride3, _stride4) \
{ \
    __lsx_vst(_dst0, _dst, 0); \
    __lsx_vstx(_dst1, _dst, _stride); \
    __lsx_vstx(_dst2, _dst, _stride2); \
    __lsx_vstx(_dst3, _dst, _stride3); \
    _dst += _stride4; \
    __lsx_vst(_dst4, _dst, 0); \
    __lsx_vstx(_dst5, _dst, _stride); \
    __lsx_vstx(_dst6, _dst, _stride2); \
    __lsx_vstx(_dst7, _dst, _stride3); \
}
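
/* 4-tap VP9 loop filter: works on p1/p0/q0/q1 converted to signed-offset
 * form (the 0x80 XORs), builds the clamped filter value, applies it under
 * mask_src, and restricts the p1/q1 update to pixels that are not flagged
 * as high edge variance (hev_src). */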
#define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
                           p1_dst, p0_dst, q0_dst, q1_dst) \
{ \
    __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2; \
    const __m128i cnst3b = __lsx_vldi(3); \
    const __m128i cnst4b = __lsx_vldi(4); \
    \
    p1_tmp = __lsx_vxori_b(p1_src, 0x80); \
    p0_tmp = __lsx_vxori_b(p0_src, 0x80); \
    q0_tmp = __lsx_vxori_b(q0_src, 0x80); \
    q1_tmp = __lsx_vxori_b(q1_src, 0x80); \
    \
    filt = __lsx_vssub_b(p1_tmp, q1_tmp); \
    filt = filt & hev_src; \
    \
    q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = __lsx_vsadd_b(filt, q0_sub_p0); \
    filt = filt & mask_src; \
    \
    filt1 = __lsx_vsadd_b(filt, cnst4b); \
    filt1 = __lsx_vsrai_b(filt1, 3); \
    \
    filt2 = __lsx_vsadd_b(filt, cnst3b); \
    filt2 = __lsx_vsrai_b(filt2, 3); \
    \
    q0_tmp = __lsx_vssub_b(q0_tmp, filt1); \
    q0_dst = __lsx_vxori_b(q0_tmp, 0x80); \
    p0_tmp = __lsx_vsadd_b(p0_tmp, filt2); \
    p0_dst = __lsx_vxori_b(p0_tmp, 0x80); \
    \
    filt = __lsx_vsrari_b(filt1, 1); \
    hev_src = __lsx_vxori_b(hev_src, 0xff); \
    filt = filt & hev_src; \
    \
    q1_tmp = __lsx_vssub_b(q1_tmp, filt); \
    q1_dst = __lsx_vxori_b(q1_tmp, 0x80); \
    p1_tmp = __lsx_vsadd_b(p1_tmp, filt); \
    p1_dst = __lsx_vxori_b(p1_tmp, 0x80); \
}
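
/* "flat" mask for the filter8 path: flat_dst enters holding
 * max(|p1 - p0|, |q1 - q0|) (set up by LPF_MASK_HEV), is widened with the
 * p2/p3/q2/q3 differences against p0/q0, and ends up 0xff where all of
 * them are <= 1.  Note it also ANDs with the caller's `mask` variable. */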
#define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst) \
{ \
    __m128i f_tmp = __lsx_vldi(1); \
    __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
    \
    p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src); \
    q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src); \
    p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src); \
    q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src); \
    \
    p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0); \
    flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst); \
    p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0); \
    flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst); \
    \
    flat_dst = __lsx_vslt_bu(f_tmp, flat_dst); \
    flat_dst = __lsx_vxori_b(flat_dst, 0xff); \
    flat_dst = flat_dst & mask; \
}
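
/* "flat2" mask for the 16-tap wide filter: the same <= 1 test as VP9_FLAT4
 * but over p4..p7 / q4..q7, ANDed with the flat mask passed in flat_src. */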
#define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
                  q5_src, q6_src, q7_src, flat_src, flat2_dst) \
{ \
    __m128i f_tmp = __lsx_vldi(1); \
    __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
    __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
    \
    p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src); \
    q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src); \
    p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src); \
    q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src); \
    p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src); \
    q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src); \
    p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src); \
    q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src); \
    \
    p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst); \
    p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst); \
    p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0); \
    flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst); \
    \
    flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst); \
    flat2_dst = __lsx_vxori_b(flat2_dst, 0xff); \
    flat2_dst = flat2_dst & flat_src; \
}
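
/* 8-tap filter on 16-bit lanes: each output is the rounding average
 * ((sum + 4) >> 3, via __lsx_vsrari_h) of eight taps taken from the
 * p3..q3 window, matching the VP9 filter8 tap sets. */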
#define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src, \
                    q0_src, q1_src, q2_src, q3_src, \
                    p2_filt8_dst, p1_filt8_dst, p0_filt8_dst, \
                    q0_filt8_dst, q1_filt8_dst, q2_filt8_dst) \
{ \
    __m128i tmp0, tmp1, tmp2; \
    \
    tmp2 = __lsx_vadd_h(p2_src, p1_src); \
    tmp2 = __lsx_vadd_h(tmp2, p0_src); \
    tmp0 = __lsx_vslli_h(p3_src, 1); \
    \
    tmp0 = __lsx_vadd_h(tmp0, tmp2); \
    tmp0 = __lsx_vadd_h(tmp0, q0_src); \
    tmp1 = __lsx_vadd_h(tmp0, p3_src); \
    tmp1 = __lsx_vadd_h(tmp1, p2_src); \
    p2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vadd_h(tmp0, p1_src); \
    tmp1 = __lsx_vadd_h(tmp1, q1_src); \
    p1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vadd_h(q2_src, q1_src); \
    tmp1 = __lsx_vadd_h(tmp1, q0_src); \
    tmp2 = __lsx_vadd_h(tmp2, tmp1); \
    tmp0 = __lsx_vadd_h(tmp2, p0_src); \
    tmp0 = __lsx_vadd_h(tmp0, p3_src); \
    p0_filt8_dst = __lsx_vsrari_h(tmp0, 3); \
    \
    tmp0 = __lsx_vadd_h(q2_src, q3_src); \
    tmp0 = __lsx_vadd_h(tmp0, p0_src); \
    tmp0 = __lsx_vadd_h(tmp0, tmp1); \
    tmp1 = __lsx_vadd_h(q3_src, q3_src); \
    tmp1 = __lsx_vadd_h(tmp1, tmp0); \
    q2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp0 = __lsx_vadd_h(tmp2, q3_src); \
    tmp1 = __lsx_vadd_h(tmp0, q0_src); \
    q0_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
    \
    tmp1 = __lsx_vsub_h(tmp0, p2_src); \
    tmp0 = __lsx_vadd_h(q1_src, q3_src); \
    tmp1 = __lsx_vadd_h(tmp0, tmp1); \
    q1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
}
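
/* Filter decision masks: hev_dst flags pixels whose first neighbour
 * difference exceeds thresh; mask_dst ends up 0xff where every neighbour
 * difference is within limit and 2*|p0-q0| + |p1-q1|/2 is within b_limit;
 * flat_dst is left holding max(|p1-p0|, |q1-q0|) for VP9_FLAT4. */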
#define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, \
                     q2_src, q3_src, limit_src, b_limit_src, thresh_src, \
                     hev_dst, mask_dst, flat_dst) \
{ \
    __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
    __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
    \
    p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src); \
    p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src); \
    p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src); \
    q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src); \
    q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src); \
    q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src); \
    p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src); \
    p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src); \
    \
    flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp); \
    hev_dst = __lsx_vslt_bu(thresh_src, flat_dst); \
    \
    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp); \
    p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1); \
    p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp); \
    \
    mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp); \
    mask_dst = __lsx_vmax_bu(flat_dst, mask_dst); \
    p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp); \
    mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst); \
    q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp); \
    mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst); \
    \
    mask_dst = __lsx_vslt_bu(limit_src, mask_dst); \
    mask_dst = __lsx_vxori_b(mask_dst, 0xff); \
}
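
/*
 * Illustrative call sequence for the helpers above (a sketch only; the
 * real callers below additionally handle the flat/flat2 wide-filter paths):
 *
 *     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3,
 *                  limit, b_limit, thresh, hev, mask, flat);
 *     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev,
 *                        p1_out, p0_out, q0_out, q1_out);
 */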
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
    __lsx_vstelm_d(q0_out, dst, 0, 0);
    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
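
/* 16-pixel-wide (dual) variant: the two 8-pixel halves carry separate
 * limit/thresh values packed into the low and next byte of the *_ptr
 * arguments; each byte is replicated and the two vectors are interleaved
 * into one with __lsx_vilvl_d so lanes 0-7 and 8-15 get their own value. */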
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i limit0, thresh1, b_limit1, limit1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;

    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
    thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
    thresh0 = __lsx_vilvl_d(thresh1, thresh0);

    b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
    b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
    b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

    limit0 = __lsx_vreplgr2vr_b(limit_ptr);
    limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
    limit0 = __lsx_vilvl_d(limit1, limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);

    __lsx_vst(p1, dst - stride2, 0);
    __lsx_vst(q0, dst, 0);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p2_filter8, p1_filter8, p0_filter8;
    __m128i q0_filter8, q1_filter8, q2_filter8;
    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
    __m128i zero = __lsx_vldi(0);

    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    if (__lsx_bz_v(flat)) {
        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
        __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
        __lsx_vstelm_d(q0_out, dst, 0, 0);
        __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
                  p3_l, p2_l, p1_l, p0_l);
                  q0_l, q1_l, q2_l, q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
                  zero, p0_filter8, zero, q0_filter8, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8);
                  q1_filter8, q2_filter8);

        p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);

        __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
        __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
        __lsx_vstelm_d(q0_out, dst, 0, 0);
        __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
        __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
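
/* The following horizontal variants run the 8-tap (filter8) path on both
 * 8-pixel halves of a 16-pixel edge or on only one of them, then use the
 * flat mask to select per byte between the filter4 and filter8 results. */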
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i zero = __lsx_vldi(0);
    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
    thresh = __lsx_vilvl_d(tmp, thresh);

    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
    b_limit = __lsx_vilvl_d(tmp, b_limit);

    limit = __lsx_vreplgr2vr_b(limit_ptr);
    tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
    limit = __lsx_vilvl_d(tmp, limit);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    if (__lsx_bz_v(flat)) {
        __lsx_vst(p1_out, dst - stride2, 0);
        __lsx_vst(q0_out, dst, 0);
427 p3_l, p2_l, p1_l, p0_l);
429 q0_l, q1_l, q2_l, q3_l);
430 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
431 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
434 p3_h, p2_h, p1_h, p0_h);
436 q0_h, q1_h, q2_h, q3_h);
437 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
438 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
441 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
442 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
443 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
444 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
445 q2_filt8_l, q1_filt8_l, q2_filt8_l);
        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

        __lsx_vstx(p2_out, dst, -stride3);
        __lsx_vstx(p1_out, dst, -stride2);
        __lsx_vst(q0_out, dst, 0);
        __lsx_vstx(q2_out, dst, stride2);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i zero = __lsx_vldi(0);
484 q0 = __lsx_vld(
dst, 0);
486 q3 = __lsx_vldx(
dst, stride3);
488 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
489 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
490 thresh = __lsx_vilvl_d(
tmp, thresh);
492 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
493 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
494 b_limit = __lsx_vilvl_d(
tmp, b_limit);
496 limit = __lsx_vreplgr2vr_b(limit_ptr);
497 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
501 LPF_MASK_HEV(p3, p2, p1, p0,
q0,
q1, q2, q3,
limit, b_limit, thresh,
510 if (__lsx_bz_v(
flat)) {
511 __lsx_vstx(p1_out,
dst, -stride2);
513 __lsx_vst(q0_out,
dst, 0);
517 p3_l, p2_l, p1_l, p0_l);
519 q0_l, q1_l, q2_l, q3_l);
520 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
521 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
524 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
525 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
526 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
527 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
528 q2_filt8_l, q1_filt8_l, q2_filt8_l);
531 p2_out = __lsx_vbitsel_v(p2, p2_filt8_l,
flat);
532 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l,
flat);
533 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l,
flat);
534 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l,
flat);
535 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l,
flat);
536 q2_out = __lsx_vbitsel_v(q2, q2_filt8_l,
flat);
538 __lsx_vstx(p2_out,
dst, -stride3);
539 __lsx_vstx(p1_out,
dst, -stride2);
541 __lsx_vst(q0_out,
dst, 0);
543 __lsx_vstx(q2_out,
dst, stride2);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i zero = { 0 };
566 q0 = __lsx_vld(
dst, 0);
568 q3 = __lsx_vldx(
dst, stride3);
570 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
571 tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
572 thresh = __lsx_vilvl_d(
tmp, thresh);
574 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
575 tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
576 b_limit = __lsx_vilvl_d(
tmp, b_limit);
578 limit = __lsx_vreplgr2vr_b(limit_ptr);
579 tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
583 LPF_MASK_HEV(p3, p2, p1, p0,
q0,
q1, q2, q3,
limit, b_limit, thresh,
592 if (__lsx_bz_v(
flat)) {
593 __lsx_vstx(p1_out,
dst, -stride2);
595 __lsx_vst(q0_out,
dst, 0);
599 p3_h, p2_h, p1_h, p0_h);
601 q0_h, q1_h, q2_h, q3_h);
602 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
603 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
606 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
607 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
608 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
609 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
610 q2_filt8_h, q1_filt8_h, q2_filt8_h);
613 p2_out = __lsx_vbitsel_v(p2, p2_filt8_h,
flat);
614 p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h,
flat);
615 p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h,
flat);
616 q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h,
flat);
617 q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h,
flat);
618 q2_out = __lsx_vbitsel_v(q2, q2_filt8_h,
flat);
620 __lsx_vstx(p2_out,
dst, -stride3);
621 __lsx_vstx(p1_out,
dst, -stride2);
623 __lsx_vst(q0_out,
dst, 0);
625 __lsx_vstx(q2_out,
dst, stride2);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i zero = __lsx_vldi(0);
    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    if (__lsx_bz_v(flat)) {
        __lsx_vstx(p1_out, dst, -stride2);
        __lsx_vst(q0_out, dst, 0);
676 p3_l, p2_l, p1_l, p0_l);
678 q0_l, q1_l, q2_l, q3_l);
679 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
680 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
683 p3_h, p2_h, p1_h, p0_h);
685 q0_h, q1_h, q2_h, q3_h);
686 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
687 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
690 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
691 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
692 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
693 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
694 q2_filt8_l, q1_filt8_l, q2_filt8_l);
        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

        __lsx_vst(p2_out, filter48, 0);
        __lsx_vst(p1_out, filter48, 16);
        __lsx_vst(p0_out, filter48, 32);
        __lsx_vst(q0_out, filter48, 48);
        __lsx_vst(q1_out, filter48, 64);
        __lsx_vst(q2_out, filter48, 80);
        __lsx_vst(flat, filter48, 96);
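
/* First pass of the 16-tap horizontal filter: the filter4/filter8 results
 * and the flat mask are parked in the caller-provided filter48 buffer
 * (one 16-byte row each, offsets 0..96) so that the wide-filter second
 * pass below can pick them up. */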
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - stride4;
    uint8_t *dst_tmp1 = dst + stride4;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    __m128i flat, flat2, filter8;
    __m128i zero = __lsx_vldi(0);
    __m128i out_h, out_l;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
    v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
    v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
    v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
    v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
    flat = __lsx_vld(filter48, 96);

    DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, -stride3, dst_tmp,
              -stride2, dst_tmp, -stride, p7, p6, p5, p4);
    p3 = __lsx_vld(dst_tmp, 0);
    p0 = __lsx_vldx(dst_tmp, stride3);
    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);
    q4 = __lsx_vld(dst_tmp1, 0);
    q7 = __lsx_vldx(dst_tmp1, stride3);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
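
    /* flat2 decides per pixel between the filter8 results already stored
     * in filter48 and the 15-tap wide filter computed below. */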
    if (__lsx_bz_v(flat2)) {
        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48,
        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);

        __lsx_vstx(p2, dst, -stride3);
        __lsx_vstx(p1, dst, -stride2);
        __lsx_vst(q0, dst, 0);
        __lsx_vstx(q2, dst, stride2);
        dst = dst_tmp - stride3;

        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
        tmp0_l = p7_l_in << 3;
        tmp1_l = p6_l_in + p5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);

        p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
        p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
        p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
        p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
        p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
        p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
        p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
        p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
        q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);

        tmp0_h = p7_h_in << 3;
        tmp1_h = p6_h_in + p5_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);

        out_l = __lsx_vpickev_b(out_h, out_l);
        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
        __lsx_vst(p6, dst, 0);
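
        /* Wide (15-tap) filter, low and high halves in 16-bit lanes: the
         * first output seeds a running sum over the p7..q0 window; each
         * later output updates that sum by adding the incoming q tap and
         * subtracting the outgoing p tap (the tmp0 = x - y lines below),
         * then re-rounds with __lsx_vsrari_h(sum, 4). */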
826 q1_l_in = (v8u16)__lsx_vilvl_b(
zero,
q1);
827 tmp0_l = p5_l_in - p6_l_in;
831 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
833 q1_h_in = (v8u16)__lsx_vilvh_b(
zero,
q1);
834 tmp0_h = p5_h_in - p6_h_in;
838 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
840 out_l = __lsx_vpickev_b(out_h, out_l);
841 p5 = __lsx_vbitsel_v(p5, out_l, flat2);
842 __lsx_vst(p5,
dst, 0);
846 q2_l_in = (v8u16)__lsx_vilvl_b(
zero, q2);
847 tmp0_l = p4_l_in - p5_l_in;
851 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
853 q2_h_in = (v8u16)__lsx_vilvh_b(
zero, q2);
854 tmp0_h = p4_h_in - p5_h_in;
858 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
860 out_l = __lsx_vpickev_b(out_h, out_l);
861 p4 = __lsx_vbitsel_v(p4, out_l, flat2);
862 __lsx_vst(p4,
dst, 0);
866 q3_l_in = (v8u16)__lsx_vilvl_b(
zero, q3);
867 tmp0_l = p3_l_in - p4_l_in;
871 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
873 q3_h_in = (v8u16)__lsx_vilvh_b(
zero, q3);
874 tmp0_h = p3_h_in - p4_h_in;
878 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
880 out_l = __lsx_vpickev_b(out_h, out_l);
881 p3 = __lsx_vbitsel_v(p3, out_l, flat2);
882 __lsx_vst(p3,
dst, 0);
886 q4_l_in = (v8u16)__lsx_vilvl_b(
zero, q4);
887 filter8 = __lsx_vld(filter48, 0);
888 tmp0_l = p2_l_in - p3_l_in;
892 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
894 q4_h_in = (v8u16)__lsx_vilvh_b(
zero, q4);
895 tmp0_h = p2_h_in - p3_h_in;
899 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
901 out_l = __lsx_vpickev_b(out_h, out_l);
902 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
903 __lsx_vst(filter8,
dst, 0);
907 q5_l_in = (v8u16)__lsx_vilvl_b(
zero, q5);
908 filter8 = __lsx_vld(filter48, 16);
909 tmp0_l = p1_l_in - p2_l_in;
913 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
915 q5_h_in = (v8u16)__lsx_vilvh_b(
zero, q5);
916 tmp0_h = p1_h_in - p2_h_in;
920 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
922 out_l = __lsx_vpickev_b(out_h, out_l);
923 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
924 __lsx_vst(filter8,
dst, 0);
928 q6_l_in = (v8u16)__lsx_vilvl_b(
zero, q6);
929 filter8 = __lsx_vld(filter48, 32);
930 tmp0_l = p0_l_in - p1_l_in;
934 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
936 q6_h_in = (v8u16)__lsx_vilvh_b(
zero, q6);
937 tmp0_h = p0_h_in - p1_h_in;
941 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
943 out_l = __lsx_vpickev_b(out_h, out_l);
944 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
945 __lsx_vst(filter8,
dst, 0);
949 q7_l_in = (v8u16)__lsx_vilvl_b(
zero, q7);
950 filter8 = __lsx_vld(filter48, 48);
951 tmp0_l = q7_l_in - p0_l_in;
955 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
957 q7_h_in = (v8u16)__lsx_vilvh_b(
zero, q7);
958 tmp0_h = q7_h_in - p0_h_in;
962 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
964 out_l = __lsx_vpickev_b(out_h, out_l);
965 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
966 __lsx_vst(filter8,
dst, 0);
970 filter8 = __lsx_vld(filter48, 64);
971 tmp0_l = q7_l_in - q0_l_in;
975 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
977 tmp0_h = q7_h_in - q0_h_in;
981 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
983 out_l = __lsx_vpickev_b(out_h, out_l);
984 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
985 __lsx_vst(filter8,
dst, 0);
989 filter8 = __lsx_vld(filter48, 80);
990 tmp0_l = q7_l_in - q1_l_in;
994 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
996 tmp0_h = q7_h_in - q1_h_in;
1000 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1002 out_l = __lsx_vpickev_b(out_h, out_l);
1003 filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
1004 __lsx_vst(filter8,
dst, 0);
1008 tmp0_l = q7_l_in - q2_l_in;
1012 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1014 tmp0_h = q7_h_in - q2_h_in;
1018 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1020 out_l = __lsx_vpickev_b(out_h, out_l);
1021 q3 = __lsx_vbitsel_v(q3, out_l, flat2);
1022 __lsx_vst(q3,
dst, 0);
1026 tmp0_l = q7_l_in - q3_l_in;
1030 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1032 tmp0_h = q7_h_in - q3_h_in;
1036 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1038 out_l = __lsx_vpickev_b(out_h, out_l);
1039 q4 = __lsx_vbitsel_v(q4, out_l, flat2);
1040 __lsx_vst(q4,
dst, 0);
1044 tmp0_l = q7_l_in - q4_l_in;
1048 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1050 tmp0_h = q7_h_in - q4_h_in;
1054 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1056 out_l = __lsx_vpickev_b(out_h, out_l);
1057 q5 = __lsx_vbitsel_v(q5, out_l, flat2);
1058 __lsx_vst(q5,
dst, 0);
1062 tmp0_l = q7_l_in - q5_l_in;
1066 out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1068 tmp0_h = q7_h_in - q5_h_in;
1072 out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1074 out_l = __lsx_vpickev_b(out_h, out_l);
1075 q6 = __lsx_vbitsel_v(q6, out_l, flat2);
1076 __lsx_vst(q6,
dst, 0);
    uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
    uint8_t early_exit = 0;

                                            b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - stride4;
    uint8_t *dst_tmp1 = dst + stride4;
    __m128i zero = __lsx_vldi(0);
    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i p0_filter16, p1_filter16;
    __m128i p2_filter8, p1_filter8, p0_filter8;
    __m128i q0_filter8, q1_filter8, q2_filter8;
    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
    __m128i tmp0, tmp1, tmp2;
    q0 = __lsx_vld(dst, 0);
    q3 = __lsx_vldx(dst, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    if (__lsx_bz_v(flat)) {
        __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
        __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
        __lsx_vstelm_d(q0_out, dst, 0, 0);
        __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
1145 p3_l, p2_l, p1_l, p0_l);
1147 q0_l, q1_l, q2_l, q3_l);
1148 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
1149 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1150 q1_filter8, q2_filter8);
1154 zero, p0_filter8,
zero, q0_filter8, p2_filter8,
1155 p1_filter8, p0_filter8, q0_filter8);
1157 q1_filter8, q2_filter8);
1160 p2_out = __lsx_vbitsel_v(p2, p2_filter8,
flat);
1161 p1_out = __lsx_vbitsel_v(p1_out, p1_filter8,
flat);
1162 p0_out = __lsx_vbitsel_v(p0_out, p0_filter8,
flat);
1163 q0_out = __lsx_vbitsel_v(q0_out, q0_filter8,
flat);
1164 q1_out = __lsx_vbitsel_v(q1_out, q1_filter8,
flat);
1165 q2_out = __lsx_vbitsel_v(q2, q2_filter8,
flat);
1168 DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0,
1169 dst_tmp - stride2, 0, dst_tmp -
stride, 0, p7, p6, p5, p4);
1171 dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7);
1173 VP9_FLAT5(p7, p6, p5, p4, p0,
q0, q4, q5, q6, q7,
flat, flat2);
1176 if (__lsx_bz_v(flat2)) {
1178 __lsx_vstelm_d(p2_out,
dst, 0, 0);
1180 __lsx_vstelm_d(p1_out,
dst, 0, 0);
1182 __lsx_vstelm_d(p0_out,
dst, 0, 0);
1184 __lsx_vstelm_d(q0_out,
dst, 0, 0);
1186 __lsx_vstelm_d(q1_out,
dst, 0, 0);
1188 __lsx_vstelm_d(q2_out,
dst, 0, 0);
1192 p7_l, p6_l, p5_l, p4_l);
1194 q4_l, q5_l, q6_l, q7_l);
1196 tmp0 = __lsx_vslli_h(p7_l, 3);
1197 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1198 tmp0 = __lsx_vadd_h(tmp0, p6_l);
1199 tmp0 = __lsx_vadd_h(tmp0, q0_l);
1201 dst = dst_tmp - stride3;
1204 tmp1 = __lsx_vadd_h(p6_l, p5_l);
1205 tmp1 = __lsx_vadd_h(tmp1, p4_l);
1206 tmp1 = __lsx_vadd_h(tmp1, p3_l);
1207 tmp1 = __lsx_vadd_h(tmp1, p2_l);
1208 tmp1 = __lsx_vadd_h(tmp1, p1_l);
1209 tmp1 = __lsx_vadd_h(tmp1, p0_l);
1210 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1212 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1213 tmp0 = __lsx_vsub_h(p5_l, p6_l);
1214 tmp0 = __lsx_vadd_h(tmp0, q1_l);
1215 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1216 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1218 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1220 p1_filter16, p0_filter16, p1_filter16);
1221 p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2);
1222 p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2);
1223 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1225 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1229 tmp0 = __lsx_vsub_h(p4_l, p5_l);
1230 tmp0 = __lsx_vadd_h(tmp0, q2_l);
1231 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1232 tmp2 = __lsx_vsub_h(p3_l, p4_l);
1233 tmp2 = __lsx_vadd_h(tmp2, q3_l);
1234 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1235 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1236 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1237 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1238 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1240 p1_filter16, p0_filter16, p1_filter16);
1241 p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2);
1242 p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2);
1243 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1245 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1249 tmp0 = __lsx_vsub_h(p2_l, p3_l);
1250 tmp0 = __lsx_vadd_h(tmp0, q4_l);
1251 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1252 tmp2 = __lsx_vsub_h(p1_l, p2_l);
1253 tmp2 = __lsx_vadd_h(tmp2, q5_l);
1254 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1255 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1256 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1257 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1258 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1260 p1_filter16, p0_filter16, p1_filter16);
1261 p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2);
1262 p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2);
1263 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1265 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1269 tmp0 = __lsx_vsub_h(p0_l, p1_l);
1270 tmp0 = __lsx_vadd_h(tmp0, q6_l);
1271 tmp0 = __lsx_vsub_h(tmp0, p7_l);
1272 tmp2 = __lsx_vsub_h(q7_l, p0_l);
1273 tmp2 = __lsx_vadd_h(tmp2, q0_l);
1274 tmp2 = __lsx_vsub_h(tmp2, p7_l);
1275 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1276 p0_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1277 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1278 p1_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1280 p1_filter16, p0_filter16, p1_filter16);
1281 p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2);
1282 p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2);
1283 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1285 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1289 tmp0 = __lsx_vsub_h(q7_l, q0_l);
1290 tmp0 = __lsx_vadd_h(tmp0, q1_l);
1291 tmp0 = __lsx_vsub_h(tmp0, p6_l);
1292 tmp2 = __lsx_vsub_h(q7_l, q1_l);
1293 tmp2 = __lsx_vadd_h(tmp2, q2_l);
1294 tmp2 = __lsx_vsub_h(tmp2, p5_l);
1295 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1296 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1297 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1298 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1300 p1_filter16, p0_filter16, p1_filter16);
1301 p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2);
1302 p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2);
1303 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1305 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1309 tmp0 = __lsx_vsub_h(q7_l, q2_l);
1310 tmp0 = __lsx_vadd_h(tmp0, q3_l);
1311 tmp0 = __lsx_vsub_h(tmp0, p4_l);
1312 tmp2 = __lsx_vsub_h(q7_l, q3_l);
1313 tmp2 = __lsx_vadd_h(tmp2, q4_l);
1314 tmp2 = __lsx_vsub_h(tmp2, p3_l);
1315 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1316 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1317 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1318 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1320 p1_filter16, p0_filter16, p1_filter16);
1321 p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2);
1322 p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2);
1323 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1325 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
1329 tmp0 = __lsx_vsub_h(q7_l, q4_l);
1330 tmp0 = __lsx_vadd_h(tmp0, q5_l);
1331 tmp0 = __lsx_vsub_h(tmp0, p2_l);
1332 tmp2 = __lsx_vsub_h(q7_l, q5_l);
1333 tmp2 = __lsx_vadd_h(tmp2, q6_l);
1334 tmp2 = __lsx_vsub_h(tmp2, p1_l);
1335 tmp1 = __lsx_vadd_h(tmp1, tmp0);
1336 p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1337 tmp1 = __lsx_vadd_h(tmp1, tmp2);
1338 p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1340 p1_filter16, p0_filter16, p1_filter16);
1341 p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2);
1342 p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2);
1343 __lsx_vstelm_d(p0_filter16,
dst, 0, 0);
1345 __lsx_vstelm_d(p1_filter16,
dst, 0, 0);
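
/* Vertical-edge filters: load 8 or 16 rows around the edge, transpose
 * them so the edge lies along the vector lanes, run the same filters as
 * above, then transpose/scatter the results back with element stores. */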
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp1 = dst - 4;
    uint8_t *dst_tmp2 = dst_tmp1 + stride4;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i vec0, vec1, vec2, vec3;

    p3 = __lsx_vld(dst_tmp1, 0);
    p0 = __lsx_vldx(dst_tmp1, stride3);
    q0 = __lsx_vld(dst_tmp2, 0);
    q3 = __lsx_vldx(dst_tmp2, stride3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);
    LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    vec2 = __lsx_vilvl_h(vec1, vec0);
    vec3 = __lsx_vilvh_h(vec1, vec0);
    __lsx_vstelm_w(vec2, dst, 0, 0);
    __lsx_vstelm_w(vec2, dst + stride, 0, 1);
    __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
    __lsx_vstelm_w(vec2, dst + stride3, 0, 3);

    __lsx_vstelm_w(vec3, dst, 0, 0);
    __lsx_vstelm_w(vec3, dst + stride, 0, 1);
    __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
    __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - 4;
    __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
    __m128i row8, row9, row10, row11, row12, row13, row14, row15;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    row0 = __lsx_vld(dst_tmp, 0);
    row3 = __lsx_vldx(dst_tmp, stride3);
    row4 = __lsx_vld(dst_tmp, 0);
    row7 = __lsx_vldx(dst_tmp, stride3);
    row8 = __lsx_vld(dst_tmp, 0);
    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10);
    row11 = __lsx_vldx(dst_tmp, stride3);
    row12 = __lsx_vld(dst_tmp, 0);
    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
    row15 = __lsx_vldx(dst_tmp, stride3);
    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
    thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
    thresh0 = __lsx_vilvl_d(thresh1, thresh0);

    b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
    b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
    b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);

    limit0 = __lsx_vreplgr2vr_b(limit_ptr);
    limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
    limit0 = __lsx_vilvl_d(limit1, limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
    tmp3 = __lsx_vilvh_h(tmp1, tmp0);
    tmp4 = __lsx_vilvl_h(tmp1, tmp0);
    tmp5 = __lsx_vilvh_h(tmp1, tmp0);
    __lsx_vstelm_w(tmp2, dst, 0, 0);
    __lsx_vstelm_w(tmp2, dst + stride, 0, 1);
    __lsx_vstelm_w(tmp2, dst + stride2, 0, 2);
    __lsx_vstelm_w(tmp2, dst + stride3, 0, 3);

    __lsx_vstelm_w(tmp3, dst, 0, 0);
    __lsx_vstelm_w(tmp3, dst + stride, 0, 1);
    __lsx_vstelm_w(tmp3, dst + stride2, 0, 2);
    __lsx_vstelm_w(tmp3, dst + stride3, 0, 3);

    __lsx_vstelm_w(tmp4, dst, 0, 0);
    __lsx_vstelm_w(tmp4, dst + stride, 0, 1);
    __lsx_vstelm_w(tmp4, dst + stride2, 0, 2);
    __lsx_vstelm_w(tmp4, dst + stride3, 0, 3);

    __lsx_vstelm_w(tmp5, dst, 0, 0);
    __lsx_vstelm_w(tmp5, dst + stride, 0, 1);
    __lsx_vstelm_w(tmp5, dst + stride2, 0, 2);
    __lsx_vstelm_w(tmp5, dst + stride3, 0, 3);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - 4;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p1_out, p0_out, q0_out, q1_out;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i vec0, vec1, vec2, vec3, vec4;
    __m128i zero = __lsx_vldi(0);
    p3 = __lsx_vld(dst_tmp, 0);
    p0 = __lsx_vldx(dst_tmp, stride3);
    q0 = __lsx_vld(dst_tmp, 0);
    q3 = __lsx_vldx(dst_tmp, stride3);

    LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    if (__lsx_bz_v(flat)) {
        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec2 = __lsx_vilvl_h(vec1, vec0);
        vec3 = __lsx_vilvh_h(vec1, vec0);

        __lsx_vstelm_w(vec2, dst, 0, 0);
        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);

        __lsx_vstelm_w(vec3, dst, 0, 0);
        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1540 p3_l, p2_l, p1_l, p0_l);
1542 q0_l, q1_l, q2_l, q3_l);
1543 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1544 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1546 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
1547 p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l,
1548 q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
1550 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
1551 q2_filt8_l, q1_filt8_l, q2_filt8_l);
        p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
        q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
        vec2 = __lsx_vilvl_h(vec1, vec0);
        vec3 = __lsx_vilvh_h(vec1, vec0);
        vec4 = __lsx_vilvl_b(q2, q1);
        __lsx_vstelm_w(vec2, dst, 0, 0);
        __lsx_vstelm_h(vec4, dst, 4, 0);
        __lsx_vstelm_w(vec2, dst, 0, 1);
        __lsx_vstelm_h(vec4, dst, 4, 1);
        __lsx_vstelm_w(vec2, dst, 0, 2);
        __lsx_vstelm_h(vec4, dst, 4, 2);
        __lsx_vstelm_w(vec2, dst, 0, 3);
        __lsx_vstelm_h(vec4, dst, 4, 3);

        __lsx_vstelm_w(vec3, dst, 0, 0);
        __lsx_vstelm_h(vec4, dst, 4, 4);
        __lsx_vstelm_w(vec3, dst, 0, 1);
        __lsx_vstelm_h(vec4, dst, 4, 5);
        __lsx_vstelm_w(vec3, dst, 0, 2);
        __lsx_vstelm_h(vec4, dst, 4, 6);
        __lsx_vstelm_w(vec3, dst, 0, 3);
        __lsx_vstelm_h(vec4, dst, 4, 7);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - 4;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p1_out, p0_out, q0_out, q1_out;
    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i zero = __lsx_vldi(0);
    p0 = __lsx_vld(dst_tmp, 0);
    p3 = __lsx_vldx(dst_tmp, stride3);
    row4 = __lsx_vld(dst_tmp, 0);
    row7 = __lsx_vldx(dst_tmp, stride3);
    q3 = __lsx_vld(dst_tmp, 0);
    q0 = __lsx_vldx(dst_tmp, stride3);
    row12 = __lsx_vld(dst_tmp, 0);
    DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
    row15 = __lsx_vldx(dst_tmp, stride3);

    LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);
    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
    thresh = __lsx_vilvl_d(vec0, thresh);

    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
    b_limit = __lsx_vilvl_d(vec0, b_limit);

    limit = __lsx_vreplgr2vr_b(limit_ptr);
    vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
    limit = __lsx_vilvl_d(vec0, limit);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);

    if (__lsx_bz_v(flat)) {
        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec2 = __lsx_vilvl_h(vec1, vec0);
        vec3 = __lsx_vilvh_h(vec1, vec0);
        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec4 = __lsx_vilvl_h(vec1, vec0);
        vec5 = __lsx_vilvh_h(vec1, vec0);

        __lsx_vstelm_w(vec2, dst, 0, 0);
        __lsx_vstelm_w(vec2, dst + stride, 0, 1);
        __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec2, dst + stride3, 0, 3);

        __lsx_vstelm_w(vec3, dst, 0, 0);
        __lsx_vstelm_w(vec3, dst + stride, 0, 1);
        __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec3, dst + stride3, 0, 3);

        __lsx_vstelm_w(vec4, dst, 0, 0);
        __lsx_vstelm_w(vec4, dst + stride, 0, 1);
        __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec4, dst + stride3, 0, 3);

        __lsx_vstelm_w(vec5, dst, 0, 0);
        __lsx_vstelm_w(vec5, dst + stride, 0, 1);
        __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
        __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1689 p3_l, p2_l, p1_l, p0_l);
1691 q0_l, q1_l, q2_l, q3_l);
1692 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1693 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1696 p3_h, p2_h, p1_h, p0_h);
1698 q0_h, q1_h, q2_h, q3_h);
1701 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
1702 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
1705 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
1706 p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
1707 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1708 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
1709 q2_filt8_l, q1_filt8_l, q2_filt8_l);
        p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
        p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
        p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
        q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
        q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
        q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
        vec3 = __lsx_vilvl_h(vec1, vec0);
        vec4 = __lsx_vilvh_h(vec1, vec0);
        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
        vec6 = __lsx_vilvl_h(vec1, vec0);
        vec7 = __lsx_vilvh_h(vec1, vec0);
        vec2 = __lsx_vilvl_b(q2, q1);
        vec5 = __lsx_vilvh_b(q2, q1);
        __lsx_vstelm_w(vec3, dst, 0, 0);
        __lsx_vstelm_h(vec2, dst, 4, 0);
        __lsx_vstelm_w(vec3, dst, 0, 1);
        __lsx_vstelm_h(vec2, dst, 4, 1);
        __lsx_vstelm_w(vec3, dst, 0, 2);
        __lsx_vstelm_h(vec2, dst, 4, 2);
        __lsx_vstelm_w(vec3, dst, 0, 3);
        __lsx_vstelm_h(vec2, dst, 4, 3);

        __lsx_vstelm_w(vec4, dst, 0, 0);
        __lsx_vstelm_h(vec2, dst, 4, 4);
        __lsx_vstelm_w(vec4, dst, 0, 1);
        __lsx_vstelm_h(vec2, dst, 4, 5);
        __lsx_vstelm_w(vec4, dst, 0, 2);
        __lsx_vstelm_h(vec2, dst, 4, 6);
        __lsx_vstelm_w(vec4, dst, 0, 3);
        __lsx_vstelm_h(vec2, dst, 4, 7);

        __lsx_vstelm_w(vec6, dst, 0, 0);
        __lsx_vstelm_h(vec5, dst, 4, 0);
        __lsx_vstelm_w(vec6, dst, 0, 1);
        __lsx_vstelm_h(vec5, dst, 4, 1);
        __lsx_vstelm_w(vec6, dst, 0, 2);
        __lsx_vstelm_h(vec5, dst, 4, 2);
        __lsx_vstelm_w(vec6, dst, 0, 3);
        __lsx_vstelm_h(vec5, dst, 4, 3);

        __lsx_vstelm_w(vec7, dst, 0, 0);
        __lsx_vstelm_h(vec5, dst, 4, 4);
        __lsx_vstelm_w(vec7, dst, 0, 1);
        __lsx_vstelm_h(vec5, dst, 4, 5);
        __lsx_vstelm_w(vec7, dst, 0, 2);
        __lsx_vstelm_h(vec5, dst, 4, 6);
        __lsx_vstelm_w(vec7, dst, 0, 3);
        __lsx_vstelm_h(vec5, dst, 4, 7);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - 4;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p1_out, p0_out, q0_out, q1_out;
    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i zero = __lsx_vldi(0);
1798 p0 = __lsx_vld(dst_tmp, 0);
1800 p3 = __lsx_vldx(dst_tmp, stride3);
1802 row4 = __lsx_vld(dst_tmp, 0);
1804 row7 = __lsx_vldx(dst_tmp, stride3);
1806 q3 = __lsx_vld(dst_tmp, 0);
1808 q0 = __lsx_vldx(dst_tmp, stride3);
1810 row12 = __lsx_vld(dst_tmp, 0);
1811 DUP2_ARG2(__lsx_vldx, dst_tmp,
stride, dst_tmp, stride2, row13, row14);
1812 row15 = __lsx_vldx(dst_tmp, stride3);
1815 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1816 q3, q2,
q1,
q0, row12, row13, row14, row15,
1817 p3, p2, p1, p0,
q0,
q1, q2, q3);
1819 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1820 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1821 thresh = __lsx_vilvl_d(vec0, thresh);
1823 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1824 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1825 b_limit = __lsx_vilvl_d(vec0, b_limit);
1827 limit = __lsx_vreplgr2vr_b(limit_ptr);
1828 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1832 LPF_MASK_HEV(p3, p2, p1, p0,
q0,
q1, q2, q3,
limit, b_limit, thresh,
1843 if (__lsx_bz_v(
flat)) {
1844 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1845 vec2 = __lsx_vilvl_h(vec1, vec0);
1846 vec3 = __lsx_vilvh_h(vec1, vec0);
1847 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1848 vec4 = __lsx_vilvl_h(vec1, vec0);
1849 vec5 = __lsx_vilvh_h(vec1, vec0);
1852 __lsx_vstelm_w(vec2,
dst, 0, 0);
1853 __lsx_vstelm_w(vec2,
dst +
stride, 0, 1);
1854 __lsx_vstelm_w(vec2,
dst + stride2, 0, 2);
1855 __lsx_vstelm_w(vec2,
dst + stride3, 0, 3);
1857 __lsx_vstelm_w(vec3,
dst, 0, 0);
1858 __lsx_vstelm_w(vec3,
dst +
stride, 0, 1);
1859 __lsx_vstelm_w(vec3,
dst + stride2, 0, 2);
1860 __lsx_vstelm_w(vec3,
dst + stride3, 0, 3);
1862 __lsx_vstelm_w(vec4,
dst, 0, 0);
1863 __lsx_vstelm_w(vec4,
dst +
stride, 0, 1);
1864 __lsx_vstelm_w(vec4,
dst + stride2, 0, 2);
1865 __lsx_vstelm_w(vec4,
dst + stride3, 0, 3);
1867 __lsx_vstelm_w(vec5,
dst, 0, 0);
1868 __lsx_vstelm_w(vec5,
dst +
stride, 0, 1);
1869 __lsx_vstelm_w(vec5,
dst + stride2, 0, 2);
1870 __lsx_vstelm_w(vec5,
dst + stride3, 0, 3);
1873 p3_l, p2_l, p1_l, p0_l);
1875 q0_l, q1_l, q2_l, q3_l);
1876 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1877 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1880 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1881 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
1882 p1_filt8_l, p0_filt8_l, q0_filt8_l);
1883 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1884 q1_filt8_l, q2_filt8_l);
1887 p2 = __lsx_vbitsel_v(p2, p2_filt8_l,
flat);
1888 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l,
flat);
1889 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l,
flat);
1890 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l,
flat);
1891 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l,
flat);
1892 q2 = __lsx_vbitsel_v(q2, q2_filt8_l,
flat);
1894 DUP2_ARG2(__lsx_vilvl_b, p1, p2,
q0, p0, vec0, vec1);
1895 vec3 = __lsx_vilvl_h(vec1, vec0);
1896 vec4 = __lsx_vilvh_h(vec1, vec0);
1897 DUP2_ARG2(__lsx_vilvh_b, p1, p2,
q0, p0, vec0, vec1);
1898 vec6 = __lsx_vilvl_h(vec1, vec0);
1899 vec7 = __lsx_vilvh_h(vec1, vec0);
1900 vec2 = __lsx_vilvl_b(q2,
q1);
1901 vec5 = __lsx_vilvh_b(q2,
q1);
1904 __lsx_vstelm_w(vec3,
dst, 0, 0);
1905 __lsx_vstelm_h(vec2,
dst, 4, 0);
1907 __lsx_vstelm_w(vec3,
dst, 0, 1);
1908 __lsx_vstelm_h(vec2,
dst, 4, 1);
1910 __lsx_vstelm_w(vec3,
dst, 0, 2);
1911 __lsx_vstelm_h(vec2,
dst, 4, 2);
1913 __lsx_vstelm_w(vec3,
dst, 0, 3);
1914 __lsx_vstelm_h(vec2,
dst, 4, 3);
1916 __lsx_vstelm_w(vec4,
dst, 0, 0);
1917 __lsx_vstelm_h(vec2,
dst, 4, 4);
1919 __lsx_vstelm_w(vec4,
dst, 0, 1);
1920 __lsx_vstelm_h(vec2,
dst, 4, 5);
1922 __lsx_vstelm_w(vec4,
dst, 0, 2);
1923 __lsx_vstelm_h(vec2,
dst, 4, 6);
1925 __lsx_vstelm_w(vec4,
dst, 0, 3);
1926 __lsx_vstelm_h(vec2,
dst, 4, 7);
1928 __lsx_vstelm_w(vec6,
dst, 0, 0);
1929 __lsx_vstelm_h(vec5,
dst, 4, 0);
1931 __lsx_vstelm_w(vec6,
dst, 0, 1);
1932 __lsx_vstelm_h(vec5,
dst, 4, 1);
1934 __lsx_vstelm_w(vec6,
dst, 0, 2);
1935 __lsx_vstelm_h(vec5,
dst, 4, 2);
1937 __lsx_vstelm_w(vec6,
dst, 0, 3);
1938 __lsx_vstelm_h(vec5,
dst, 4, 3);
1940 __lsx_vstelm_w(vec7,
dst, 0, 0);
1941 __lsx_vstelm_h(vec5,
dst, 4, 4);
1943 __lsx_vstelm_w(vec7,
dst, 0, 1);
1944 __lsx_vstelm_h(vec5,
dst, 4, 5);
1946 __lsx_vstelm_w(vec7,
dst, 0, 2);
1947 __lsx_vstelm_h(vec5,
dst, 4, 6);
1949 __lsx_vstelm_w(vec7,
dst, 0, 3);
1950 __lsx_vstelm_h(vec5,
dst, 4, 7);
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    uint8_t *dst_tmp = dst - 4;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p1_out, p0_out, q0_out, q1_out;
    __m128i row4, row5, row6, row7, row12, row13, row14, row15;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i zero = __lsx_vldi(0);
1973 p0 = __lsx_vld(dst_tmp, 0);
1975 p3 = __lsx_vldx(dst_tmp, stride3);
1977 row4 = __lsx_vld(dst_tmp, 0);
1979 row7 = __lsx_vldx(dst_tmp, stride3);
1981 q3 = __lsx_vld(dst_tmp, 0);
1983 q0 = __lsx_vldx(dst_tmp, stride3);
1985 row12 = __lsx_vld(dst_tmp, 0);
1986 DUP2_ARG2(__lsx_vldx, dst_tmp,
stride, dst_tmp, stride2, row13, row14);
1987 row15 = __lsx_vldx(dst_tmp, stride3);
1990 LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1991 q3, q2,
q1,
q0, row12, row13, row14, row15,
1992 p3, p2, p1, p0,
q0,
q1, q2, q3);
1994 thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1995 vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1996 thresh = __lsx_vilvl_d(vec0, thresh);
1998 b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1999 vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
2000 b_limit = __lsx_vilvl_d(vec0, b_limit);
2002 limit = __lsx_vreplgr2vr_b(limit_ptr);
2003 vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
2007 LPF_MASK_HEV(p3, p2, p1, p0,
q0,
q1, q2, q3,
limit, b_limit, thresh,
2018 if (__lsx_bz_v(
flat)) {
2019 DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2020 vec2 = __lsx_vilvl_h(vec1, vec0);
2021 vec3 = __lsx_vilvh_h(vec1, vec0);
2022 DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2023 vec4 = __lsx_vilvl_h(vec1, vec0);
2024 vec5 = __lsx_vilvh_h(vec1, vec0);
2027 __lsx_vstelm_w(vec2,
dst, 0, 0);
2028 __lsx_vstelm_w(vec2,
dst +
stride, 0, 1);
2029 __lsx_vstelm_w(vec2,
dst + stride2, 0, 2);
2030 __lsx_vstelm_w(vec2,
dst + stride3, 0, 3);
2032 __lsx_vstelm_w(vec3,
dst, 0, 0);
2033 __lsx_vstelm_w(vec3,
dst +
stride, 0, 1);
2034 __lsx_vstelm_w(vec3,
dst + stride2, 0, 2);
2035 __lsx_vstelm_w(vec3,
dst + stride3, 0, 3);
2037 __lsx_vstelm_w(vec4,
dst, 0, 0);
2038 __lsx_vstelm_w(vec4,
dst +
stride, 0, 1);
2039 __lsx_vstelm_w(vec4,
dst + stride2, 0, 2);
2040 __lsx_vstelm_w(vec4,
dst + stride3, 0, 3);
2042 __lsx_vstelm_w(vec5,
dst, 0, 0);
2043 __lsx_vstelm_w(vec5,
dst +
stride, 0, 1);
2044 __lsx_vstelm_w(vec5,
dst + stride2, 0, 2);
2045 __lsx_vstelm_w(vec5,
dst + stride3, 0, 3);
2048 p3_h, p2_h, p1_h, p0_h);
2050 q0_h, q1_h, q2_h, q3_h);
2052 VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2053 p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2056 DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
2057 p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
2058 p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
2059 DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
2060 q2_filt8_h, q1_filt8_h, q2_filt8_h);
2063 p2 = __lsx_vbitsel_v(p2, p2_filt8_h,
flat);
2064 p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h,
flat);
2065 p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h,
flat);
2066 q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h,
flat);
2067 q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h,
flat);
2068 q2 = __lsx_vbitsel_v(q2, q2_filt8_h,
flat);
2070 DUP2_ARG2(__lsx_vilvl_b, p1, p2,
q0, p0, vec0, vec1);
2071 vec3 = __lsx_vilvl_h(vec1, vec0);
2072 vec4 = __lsx_vilvh_h(vec1, vec0);
2073 DUP2_ARG2(__lsx_vilvh_b, p1, p2,
q0, p0, vec0, vec1);
2074 vec6 = __lsx_vilvl_h(vec1, vec0);
2075 vec7 = __lsx_vilvh_h(vec1, vec0);
2076 vec2 = __lsx_vilvl_b(q2,
q1);
2077 vec5 = __lsx_vilvh_b(q2,
q1);
2080 __lsx_vstelm_w(vec3,
dst, 0, 0);
2081 __lsx_vstelm_h(vec2,
dst, 4, 0);
2083 __lsx_vstelm_w(vec3,
dst, 0, 1);
2084 __lsx_vstelm_h(vec2,
dst, 4, 1);
2086 __lsx_vstelm_w(vec3,
dst, 0, 2);
2087 __lsx_vstelm_h(vec2,
dst, 4, 2);
2089 __lsx_vstelm_w(vec3,
dst, 0, 3);
2090 __lsx_vstelm_h(vec2,
dst, 4, 3);
2092 __lsx_vstelm_w(vec4,
dst, 0, 0);
2093 __lsx_vstelm_h(vec2,
dst, 4, 4);
2095 __lsx_vstelm_w(vec4,
dst, 0, 1);
2096 __lsx_vstelm_h(vec2,
dst, 4, 5);
2098 __lsx_vstelm_w(vec4,
dst, 0, 2);
2099 __lsx_vstelm_h(vec2,
dst, 4, 6);
2101 __lsx_vstelm_w(vec4,
dst, 0, 3);
2102 __lsx_vstelm_h(vec2,
dst, 4, 7);
2104 __lsx_vstelm_w(vec6,
dst, 0, 0);
2105 __lsx_vstelm_h(vec5,
dst, 4, 0);
2107 __lsx_vstelm_w(vec6,
dst, 0, 1);
2108 __lsx_vstelm_h(vec5,
dst, 4, 1);
2110 __lsx_vstelm_w(vec6,
dst, 0, 2);
2111 __lsx_vstelm_h(vec5,
dst, 4, 2);
2113 __lsx_vstelm_w(vec6,
dst, 0, 3);
2114 __lsx_vstelm_h(vec5,
dst, 4, 3);
2116 __lsx_vstelm_w(vec7,
dst, 0, 0);
2117 __lsx_vstelm_h(vec5,
dst, 4, 4);
2119 __lsx_vstelm_w(vec7,
dst, 0, 1);
2120 __lsx_vstelm_h(vec5,
dst, 4, 5);
2122 __lsx_vstelm_w(vec7,
dst, 0, 2);
2123 __lsx_vstelm_h(vec5,
dst, 4, 6);
2125 __lsx_vstelm_w(vec7,
dst, 0, 3);
2126 __lsx_vstelm_h(vec5,
dst, 4, 7);
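
/* Transpose helpers used by the wide vertical filter: 16x8, 8x16 and
 * 16x16 byte-block transposes between the picture layout and the
 * register layout expected by the filters above. */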
    __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    ptrdiff_t in_pitch2 = in_pitch << 1;
    ptrdiff_t in_pitch3 = in_pitch2 + in_pitch;
    ptrdiff_t in_pitch4 = in_pitch2 << 1;

             p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);

    LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);

    DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org,
              p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
    DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
    DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4);
    DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6);
    DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7);
    __lsx_vst(p7, output, 0);
    __lsx_vst(p6, output, 16);
    __lsx_vst(p5, output, 32);
    __lsx_vst(p4, output, 48);
    __lsx_vst(p3, output, 64);
    __lsx_vst(p2, output, 80);
    __lsx_vst(p1, output, 96);
    __lsx_vst(p0, output, 112);
    __lsx_vst(q0, output, 128);
    __lsx_vst(q1, output, 144);
    __lsx_vst(q2, output, 160);
    __lsx_vst(q3, output, 176);
    __lsx_vst(q4, output, 192);
    __lsx_vst(q5, output, 208);
    __lsx_vst(q6, output, 224);
    __lsx_vst(q7, output, 240);
                                        ptrdiff_t out_pitch)
    __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    ptrdiff_t out_pitch2 = out_pitch << 1;
    ptrdiff_t out_pitch3 = out_pitch2 + out_pitch;
    ptrdiff_t out_pitch4 = out_pitch2 << 1;

    LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
    LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o,
             output, out_pitch, out_pitch2, out_pitch3, out_pitch4);
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
    __m128i row8, row9, row10, row11, row12, row13, row14, row15;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    int32_t in_stride2 = in_stride << 1;
    int32_t in_stride3 = in_stride2 + in_stride;
    int32_t in_stride4 = in_stride2 << 1;
    int32_t out_stride2 = out_stride << 1;
    int32_t out_stride3 = out_stride2 + out_stride;
    int32_t out_stride4 = out_stride2 << 1;
    LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
             row0, row1, row2, row3, row4, row5, row6, row7);
    input += in_stride4;
    LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
             row8, row9, row10, row11, row12, row13, row14, row15);

    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p7, p6, p5, p4, p3, p2, p1, p0);
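    /* The q half of the transpose is rebuilt from the odd 64-bit lanes of the
       input rows with pack-even/pack-odd operations. */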
    q7 = __lsx_vpackod_d(row8, row0);
    q6 = __lsx_vpackod_d(row9, row1);
    q5 = __lsx_vpackod_d(row10, row2);
    q4 = __lsx_vpackod_d(row11, row3);
    q3 = __lsx_vpackod_d(row12, row4);
    q2 = __lsx_vpackod_d(row13, row5);
    q1 = __lsx_vpackod_d(row14, row6);
    q0 = __lsx_vpackod_d(row15, row7);

    DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
    DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);

    DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
    q0 = __lsx_vpackev_w(tmp3, tmp2);
    q4 = __lsx_vpackod_w(tmp3, tmp2);

    tmp2 = __lsx_vpackod_h(tmp1, tmp0);
    tmp3 = __lsx_vpackod_h(q7, q5);
    q2 = __lsx_vpackev_w(tmp3, tmp2);
    q6 = __lsx_vpackod_w(tmp3, tmp2);

    DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
    q1 = __lsx_vpackev_w(tmp3, tmp2);
    q5 = __lsx_vpackod_w(tmp3, tmp2);

    tmp2 = __lsx_vpackod_h(tmp5, tmp4);
    tmp3 = __lsx_vpackod_h(tmp7, tmp6);
    q3 = __lsx_vpackev_w(tmp3, tmp2);
    q7 = __lsx_vpackod_w(tmp3, tmp2);
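    /* Write the 16 transposed rows in two batches of eight. */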
    LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride,
             out_stride2, out_stride3, out_stride4);
    output += out_stride4;
    LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride,
             out_stride2, out_stride3, out_stride4);
                                       uint8_t *src_org, int32_t pitch_org,
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i flat, mask, hev, thresh, b_limit, limit;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i vec0, vec1, vec2, vec3;
    __m128i zero = __lsx_vldi(0);
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);
    /* compute the filter mask and high-edge-variance flag */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    if (__lsx_bz_v(flat)) {
        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec2 = __lsx_vilvl_h(vec1, vec0);
        vec3 = __lsx_vilvh_h(vec1, vec0);

        __lsx_vstelm_w(vec2, src_org, 0, 0);
        src_org += pitch_org;
        __lsx_vstelm_w(vec2, src_org, 0, 1);
        src_org += pitch_org;
        __lsx_vstelm_w(vec2, src_org, 0, 2);
        src_org += pitch_org;
        __lsx_vstelm_w(vec2, src_org, 0, 3);
        src_org += pitch_org;
        __lsx_vstelm_w(vec3, src_org, 0, 0);
        src_org += pitch_org;
        __lsx_vstelm_w(vec3, src_org, 0, 1);
        src_org += pitch_org;
        __lsx_vstelm_w(vec3, src_org, 0, 2);
        src_org += pitch_org;
        __lsx_vstelm_w(vec3, src_org, 0, 3);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
                  p3_l, p2_l, p1_l, p0_l);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
                  q0_l, q1_l, q2_l, q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
        p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l);
        p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l);
        p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l);
        q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l);
        q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l);
        q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l);

        p2_out = __lsx_vbitsel_v(p2, p2_l, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_l, flat);

        __lsx_vst(p2_out, filter48, 0);
        __lsx_vst(p1_out, filter48, 16);
        __lsx_vst(p0_out, filter48, 32);
        __lsx_vst(q0_out, filter48, 48);
        __lsx_vst(q1_out, filter48, 64);
        __lsx_vst(q2_out, filter48, 80);
        __lsx_vst(flat, filter48, 96);
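        /* The 8-tap results and the flat mask are parked in filter48[] so the
           16-tap stage can pick them up without recomputing them. */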
    __m128i zero = __lsx_vldi(0);
    __m128i filter8, flat, flat2;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_l, tmp1_l;
    __m128i out_l;
    uint8_t *dst_tmp = dst - 128;
    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
              dst_tmp, 48, p7, p6, p5, p4);
    DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
              dst_tmp, 112, p3, p2, p1, p0);
    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
    DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);

    flat = __lsx_vld(filter48, 96);

    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    if (__lsx_bz_v(flat2)) {
        __m128i vec0, vec1, vec2, vec3, vec4;

        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
                  filter48, 48, p2, p1, p0, q0);
        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);

        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
        vec3 = __lsx_vilvl_h(vec1, vec0);
        vec4 = __lsx_vilvh_h(vec1, vec0);
        vec2 = __lsx_vilvl_b(q2, q1);

        __lsx_vstelm_w(vec3, dst_org, 0, 0);
        __lsx_vstelm_h(vec2, dst_org, 4, 0);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 1);
        __lsx_vstelm_h(vec2, dst_org, 4, 1);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 2);
        __lsx_vstelm_h(vec2, dst_org, 4, 2);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 3);
        __lsx_vstelm_h(vec2, dst_org, 4, 3);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 0);
        __lsx_vstelm_h(vec2, dst_org, 4, 4);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 1);
        __lsx_vstelm_h(vec2, dst_org, 4, 5);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 2);
        __lsx_vstelm_h(vec2, dst_org, 4, 6);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 3);
        __lsx_vstelm_h(vec2, dst_org, 4, 7);
        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
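        /* Wide (flat2) filter: each output is a rounded neighbourhood sum
           shifted right by 4. tmp1_l keeps the running sum and is updated
           incrementally as the window slides from one output pixel to the
           next (add the entering sample, drop the leaving one). */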
        tmp0_l = p7_l_in << 3;
        tmp1_l = p6_l_in + p5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
        __lsx_vstelm_d(p6, dst, 0, 0);
        q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
        tmp0_l = p5_l_in - p6_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        p5 = __lsx_vbitsel_v(p5, out_l, flat2);
        __lsx_vstelm_d(p5, dst, 0, 0);

        q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
        tmp0_l = p4_l_in - p5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        p4 = __lsx_vbitsel_v(p4, out_l, flat2);
        __lsx_vstelm_d(p4, dst, 0, 0);

        q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
        tmp0_l = p3_l_in - p4_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        p3 = __lsx_vbitsel_v(p3, out_l, flat2);
        __lsx_vstelm_d(p3, dst, 0, 0);

        q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
        filter8 = __lsx_vld(filter48, 0);
        tmp0_l = p2_l_in - p3_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
        filter8 = __lsx_vld(filter48, 16);
        tmp0_l = p1_l_in - p2_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
        filter8 = __lsx_vld(filter48, 32);
        tmp0_l = p0_l_in - p1_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
        filter8 = __lsx_vld(filter48, 48);
        tmp0_l = q7_l_in - p0_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        filter8 = __lsx_vld(filter48, 64);
        tmp0_l = q7_l_in - q0_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        filter8 = __lsx_vld(filter48, 80);
        tmp0_l = q7_l_in - q1_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vstelm_d(filter8, dst, 0, 0);

        tmp0_l = q7_l_in - q2_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        q3 = __lsx_vbitsel_v(q3, out_l, flat2);
        __lsx_vstelm_d(q3, dst, 0, 0);

        tmp0_l = q7_l_in - q3_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        q4 = __lsx_vbitsel_v(q4, out_l, flat2);
        __lsx_vstelm_d(q4, dst, 0, 0);

        tmp0_l = q7_l_in - q4_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        q5 = __lsx_vbitsel_v(q5, out_l, flat2);
        __lsx_vstelm_d(q5, dst, 0, 0);

        tmp0_l = q7_l_in - q5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        out_l = __lsx_vpickev_b(out_l, out_l);
        q6 = __lsx_vbitsel_v(q6, out_l, flat2);
        __lsx_vstelm_d(q6, dst, 0, 0);
    uint8_t early_exit = 0;
    uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
    uint8_t *filter48 = &transposed_input[16 * 16];

                                         b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {

    if (0 == early_exit) {
                                     uint8_t *dst_org, ptrdiff_t stride,
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    __m128i flat, mask, hev, thresh, b_limit, limit;
    __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
    __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
    __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
    __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
    __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
    __m128i zero = __lsx_vldi(0);
    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);

    thresh = __lsx_vreplgr2vr_b(thresh_ptr);
    b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
    limit = __lsx_vreplgr2vr_b(limit_ptr);
    /* compute the filter mask and high-edge-variance flag */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    if (__lsx_bz_v(flat)) {
        DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec2 = __lsx_vilvl_h(vec1, vec0);
        vec3 = __lsx_vilvh_h(vec1, vec0);
        DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        vec4 = __lsx_vilvl_h(vec1, vec0);
        vec5 = __lsx_vilvh_h(vec1, vec0);

        __lsx_vstelm_w(vec2, dst_org, 0, 0);
        __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
        __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
        __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
        dst_org += stride4;
        __lsx_vstelm_w(vec3, dst_org, 0, 0);
        __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
        __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
        __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
        dst_org += stride4;
        __lsx_vstelm_w(vec4, dst_org, 0, 0);
        __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
        __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
        __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
        dst_org += stride4;
        __lsx_vstelm_w(vec5, dst_org, 0, 0);
        __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
        __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
        __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
        DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
                  p3_l, p2_l, p1_l, p0_l);
        DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
                  q0_l, q1_l, q2_l, q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
        DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
                  p3_h, p2_h, p1_h, p0_h);
        DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
                  q0_h, q1_h, q2_h, q3_h);
        VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
                    p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
        DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
                  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h,
                  q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
                  q0_filt8_l);
        DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
                  q2_filt8_l, q1_filt8_l, q2_filt8_l);
        p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
        p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
        p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
        q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
        q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
        q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

        __lsx_vst(p2_out, filter48, 0);
        __lsx_vst(p1_out, filter48, 16);
        __lsx_vst(p0_out, filter48, 32);
        __lsx_vst(q0_out, filter48, 48);
        __lsx_vst(q1_out, filter48, 64);
        __lsx_vst(q2_out, filter48, 80);
        __lsx_vst(flat, filter48, 96);
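        /* As in the 8-wide path, stash the 8-tap results and the flat mask in
           filter48[] for the following 16-tap stage. */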
    __m128i zero = __lsx_vldi(0);
    __m128i flat, flat2, filter8;
    __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
    v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
    v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
    v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
    v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
    __m128i out_l, out_h;
    uint8_t *dst_tmp = dst - 128;
    flat = __lsx_vld(filter48, 96);

    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
              dst_tmp, 48, p7, p6, p5, p4);
    DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
              dst_tmp, 112, p3, p2, p1, p0);
    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
    DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);

    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    if (__lsx_bz_v(flat2)) {
        __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

        DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
                  filter48, 48, p2, p1, p0, q0);
        DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);

        DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
        vec3 = __lsx_vilvl_h(vec1, vec0);
        vec4 = __lsx_vilvh_h(vec1, vec0);
        DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
        vec6 = __lsx_vilvl_h(vec1, vec0);
        vec7 = __lsx_vilvh_h(vec1, vec0);
        vec2 = __lsx_vilvl_b(q2, q1);
        vec5 = __lsx_vilvh_b(q2, q1);

        __lsx_vstelm_w(vec3, dst_org, 0, 0);
        __lsx_vstelm_h(vec2, dst_org, 4, 0);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 1);
        __lsx_vstelm_h(vec2, dst_org, 4, 1);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 2);
        __lsx_vstelm_h(vec2, dst_org, 4, 2);
        dst_org += stride;
        __lsx_vstelm_w(vec3, dst_org, 0, 3);
        __lsx_vstelm_h(vec2, dst_org, 4, 3);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 0);
        __lsx_vstelm_h(vec2, dst_org, 4, 4);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 1);
        __lsx_vstelm_h(vec2, dst_org, 4, 5);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 2);
        __lsx_vstelm_h(vec2, dst_org, 4, 6);
        dst_org += stride;
        __lsx_vstelm_w(vec4, dst_org, 0, 3);
        __lsx_vstelm_h(vec2, dst_org, 4, 7);
        dst_org += stride;
        __lsx_vstelm_w(vec6, dst_org, 0, 0);
        __lsx_vstelm_h(vec5, dst_org, 4, 0);
        dst_org += stride;
        __lsx_vstelm_w(vec6, dst_org, 0, 1);
        __lsx_vstelm_h(vec5, dst_org, 4, 1);
        dst_org += stride;
        __lsx_vstelm_w(vec6, dst_org, 0, 2);
        __lsx_vstelm_h(vec5, dst_org, 4, 2);
        dst_org += stride;
        __lsx_vstelm_w(vec6, dst_org, 0, 3);
        __lsx_vstelm_h(vec5, dst_org, 4, 3);
        dst_org += stride;
        __lsx_vstelm_w(vec7, dst_org, 0, 0);
        __lsx_vstelm_h(vec5, dst_org, 4, 4);
        dst_org += stride;
        __lsx_vstelm_w(vec7, dst_org, 0, 1);
        __lsx_vstelm_h(vec5, dst_org, 4, 5);
        dst_org += stride;
        __lsx_vstelm_w(vec7, dst_org, 0, 2);
        __lsx_vstelm_h(vec5, dst_org, 4, 6);
        dst_org += stride;
        __lsx_vstelm_w(vec7, dst_org, 0, 3);
        __lsx_vstelm_h(vec5, dst_org, 4, 7);
        p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
        p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
        p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
        p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
        p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
        p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
        p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
        p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
        q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);

        tmp0_l = p7_l_in << 3;
        tmp1_l = p6_l_in + p5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
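        /* The same running-sum filter is applied to the high byte halves; the
           low and high 16-bit results are then packed back into one byte
           vector before the flat2 select. */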
        p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
        p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
        p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
        p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
        p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
        p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
        p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
        p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
        q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);

        tmp0_h = p7_h_in << 3;
        tmp1_h = p6_h_in + p5_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);

        out_l = __lsx_vpickev_b(out_h, out_l);
        p6 = __lsx_vbitsel_v(p6, out_l, flat2);
        __lsx_vst(p6, dst, 0);
        q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
        tmp0_l = p5_l_in - p6_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
        tmp0_h = p5_h_in - p6_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        p5 = __lsx_vbitsel_v(p5, out_l, flat2);
        __lsx_vst(p5, dst, 16);

        q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
        tmp0_l = p4_l_in - p5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
        tmp0_h = p4_h_in - p5_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        p4 = __lsx_vbitsel_v(p4, out_l, flat2);
        __lsx_vst(p4, dst, 16*2);

        q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
        tmp0_l = p3_l_in - p4_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
        tmp0_h = p3_h_in - p4_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        p3 = __lsx_vbitsel_v(p3, out_l, flat2);
        __lsx_vst(p3, dst, 16*3);

        q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
        filter8 = __lsx_vld(filter48, 0);
        tmp0_l = p2_l_in - p3_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
        tmp0_h = p2_h_in - p3_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*4);

        q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
        filter8 = __lsx_vld(filter48, 16);
        tmp0_l = p1_l_in - p2_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
        tmp0_h = p1_h_in - p2_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*5);

        q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
        filter8 = __lsx_vld(filter48, 32);
        tmp0_l = p0_l_in - p1_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
        tmp0_h = p0_h_in - p1_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*6);

        q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
        filter8 = __lsx_vld(filter48, 48);
        tmp0_l = q7_l_in - p0_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
        tmp0_h = q7_h_in - p0_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*7);

        filter8 = __lsx_vld(filter48, 64);
        tmp0_l = q7_l_in - q0_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q0_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*8);

        filter8 = __lsx_vld(filter48, 80);
        tmp0_l = q7_l_in - q1_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q1_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
        __lsx_vst(filter8, dst, 16*9);

        tmp0_l = q7_l_in - q2_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q2_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        q3 = __lsx_vbitsel_v(q3, out_l, flat2);
        __lsx_vst(q3, dst, 16*10);

        tmp0_l = q7_l_in - q3_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q3_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        q4 = __lsx_vbitsel_v(q4, out_l, flat2);
        __lsx_vst(q4, dst, 16*11);

        tmp0_l = q7_l_in - q4_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q4_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        q5 = __lsx_vbitsel_v(q5, out_l, flat2);
        __lsx_vst(q5, dst, 16*12);

        tmp0_l = q7_l_in - q5_l_in;
        out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
        tmp0_h = q7_h_in - q5_h_in;
        out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
        out_l = __lsx_vpickev_b(out_h, out_l);
        q6 = __lsx_vbitsel_v(q6, out_l, flat2);
        __lsx_vst(q6, dst, 16*13);
    uint8_t early_exit = 0;
    uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
    uint8_t *filter48 = &transposed_input[16 * 16];

                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {

    if (0 == early_exit) {