FFmpeg
vp9_lpf_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * 4-tap loop filter applied to every byte lane of the given vectors.
 *
 * p1_in, p0_in, q0_in, q1_in   pixel vectors (v16u8) on both sides of the edge
 * mask_in                      per-lane mask gating the whole adjustment
 * hev_in                       per-lane mask selecting the inner-only filter;
 *                              NOTE: it is bit-inverted in place by this macro
 * p1_out .. q1_out             receive the filtered pixels
 *
 * Every input pixel is copied into a local before the first output is
 * written, so the outputs may name the same variables as the inputs.
 */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) \
{ \
    const v16i8 four_m = __msa_ldi_b(4); \
    const v16i8 three_m = __msa_ldi_b(3); \
    v16i8 p1_s_m, p0_s_m, q0_s_m, q1_s_m; \
    v16i8 diff_m, base_m, dec_q0_m, inc_p0_m, outer_m; \
    \
    /* flip the top bit so the unsigned pixels can be treated as signed */ \
    p1_s_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
    p0_s_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
    q0_s_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
    q1_s_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
    \
    /* saturated (p1 - q1), kept only in lanes where hev is set */ \
    base_m = __msa_subs_s_b(p1_s_m, q1_s_m) & (v16i8) hev_in; \
    \
    /* add (q0 - p0) three times, saturating after each step */ \
    diff_m = __msa_subs_s_b(q0_s_m, p0_s_m); \
    base_m = __msa_adds_s_b(base_m, diff_m); \
    base_m = __msa_adds_s_b(base_m, diff_m); \
    base_m = __msa_adds_s_b(base_m, diff_m); \
    base_m = base_m & (v16i8) mask_in; \
    \
    /* the two inner adjustments differ only in their rounding constant */ \
    dec_q0_m = __msa_adds_s_b(base_m, four_m) >> 3; \
    inc_p0_m = __msa_adds_s_b(base_m, three_m) >> 3; \
    \
    q0_s_m = __msa_subs_s_b(q0_s_m, dec_q0_m); \
    q0_out = __msa_xori_b((v16u8) q0_s_m, 0x80); \
    p0_s_m = __msa_adds_s_b(p0_s_m, inc_p0_m); \
    p0_out = __msa_xori_b((v16u8) p0_s_m, 0x80); \
    \
    /* outer taps: rounded half of the q0 step, only where hev is clear */ \
    outer_m = __msa_srari_b(dec_q0_m, 1); \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
    outer_m = outer_m & (v16i8) hev_in; \
    \
    q1_s_m = __msa_subs_s_b(q1_s_m, outer_m); \
    q1_out = __msa_xori_b((v16u8) q1_s_m, 0x80); \
    p1_s_m = __msa_adds_s_b(p1_s_m, outer_m); \
    p1_out = __msa_xori_b((v16u8) p1_s_m, 0x80); \
}
67 
/*
 * Per-lane flatness test over p3..q3.
 *
 * flat_out is both input and output: on entry it must already hold a
 * per-lane maximum to fold in (the callers in this file pass the value
 * left there by LPF_MASK_HEV); on exit a lane is all ones when none of the
 * folded differences exceeds 1 and the corresponding lane of `mask` is set,
 * otherwise zero.
 *
 * NOTE: `mask` is not a macro parameter; a variable with that name must be
 * visible in the scope where the macro is invoked.
 */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
{ \
    v16u8 zeros_m = { 0 }; \
    v16u8 one_m, far2_m, far3_m; \
    \
    one_m = __msa_ori_b(zeros_m, 1); \
    \
    /* largest distance of the 2nd and 3rd neighbours from p0 / q0 */ \
    far2_m = __msa_max_u_b(__msa_asub_u_b(p2_in, p0_in), \
                           __msa_asub_u_b(q2_in, q0_in)); \
    far3_m = __msa_max_u_b(__msa_asub_u_b(p3_in, p0_in), \
                           __msa_asub_u_b(q3_in, q0_in)); \
    \
    flat_out = __msa_max_u_b(far2_m, flat_out); \
    flat_out = __msa_max_u_b(far3_m, flat_out); \
    \
    /* all ones where the maximum is > 1, then invert and gate by mask */ \
    flat_out = (one_m < (v16u8) flat_out); \
    flat_out = __msa_xori_b(flat_out, 0xff); \
    flat_out = flat_out & (mask); \
}
88 
/*
 * Per-lane flatness test over the outer pixels p7..p4 / q4..q7.
 *
 * flat2_out lanes become all ones when every one of |p4..p7 - p0| and
 * |q4..q7 - q0| is at most 1 and the matching lane of flat_in is set,
 * otherwise zero.
 */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) \
{ \
    v16u8 zeros_m = { 0 }; \
    v16u8 one_m, far4_m, far5_m, far6_m, far7_m; \
    \
    one_m = __msa_ori_b(zeros_m, 1); \
    \
    /* per distance: larger of the p-side and q-side absolute difference */ \
    far4_m = __msa_max_u_b(__msa_asub_u_b(p4_in, p0_in), \
                           __msa_asub_u_b(q4_in, q0_in)); \
    far5_m = __msa_max_u_b(__msa_asub_u_b(p5_in, p0_in), \
                           __msa_asub_u_b(q5_in, q0_in)); \
    far6_m = __msa_max_u_b(__msa_asub_u_b(p6_in, p0_in), \
                           __msa_asub_u_b(q6_in, q0_in)); \
    far7_m = __msa_max_u_b(__msa_asub_u_b(p7_in, p0_in), \
                           __msa_asub_u_b(q7_in, q0_in)); \
    \
    flat2_out = __msa_max_u_b(far4_m, far5_m); \
    flat2_out = __msa_max_u_b(far6_m, flat2_out); \
    flat2_out = __msa_max_u_b(far7_m, flat2_out); \
    \
    /* all ones where the maximum is > 1, then invert and gate by flat_in */ \
    flat2_out = (one_m < (v16u8) flat2_out); \
    flat2_out = __msa_xori_b(flat2_out, 0xff); \
    flat2_out = flat2_out & flat_in; \
}
118 
/*
 * 8-tap smoothing filter on 16-bit lanes.
 *
 * Inputs are the eight pixels p3..q3 widened to 16 bits; each output is a
 * weighted sum of eight input terms followed by a rounding shift by 3:
 *
 *   p2 = 3*p3 + 2*p2 + p1 + p0 + q0
 *   p1 = 2*p3 + p2 + 2*p1 + p0 + q0 + q1
 *   p0 = p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
 *   q0 = p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
 *   q1 = p1 + p0 + q0 + 2*q1 + q2 + 2*q3
 *   q2 = p0 + q0 + q1 + 2*q2 + 3*q3
 *
 * Outputs are written in the order p2, p1, p0, q2, q0, q1.
 */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
                    q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) \
{ \
    v8u16 psum_m, qsum_m, mid_m, acc_m; \
    \
    psum_m = p2_in + p1_in + p0_in; \
    \
    /* 2*p3 + p2 + p1 + p0 + q0 is shared by the p2 and p1 outputs */ \
    acc_m = (p3_in << 1) + psum_m + q0_in; \
    p2_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) (acc_m + p3_in + p2_in), 3); \
    p1_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) (acc_m + p1_in + q1_in), 3); \
    \
    /* p2 + p1 + p0 + q0 + q1 + q2 is shared by p0, q0 and q1 */ \
    qsum_m = q2_in + q1_in + q0_in; \
    mid_m = psum_m + qsum_m; \
    p0_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) (mid_m + (p0_in) + (p3_in)), 3); \
    \
    acc_m = p0_in + qsum_m + q2_in + q3_in; \
    q2_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) ((q3_in + q3_in) + acc_m), 3); \
    \
    acc_m = mid_m + q3_in; \
    q0_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) (acc_m + q0_in), 3); \
    \
    q1_filt8_out = \
        (v8i16) __msa_srari_h((v8i16) ((q1_in + q3_in) + (acc_m - p2_in)), 3); \
}
157 
/*
 * Derive the per-lane control vectors of the loop filter.
 *
 * hev_out   all ones where max(|p1 - p0|, |q1 - q0|) > thresh_in
 * mask_out  all ones where both hold:
 *             2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit_in  (saturating sum)
 *             every neighbouring difference p3..q3 <= limit_in
 * flat_out  left holding max(|p1 - p0|, |q1 - q0|); VP9_FLAT4 continues
 *           from this value
 */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
                     q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, \
                     hev_out, mask_out, flat_out) \
{ \
    v16u8 d_p3p2_m, d_p2p1_m, d_p1p0_m; \
    v16u8 d_q1q0_m, d_q2q1_m, d_q3q2_m; \
    v16u8 edge_m, cross_m; \
    \
    /* absolute differences between neighbouring pixels */ \
    d_p3p2_m = __msa_asub_u_b(p3_in, p2_in); \
    d_p2p1_m = __msa_asub_u_b(p2_in, p1_in); \
    d_p1p0_m = __msa_asub_u_b(p1_in, p0_in); \
    d_q1q0_m = __msa_asub_u_b(q1_in, q0_in); \
    d_q2q1_m = __msa_asub_u_b(q2_in, q1_in); \
    d_q3q2_m = __msa_asub_u_b(q3_in, q2_in); \
    \
    /* absolute differences across the edge */ \
    edge_m = __msa_asub_u_b(p0_in, q0_in); \
    cross_m = __msa_asub_u_b(p1_in, q1_in); \
    \
    /* hev */ \
    flat_out = __msa_max_u_b(d_p1p0_m, d_q1q0_m); \
    hev_out = thresh_in < (v16u8) flat_out; \
    \
    /* mask: start with the b_limit test across the edge ... */ \
    edge_m = __msa_adds_u_b(edge_m, edge_m); \
    cross_m = cross_m >> 1; \
    edge_m = __msa_adds_u_b(edge_m, cross_m); \
    mask_out = b_limit_in < edge_m; \
    \
    /* ... then fold in every neighbouring difference and test the limit */ \
    mask_out = __msa_max_u_b(flat_out, mask_out); \
    d_p3p2_m = __msa_max_u_b(d_p3p2_m, d_p2p1_m); \
    mask_out = __msa_max_u_b(d_p3p2_m, mask_out); \
    d_q2q1_m = __msa_max_u_b(d_q2q1_m, d_q3q2_m); \
    mask_out = __msa_max_u_b(d_q2q1_m, mask_out); \
    \
    mask_out = limit_in < (v16u8) mask_out; \
    mask_out = __msa_xori_b(mask_out, 0xff); \
}
195 
196 void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
197  int32_t b_limit_ptr,
198  int32_t limit_ptr,
199  int32_t thresh_ptr)
200 {
201  uint64_t p1_d, p0_d, q0_d, q1_d;
202  v16u8 mask, hev, flat, thresh, b_limit, limit;
203  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
204 
205  /* load vector elements */
206  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
207 
208  thresh = (v16u8) __msa_fill_b(thresh_ptr);
209  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210  limit = (v16u8) __msa_fill_b(limit_ptr);
211 
212  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
213  hev, mask, flat);
214  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
215  q1_out);
216 
217  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
222 }
223 
224 
225 void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
226  int32_t b_limit_ptr,
227  int32_t limit_ptr,
228  int32_t thresh_ptr)
229 {
230  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
232 
233  /* load vector elements */
234  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
235 
236  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
239 
240  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
243 
244  limit0 = (v16u8) __msa_fill_b(limit_ptr);
245  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
247 
248  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
249  hev, mask, flat);
250  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
251 
252  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
253 }
254 
255 void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
256  int32_t b_limit_ptr,
257  int32_t limit_ptr,
258  int32_t thresh_ptr)
259 {
260  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
261  v16u8 mask, hev, flat, thresh, b_limit, limit;
262  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
263  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264  v8i16 p2_filter8, p1_filter8, p0_filter8;
265  v8i16 q0_filter8, q1_filter8, q2_filter8;
266  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
267  v16i8 zero = { 0 };
268 
269  /* load vector elements */
270  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
271 
272  thresh = (v16u8) __msa_fill_b(thresh_ptr);
273  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274  limit = (v16u8) __msa_fill_b(limit_ptr);
275 
276  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
277  hev, mask, flat);
278  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
279  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
280  q1_out);
281 
282  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
283 
284  /* if flat is zero for all pixels, then no need to calculate other filter */
285  if (__msa_test_bz_v(flat)) {
286  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
291  } else {
292  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
294  q2_r, q3_r);
295  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
297 
298  /* convert 16 bit output data into 8 bit */
299  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
301  q0_filter8);
302  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
303 
304  /* store pixel values */
305  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
311 
312  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
318 
319  src -= 3 * pitch;
320 
321  SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
322  src += (4 * pitch);
323  SD(q1_d, src);
324  src += pitch;
325  SD(q2_d, src);
326  }
327 }
328 
329 void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
330  int32_t b_limit_ptr,
331  int32_t limit_ptr,
332  int32_t thresh_ptr)
333 {
334  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
335  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
336  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
337  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
343  v16u8 zero = { 0 };
344 
345  /* load vector elements */
346  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
347 
348  thresh = (v16u8) __msa_fill_b(thresh_ptr);
349  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
351 
352  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
355 
356  limit = (v16u8) __msa_fill_b(limit_ptr);
357  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
359 
360  /* mask and hev */
361  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
362  hev, mask, flat);
363  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
364  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
365  q1_out);
366 
367  /* if flat is zero for all pixels, then no need to calculate other filter */
368  if (__msa_test_bz_v(flat)) {
369  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
370  } else {
371  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
373  q2_r, q3_r);
374  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
376 
377  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
378  p0_l);
379  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
380  q3_l);
381  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
383 
384  /* convert 16 bit output data into 8 bit */
385  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387  p0_filt8_r, q0_filt8_r);
388  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389  q1_filt8_r, q2_filt8_r);
390 
391  /* store pixel values */
392  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
398 
399  src -= 3 * pitch;
400 
401  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
402  src += (4 * pitch);
403  ST_UB2(q1_out, q2_out, src, pitch);
404  src += (2 * pitch);
405  }
406 }
407 
408 void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
409  int32_t b_limit_ptr,
410  int32_t limit_ptr,
411  int32_t thresh_ptr)
412 {
413  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
414  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
415  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
416  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
419  v16u8 zero = { 0 };
420 
421  /* load vector elements */
422  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
423 
424  thresh = (v16u8) __msa_fill_b(thresh_ptr);
425  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
427 
428  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
431 
432  limit = (v16u8) __msa_fill_b(limit_ptr);
433  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
435 
436  /* mask and hev */
437  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
438  hev, mask, flat);
439  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
440  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
441  q1_out);
442 
443  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
444 
445  /* if flat is zero for all pixels, then no need to calculate other filter */
446  if (__msa_test_bz_v(flat)) {
447  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
448  } else {
449  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
451  q2_r, q3_r);
452  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
454 
455  /* convert 16 bit output data into 8 bit */
456  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460  q1_filt8_r, q2_filt8_r);
461 
462  /* store pixel values */
463  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
469 
470  src -= 3 * pitch;
471 
472  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
473  src += (4 * pitch);
474  ST_UB2(q1_out, q2_out, src, pitch);
475  src += (2 * pitch);
476  }
477 }
478 
479 void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
480  int32_t b_limit_ptr,
481  int32_t limit_ptr,
482  int32_t thresh_ptr)
483 {
484  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
485  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
486  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
487  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
490  v16u8 zero = { 0 };
491 
492  /* load vector elements */
493  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
494 
495  thresh = (v16u8) __msa_fill_b(thresh_ptr);
496  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
498 
499  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
502 
503  limit = (v16u8) __msa_fill_b(limit_ptr);
504  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
506 
507  /* mask and hev */
508  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509  hev, mask, flat);
510  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
511  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
512  q1_out);
513 
514  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
515 
516  /* if flat is zero for all pixels, then no need to calculate other filter */
517  if (__msa_test_bz_v(flat)) {
518  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
519  } else {
520  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
521  p0_l);
522  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
523  q3_l);
524  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
526 
527  /* convert 16 bit output data into 8 bit */
528  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532  q1_filt8_l, q2_filt8_l);
533 
534  /* store pixel values */
535  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
541 
542  src -= 3 * pitch;
543 
544  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
545  src += (4 * pitch);
546  ST_UB2(q1_out, q2_out, src, pitch);
547  src += (2 * pitch);
548  }
549 }
550 
551 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
552  uint8_t *filter48,
553  int32_t b_limit_ptr,
554  int32_t limit_ptr,
555  int32_t thresh_ptr)
556 {
557  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
558  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
559  v16u8 flat, mask, hev, thresh, b_limit, limit;
560  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
566  v16u8 zero = { 0 };
567 
568  /* load vector elements */
569  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
570 
571  thresh = (v16u8) __msa_fill_b(thresh_ptr);
572  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573  limit = (v16u8) __msa_fill_b(limit_ptr);
574 
575  /* mask and hev */
576  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
577  hev, mask, flat);
578  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
579  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
580  q1_out);
581 
582  /* if flat is zero for all pixels, then no need to calculate other filter */
583  if (__msa_test_bz_v(flat)) {
584  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585 
586  return 1;
587  } else {
588  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
590  q2_r, q3_r);
591  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
593 
594  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
595  p0_l);
596  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
597  q3_l);
598  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
600 
601  /* convert 16 bit output data into 8 bit */
602  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604  p0_filt8_r, q0_filt8_r);
605  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
606  q2_filt8_r);
607 
608  /* store pixel values */
609  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
615 
616  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617  filter48 += (4 * 16);
618  ST_UB2(q1_out, q2_out, filter48, 16);
619  filter48 += (2 * 16);
620  ST_UB(flat, filter48);
621 
622  return 0;
623  }
624 }
625 
626 static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
627 {
628  v16u8 flat, flat2, filter8;
629  v16i8 zero = { 0 };
630  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
631  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
640  v8i16 l_out, r_out;
641 
642  flat = LD_UB(filter48 + 96);
643 
644  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
647 
648  /* if flat2 is zero for all pixels, then no need to calculate other filter */
649  if (__msa_test_bz_v(flat2)) {
650  LD_UB4(filter48, 16, p2, p1, p0, q0);
651  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
652 
653  src -= 3 * pitch;
654  ST_UB4(p2, p1, p0, q0, src, pitch);
655  src += (4 * pitch);
656  ST_UB2(q1, q2, src, pitch);
657  } else {
658  src -= 7 * pitch;
659 
660  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
663 
664  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
665 
666  tmp0_r = p7_r_in << 3;
667  tmp0_r -= p7_r_in;
668  tmp0_r += p6_r_in;
669  tmp0_r += q0_r_in;
670  tmp1_r = p6_r_in + p5_r_in;
671  tmp1_r += p4_r_in;
672  tmp1_r += p3_r_in;
673  tmp1_r += p2_r_in;
674  tmp1_r += p1_r_in;
675  tmp1_r += p0_r_in;
676  tmp1_r += tmp0_r;
677  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
678 
679  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
680  p5_l_in, p4_l_in);
681  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
682  p1_l_in, p0_l_in);
683  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
684 
685  tmp0_l = p7_l_in << 3;
686  tmp0_l -= p7_l_in;
687  tmp0_l += p6_l_in;
688  tmp0_l += q0_l_in;
689  tmp1_l = p6_l_in + p5_l_in;
690  tmp1_l += p4_l_in;
691  tmp1_l += p3_l_in;
692  tmp1_l += p2_l_in;
693  tmp1_l += p1_l_in;
694  tmp1_l += p0_l_in;
695  tmp1_l += tmp0_l;
696  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
697 
698  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
700  ST_UB(p6, src);
701  src += pitch;
702 
703  /* p5 */
704  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
705  tmp0_r = p5_r_in - p6_r_in;
706  tmp0_r += q1_r_in;
707  tmp0_r -= p7_r_in;
708  tmp1_r += tmp0_r;
709  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
710 
711  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
712  tmp0_l = p5_l_in - p6_l_in;
713  tmp0_l += q1_l_in;
714  tmp0_l -= p7_l_in;
715  tmp1_l += tmp0_l;
716  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
717 
718  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
720  ST_UB(p5, src);
721  src += pitch;
722 
723  /* p4 */
724  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725  tmp0_r = p4_r_in - p5_r_in;
726  tmp0_r += q2_r_in;
727  tmp0_r -= p7_r_in;
728  tmp1_r += tmp0_r;
729  r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
730 
731  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732  tmp0_l = p4_l_in - p5_l_in;
733  tmp0_l += q2_l_in;
734  tmp0_l -= p7_l_in;
735  tmp1_l += tmp0_l;
736  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
737 
738  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
740  ST_UB(p4, src);
741  src += pitch;
742 
743  /* p3 */
744  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745  tmp0_r = p3_r_in - p4_r_in;
746  tmp0_r += q3_r_in;
747  tmp0_r -= p7_r_in;
748  tmp1_r += tmp0_r;
749  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
750 
751  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752  tmp0_l = p3_l_in - p4_l_in;
753  tmp0_l += q3_l_in;
754  tmp0_l -= p7_l_in;
755  tmp1_l += tmp0_l;
756  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
757 
758  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
760  ST_UB(p3, src);
761  src += pitch;
762 
763  /* p2 */
764  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765  filter8 = LD_UB(filter48);
766  tmp0_r = p2_r_in - p3_r_in;
767  tmp0_r += q4_r_in;
768  tmp0_r -= p7_r_in;
769  tmp1_r += tmp0_r;
770  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
771 
772  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773  tmp0_l = p2_l_in - p3_l_in;
774  tmp0_l += q4_l_in;
775  tmp0_l -= p7_l_in;
776  tmp1_l += tmp0_l;
777  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
778 
779  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
781  ST_UB(filter8, src);
782  src += pitch;
783 
784  /* p1 */
785  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786  filter8 = LD_UB(filter48 + 16);
787  tmp0_r = p1_r_in - p2_r_in;
788  tmp0_r += q5_r_in;
789  tmp0_r -= p7_r_in;
790  tmp1_r += tmp0_r;
791  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
792 
793  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794  tmp0_l = p1_l_in - p2_l_in;
795  tmp0_l += q5_l_in;
796  tmp0_l -= p7_l_in;
797  tmp1_l += tmp0_l;
798  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
799 
800  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
802  ST_UB(filter8, src);
803  src += pitch;
804 
805  /* p0 */
806  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807  filter8 = LD_UB(filter48 + 32);
808  tmp0_r = p0_r_in - p1_r_in;
809  tmp0_r += q6_r_in;
810  tmp0_r -= p7_r_in;
811  tmp1_r += tmp0_r;
812  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
813 
814  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815  tmp0_l = p0_l_in - p1_l_in;
816  tmp0_l += q6_l_in;
817  tmp0_l -= p7_l_in;
818  tmp1_l += tmp0_l;
819  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
820 
821  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
823  ST_UB(filter8, src);
824  src += pitch;
825 
826  /* q0 */
827  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828  filter8 = LD_UB(filter48 + 48);
829  tmp0_r = q7_r_in - p0_r_in;
830  tmp0_r += q0_r_in;
831  tmp0_r -= p7_r_in;
832  tmp1_r += tmp0_r;
833  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
834 
835  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836  tmp0_l = q7_l_in - p0_l_in;
837  tmp0_l += q0_l_in;
838  tmp0_l -= p7_l_in;
839  tmp1_l += tmp0_l;
840  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
841 
842  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
844  ST_UB(filter8, src);
845  src += pitch;
846 
847  /* q1 */
848  filter8 = LD_UB(filter48 + 64);
849  tmp0_r = q7_r_in - q0_r_in;
850  tmp0_r += q1_r_in;
851  tmp0_r -= p6_r_in;
852  tmp1_r += tmp0_r;
853  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
854 
855  tmp0_l = q7_l_in - q0_l_in;
856  tmp0_l += q1_l_in;
857  tmp0_l -= p6_l_in;
858  tmp1_l += tmp0_l;
859  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
860 
861  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
863  ST_UB(filter8, src);
864  src += pitch;
865 
866  /* q2 */
867  filter8 = LD_UB(filter48 + 80);
868  tmp0_r = q7_r_in - q1_r_in;
869  tmp0_r += q2_r_in;
870  tmp0_r -= p5_r_in;
871  tmp1_r += tmp0_r;
872  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
873 
874  tmp0_l = q7_l_in - q1_l_in;
875  tmp0_l += q2_l_in;
876  tmp0_l -= p5_l_in;
877  tmp1_l += tmp0_l;
878  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
879 
880  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
882  ST_UB(filter8, src);
883  src += pitch;
884 
885  /* q3 */
886  tmp0_r = q7_r_in - q2_r_in;
887  tmp0_r += q3_r_in;
888  tmp0_r -= p4_r_in;
889  tmp1_r += tmp0_r;
890  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
891 
892  tmp0_l = q7_l_in - q2_l_in;
893  tmp0_l += q3_l_in;
894  tmp0_l -= p4_l_in;
895  tmp1_l += tmp0_l;
896  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
897 
898  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
900  ST_UB(q3, src);
901  src += pitch;
902 
903  /* q4 */
904  tmp0_r = q7_r_in - q3_r_in;
905  tmp0_r += q4_r_in;
906  tmp0_r -= p3_r_in;
907  tmp1_r += tmp0_r;
908  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
909 
910  tmp0_l = q7_l_in - q3_l_in;
911  tmp0_l += q4_l_in;
912  tmp0_l -= p3_l_in;
913  tmp1_l += tmp0_l;
914  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
915 
916  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
918  ST_UB(q4, src);
919  src += pitch;
920 
921  /* q5 */
922  tmp0_r = q7_r_in - q4_r_in;
923  tmp0_r += q5_r_in;
924  tmp0_r -= p2_r_in;
925  tmp1_r += tmp0_r;
926  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
927 
928  tmp0_l = q7_l_in - q4_l_in;
929  tmp0_l += q5_l_in;
930  tmp0_l -= p2_l_in;
931  tmp1_l += tmp0_l;
932  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
933 
934  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
936  ST_UB(q5, src);
937  src += pitch;
938 
939  /* q6 */
940  tmp0_r = q7_r_in - q5_r_in;
941  tmp0_r += q6_r_in;
942  tmp0_r -= p1_r_in;
943  tmp1_r += tmp0_r;
944  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
945 
946  tmp0_l = q7_l_in - q5_l_in;
947  tmp0_l += q6_l_in;
948  tmp0_l -= p1_l_in;
949  tmp1_l += tmp0_l;
950  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
951 
952  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
954  ST_UB(q6, src);
955  }
956 }
957 
958 void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
959  int32_t b_limit_ptr,
960  int32_t limit_ptr,
961  int32_t thresh_ptr)
962 {
963  uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
964  uint8_t early_exit = 0;
965 
966  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
967  b_limit_ptr, limit_ptr, thresh_ptr);
968 
969  if (0 == early_exit) {
970  vp9_hz_lpf_t16_16w(src, pitch, filter48);
971  }
972 }
973 
/*
 * Loop filter across the row boundary at src, 8 pixels wide.
 *
 * src points at row q0; rows p7..p0 are read from src - 8 * pitch up to
 * src - pitch and rows q0..q7 from src up to src + 7 * pitch.  Every store
 * in this function is a 64-bit SD of vector element 0, so only the first
 * 8 bytes of a row are ever written.
 *
 * Three stages, each with its own early exit:
 *   1. VP9_LPF_FILTER4_4W always runs and yields new p1/p0/q0/q1.
 *   2. If any bit of `flat` is set, VP9_FILTER8 produces p2..q2, blended in
 *      with __msa_bmnz_v under `flat`.
 *   3. If any bit of `flat2` is set, a running 16-weight sum rewrites the
 *      14 rows p6..q6, blended in under `flat2`.
 *
 * pitch                               byte distance between rows
 * b_limit_ptr, limit_ptr, thresh_ptr  threshold values (passed by value
 *                                     despite the _ptr suffix); each is
 *                                     replicated to all byte lanes with
 *                                     __msa_fill_b
 */
974 void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
975  int32_t b_limit_ptr,
976  int32_t limit_ptr,
977  int32_t thresh_ptr)
978 {
979  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980  uint64_t dword0, dword1;
981  v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
982  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
983  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984  v16u8 p0_filter16, p1_filter16;
985  v8i16 p2_filter8, p1_filter8, p0_filter8;
986  v8i16 q0_filter8, q1_filter8, q2_filter8;
987  v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
989  v16i8 zero = { 0 };
990  v8u16 tmp0, tmp1, tmp2;
991 
992  /* load vector elements */
993  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
994 
995  thresh = (v16u8) __msa_fill_b(thresh_ptr);
996  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997  limit = (v16u8) __msa_fill_b(limit_ptr);
998 
     /* LPF_MASK_HEV writes hev, mask and flat; VP9_FLAT4 then updates flat.
      * NOTE(review): both macros are defined outside this view - confirm
      * their exact semantics there. */
999  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1000  hev, mask, flat);
1001  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1002  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1003  q1_out);
1004 
     /* Keep only the low doubleword of flat (lanes 0..7); the high
      * doubleword comes from `zero`, so lanes 8..15 never enable the wider
      * filters below. */
1005  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1006 
1007  /* if flat is zero for all pixels, then no need to calculate other filter */
1008  if (__msa_test_bz_v(flat)) {
      /* Only the four filter4 rows p1, p0, q0, q1 are written back. */
1009  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013  SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1014  } else {
1015  /* convert 8 bit input data into 16 bit */
1016  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017  q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1018  q1_r, q2_r, q3_r);
1019  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020  p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021  q1_filter8, q2_filter8);
1022 
1023  /* convert 16 bit output data into 8 bit */
1024  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1026  q0_filter8);
1027  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1028  q2_filter8);
1029 
      /* Per-bit select: filter8 result where flat is set, otherwise the
       * filter4 output (p1..q1) or the unmodified pixel (p2, q2). */
1030  /* store pixel values */
1031  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1037 
      /* Outer rows p7..p4 and q4..q7 are loaded only on this path. */
1038  /* load 16 vector elements */
1039  LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040  LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1041 
1042  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1043 
1044  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1045  if (__msa_test_bz_v(flat2)) {
       /* Six rows p2..q2 are written: four via SD4 from src - 3 * pitch,
        * then q1 and q2 individually. */
1046  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1052 
1053  SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054  SD(q1_d, src + pitch);
1055  SD(q2_d, src + 2 * pitch);
1056  } else {
1057  /* LSB(right) 8 pixel operation */
1058  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059  zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060  q4_r, q5_r, q6_r, q7_r);
1061 
       /* tmp0 = 7 * p7 + p6 + q0 (computed as (p7 << 3) - p7). */
1062  tmp0 = p7_r << 3;
1063  tmp0 -= p7_r;
1064  tmp0 += p6_r;
1065  tmp0 += q0_r;
1066 
       /* From here on src walks down one row per store, starting at the
        * p6 row. */
1067  src -= 7 * pitch;
1068 
       /* tmp1 is a running sum with total weight 16.  Its first value is
        * 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0; every later
        * output is formed by adding a four-term delta to it, so the
        * statement order below must not change.  Each output is tmp1
        * rounded and shifted right by 4 (__msa_srari_h). */
1069  /* calculation of p6 and p5 */
1070  tmp1 = p6_r + p5_r + p4_r + p3_r;
1071  tmp1 += (p2_r + p1_r + p0_r);
1072  tmp1 += tmp0;
1073  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074  tmp0 = p5_r - p6_r + q1_r - p7_r;
1075  tmp1 += tmp0;
1076  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1077  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1078  p0_filter16, p1_filter16);
       /* p0_filter16 / p1_filter16 are reused as scratch for every row
        * pair; the names do not refer to rows p0 / p1. */
1079  p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080  p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1083  SD(dword0, src);
1084  src += pitch;
1085  SD(dword1, src);
1086  src += pitch;
1087 
1088  /* calculation of p4 and p3 */
1089  tmp0 = p4_r - p5_r + q2_r - p7_r;
1090  tmp2 = p3_r - p4_r + q3_r - p7_r;
1091  tmp1 += tmp0;
1092  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1093  tmp1 += tmp2;
1094  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1095  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1096  p0_filter16, p1_filter16);
1097  p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098  p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1101  SD(dword0, src);
1102  src += pitch;
1103  SD(dword1, src);
1104  src += pitch;
1105 
       /* For rows p2..q2 the fallback where flat2 is clear is the
        * filter8/filter4 blend computed earlier (p2_out..q2_out), not the
        * raw pixel. */
1106  /* calculation of p2 and p1 */
1107  tmp0 = p2_r - p3_r + q4_r - p7_r;
1108  tmp2 = p1_r - p2_r + q5_r - p7_r;
1109  tmp1 += tmp0;
1110  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1111  tmp1 += tmp2;
1112  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1113  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1114  p0_filter16, p1_filter16);
1115  p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116  p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1119  SD(dword0, src);
1120  src += pitch;
1121  SD(dword1, src);
1122  src += pitch;
1123 
1124  /* calculation of p0 and q0 */
1125  tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126  tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1127  tmp1 += tmp0;
1128  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1129  tmp1 += tmp2;
1130  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1131  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1132  p0_filter16, p1_filter16);
1133  p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134  p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1137  SD(dword0, src);
1138  src += pitch;
1139  SD(dword1, src);
1140  src += pitch;
1141 
1142  /* calculation of q1 and q2 */
1143  tmp0 = q7_r - q0_r + q1_r - p6_r;
1144  tmp2 = q7_r - q1_r + q2_r - p5_r;
1145  tmp1 += tmp0;
1146  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1147  tmp1 += tmp2;
1148  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1149  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1150  p0_filter16, p1_filter16);
1151  p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152  p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155  SD(dword0, src);
1156  src += pitch;
1157  SD(dword1, src);
1158  src += pitch;
1159 
1160  /* calculation of q3 and q4 */
1161  tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162  tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1163  tmp1 += tmp0;
1164  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1165  tmp1 += tmp2;
1166  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1167  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1168  p0_filter16, p1_filter16);
1169  p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170  p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173  SD(dword0, src);
1174  src += pitch;
1175  SD(dword1, src);
1176  src += pitch;
1177 
1178  /* calculation of q5 and q6 */
1179  tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180  tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1181  tmp1 += tmp0;
1182  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1183  tmp1 += tmp2;
1184  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1185  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1186  p0_filter16, p1_filter16);
1187  p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188  p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191  SD(dword0, src);
1192  src += pitch;
1193  SD(dword1, src);
1194  }
1195  }
1196 }
1197 
1198 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199  int32_t b_limit_ptr,
1200  int32_t limit_ptr,
1201  int32_t thresh_ptr)
1202 {
1203  v16u8 mask, hev, flat, limit, thresh, b_limit;
1204  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205  v8i16 vec0, vec1, vec2, vec3;
1206 
1207  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208 
1209  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211  limit = (v16u8) __msa_fill_b(limit_ptr);
1212 
1213  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214  p3, p2, p1, p0, q0, q1, q2, q3);
1215  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216  hev, mask, flat);
1217  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1218  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220 
1221  src -= 2;
1222  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223 }
1224 
1225 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226  int32_t b_limit_ptr,
1227  int32_t limit_ptr,
1228  int32_t thresh_ptr)
1229 {
1230  v16u8 mask, hev, flat;
1231  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236 
1237  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238  LD_UB8(src - 4 + (8 * pitch), pitch,
1239  row8, row9, row10, row11, row12, row13, row14, row15);
1240 
1241  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242  row8, row9, row10, row11, row12, row13, row14, row15,
1243  p3, p2, p1, p0, q0, q1, q2, q3);
1244 
1245  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248 
1249  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252 
1253  limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256 
1257  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258  hev, mask, flat);
1259  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264 
1265  src -= 2;
1266 
1267  ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268  ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269 }
1270 
1271 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272  int32_t b_limit_ptr,
1273  int32_t limit_ptr,
1274  int32_t thresh_ptr)
1275 {
1276  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277  v16u8 p1_out, p0_out, q0_out, q1_out;
1278  v16u8 flat, mask, hev, thresh, b_limit, limit;
1279  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282  v16u8 zero = { 0 };
1283  v8i16 vec0, vec1, vec2, vec3, vec4;
1284 
1285  /* load vector elements */
1286  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287 
1288  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289  p3, p2, p1, p0, q0, q1, q2, q3);
1290 
1291  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293  limit = (v16u8) __msa_fill_b(limit_ptr);
1294 
1295  /* mask and hev */
1296  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297  hev, mask, flat);
1298  /* flat4 */
1299  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300  /* filter4 */
1301  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302  q1_out);
1303 
1304  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305 
1306  /* if flat is zero for all pixels, then no need to calculate other filter */
1307  if (__msa_test_bz_v(flat)) {
1308  /* Store 4 pixels p1-_q1 */
1309  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311 
1312  src -= 2;
1313  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314  } else {
1315  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317  q3_r);
1318  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320  /* convert 16 bit output data into 8 bit */
1321  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322  p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323  p0_filt8_r, q0_filt8_r);
1324  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325  q2_filt8_r);
1326 
1327  /* store pixel values */
1328  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334 
1335  /* Store 6 pixels p2-_q2 */
1336  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338  vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339 
1340  src -= 3;
1341  ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342  ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343  src += (4 * pitch);
1344  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345  ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346  }
1347 }
1348 
/*
 * Filter4 / filter8 loop filter across the column boundary at src, 16 rows
 * tall, with filter8 allowed in both 8-row halves.
 *
 * Each threshold argument packs two bytes: the low byte fills the low
 * doubleword of the threshold vector, the next byte (value >> 8) fills the
 * high doubleword.  Low lanes are stored to rows 0..7 and high lanes to
 * rows 8..15.
 *
 * If no bit of `flat` is set, 4 bytes per row (p1..q1) are written at
 * src - 2; otherwise 6 bytes per row (p2..q2) are written at src - 3.
 *
 * src                                 pixel just right of the filtered edge
 * pitch                               byte distance between rows
 * b_limit_ptr, limit_ptr, thresh_ptr  packed threshold pairs, passed by
 *                                     value despite the _ptr suffix
 */
1349 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350  int32_t b_limit_ptr,
1351  int32_t limit_ptr,
1352  int32_t thresh_ptr)
1353 {
1354  uint8_t *temp_src;
1355  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356  v16u8 p1_out, p0_out, q0_out, q1_out;
1357  v16u8 flat, mask, hev, thresh, b_limit, limit;
1358  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365  v16u8 zero = { 0 };
1366  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367 
1368  temp_src = src - 4;
1369 
     /* Rows 0..3 and 8..11 are loaded straight into the p and q variables,
      * which are also the transpose outputs below (note the q3..q0 order
      * for rows 8..11).
      * NOTE(review): this relies on TRANSPOSE16x8_UB_UB tolerating that
      * particular input/output aliasing - confirm against the macro before
      * renaming or reordering anything here. */
1370  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371  temp_src += (8 * pitch);
1372  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373 
1374  /* transpose 16x8 matrix into 8x16 */
1375  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376  q3, q2, q1, q0, row12, row13, row14, row15,
1377  p3, p2, p1, p0, q0, q1, q2, q3);
1378 
     /* vec0 is only scratch for the second-byte broadcast of each
      * threshold. */
1379  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382 
1383  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386 
1387  limit = (v16u8) __msa_fill_b(limit_ptr);
1388  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390 
1391  /* mask and hev */
1392  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393  hev, mask, flat);
1394  /* flat4 */
1395  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396  /* filter4 */
1397  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398  q1_out);
1399 
     /* Unlike the 84 / 48 variants, flat is left untouched here, so all 16
      * lanes may select the filter8 result. */
1400  /* if flat is zero for all pixels, then no need to calculate other filter */
1401  if (__msa_test_bz_v(flat)) {
1402  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406 
1407  src -= 2;
1408  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410  } else {
      /* filter8 on the low 8 lanes (_r) ... */
1411  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413  q3_r);
1414  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416 
      /* ... and on the high 8 lanes (_l). */
1417  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418  p0_l);
1419  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420  q3_l);
1421 
1422  /* filter8 */
1423  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425 
      /* The packed 16-lane results overwrite the *_filt8_r variables. */
1426  /* convert 16 bit output data into 8 bit */
1427  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429  p0_filt8_r, q0_filt8_r);
1430  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431  q2_filt8_r);
1432 
1433  /* store pixel values */
1434  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440 
      /* vec3/vec4/vec6/vec7 hold the p2 p1 p0 q0 words for rows 0..3,
       * 4..7, 8..11 and 12..15; vec2/vec5 hold the q1 q2 halfwords for
       * rows 0..7 and 8..15. */
1441  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445  ILVRL_B2_SH(q2, q1, vec2, vec5);
1446 
1447  src -= 3;
1448  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450  src += (4 * pitch);
1451  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453  src += (4 * pitch);
1454  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456  src += (4 * pitch);
1457  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459  }
1460 }
1461 
/*
 * Filter4 / filter8 loop filter across the column boundary at src, 16 rows
 * tall, where filter8 may only apply to the low 8 lanes (stored to rows
 * 0..7); the high 8 lanes (rows 8..15) always get the filter4 result.
 *
 * Each threshold argument packs two bytes: the low byte fills the low
 * doubleword of the threshold vector, the next byte (value >> 8) fills the
 * high doubleword.
 *
 * If no bit of the narrowed `flat` is set, 4 bytes per row (p1..q1) are
 * written at src - 2; otherwise 6 bytes per row (p2..q2) are written at
 * src - 3.
 *
 * src                                 pixel just right of the filtered edge
 * pitch                               byte distance between rows
 * b_limit_ptr, limit_ptr, thresh_ptr  packed threshold pairs, passed by
 *                                     value despite the _ptr suffix
 */
1462 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463  int32_t b_limit_ptr,
1464  int32_t limit_ptr,
1465  int32_t thresh_ptr)
1466 {
1467  uint8_t *temp_src;
1468  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469  v16u8 p1_out, p0_out, q0_out, q1_out;
1470  v16u8 flat, mask, hev, thresh, b_limit, limit;
1471  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475  v16u8 zero = { 0 };
1476  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477 
1478  temp_src = src - 4;
1479 
     /* Rows 0..3 and 8..11 are loaded straight into the p and q variables,
      * which are also the transpose outputs below.
      * NOTE(review): this relies on TRANSPOSE16x8_UB_UB tolerating that
      * particular input/output aliasing - confirm against the macro before
      * renaming or reordering anything here. */
1480  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481  temp_src += (8 * pitch);
1482  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483 
1484  /* transpose 16x8 matrix into 8x16 */
1485  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486  q3, q2, q1, q0, row12, row13, row14, row15,
1487  p3, p2, p1, p0, q0, q1, q2, q3);
1488 
     /* vec0 is only scratch for the second-byte broadcast of each
      * threshold. */
1489  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492 
1493  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496 
1497  limit = (v16u8) __msa_fill_b(limit_ptr);
1498  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500 
1501  /* mask and hev */
1502  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503  hev, mask, flat);
1504  /* flat4 */
1505  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506  /* filter4 */
1507  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508  q1_out);
1509 
     /* Keep flat's low doubleword and take the high doubleword from
      * `zero`: lanes 8..15 can no longer select the filter8 result. */
1510  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511 
1512  /* if flat is zero for all pixels, then no need to calculate other filter */
1513  if (__msa_test_bz_v(flat)) {
1514  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518 
1519  src -= 2;
1520  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522  } else {
      /* filter8 is computed for the low 8 lanes only. */
1523  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525  q3_r);
1526  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528 
      /* Packing each result with itself also fills the high lanes, but
       * flat is zero there, so the blend below ignores them. */
1529  /* convert 16 bit output data into 8 bit */
1530  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534  q1_filt8_r, q2_filt8_r);
1535 
1536  /* store pixel values */
1537  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543 
      /* vec3/vec4/vec6/vec7 hold the p2 p1 p0 q0 words for rows 0..3,
       * 4..7, 8..11 and 12..15; vec2/vec5 hold the q1 q2 halfwords for
       * rows 0..7 and 8..15. */
1544  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548  ILVRL_B2_SH(q2, q1, vec2, vec5);
1549 
1550  src -= 3;
1551  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553  src += (4 * pitch);
1554  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556  src += (4 * pitch);
1557  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559  src += (4 * pitch);
1560  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562  }
1563 }
1564 
/*
 * Filter4 / filter8 loop filter across the column boundary at src, 16 rows
 * tall, where filter8 may only apply to the high 8 lanes (stored to rows
 * 8..15); the low 8 lanes (rows 0..7) always get the filter4 result.
 *
 * Each threshold argument packs two bytes: the low byte fills the low
 * doubleword of the threshold vector, the next byte (value >> 8) fills the
 * high doubleword.
 *
 * If no bit of the narrowed `flat` is set, 4 bytes per row (p1..q1) are
 * written at src - 2; otherwise 6 bytes per row (p2..q2) are written at
 * src - 3.
 *
 * src                                 pixel just right of the filtered edge
 * pitch                               byte distance between rows
 * b_limit_ptr, limit_ptr, thresh_ptr  packed threshold pairs, passed by
 *                                     value despite the _ptr suffix
 */
1565 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566  int32_t b_limit_ptr,
1567  int32_t limit_ptr,
1568  int32_t thresh_ptr)
1569 {
1570  uint8_t *temp_src;
1571  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572  v16u8 p1_out, p0_out, q0_out, q1_out;
1573  v16u8 flat, mask, hev, thresh, b_limit, limit;
1574  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578  v16u8 zero = { 0 };
1579  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580 
1581  temp_src = src - 4;
1582 
     /* Rows 0..3 and 8..11 are loaded straight into the p and q variables,
      * which are also the transpose outputs below.
      * NOTE(review): this relies on TRANSPOSE16x8_UB_UB tolerating that
      * particular input/output aliasing - confirm against the macro before
      * renaming or reordering anything here. */
1583  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584  temp_src += (8 * pitch);
1585  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586 
1587  /* transpose 16x8 matrix into 8x16 */
1588  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589  q3, q2, q1, q0, row12, row13, row14, row15,
1590  p3, p2, p1, p0, q0, q1, q2, q3);
1591 
     /* vec0 is only scratch for the second-byte broadcast of each
      * threshold. */
1592  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595 
1596  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599 
1600  limit = (v16u8) __msa_fill_b(limit_ptr);
1601  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603 
1604  /* mask and hev */
1605  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606  hev, mask, flat);
1607  /* flat4 */
1608  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609  /* filter4 */
1610  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611  q1_out);
1612 
     /* Overwrite doubleword 0 of flat with doubleword 0 of `zero`: lanes
      * 0..7 can no longer select the filter8 result, lanes 8..15 keep
      * their flat bits. */
1613  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614 
1615  /* if flat is zero for all pixels, then no need to calculate other filter */
1616  if (__msa_test_bz_v(flat)) {
1617  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621 
1622  src -= 2;
1623  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625  } else {
      /* filter8 is computed for the high 8 lanes only (ILVL = left half). */
1626  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627  p0_l);
1628  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629  q3_l);
1630 
1631  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633 
      /* Packing each result with itself places a copy in the high lanes,
       * where flat may be set; the copy in the low lanes is ignored by the
       * blend because flat is zero there. */
1634  /* convert 16 bit output data into 8 bit */
1635  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639  q1_filt8_l, q2_filt8_l);
1640 
1641  /* store pixel values */
1642  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648 
      /* vec3/vec4/vec6/vec7 hold the p2 p1 p0 q0 words for rows 0..3,
       * 4..7, 8..11 and 12..15; vec2/vec5 hold the q1 q2 halfwords for
       * rows 0..7 and 8..15. */
1649  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653  ILVRL_B2_SH(q2, q1, vec2, vec5);
1654 
1655  src -= 3;
1656  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658  src += (4 * pitch);
1659  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661  src += (4 * pitch);
1662  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664  src += (4 * pitch);
1665  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667  }
1668 }
1669 
/*
 * NOTE(review): the first line of this definition (return type, function
 * name and the leading parameters) is missing from this listing - the
 * embedded numbering jumps from 1669 to 1671.  The body reads `input` and
 * `in_pitch`, so they are presumably declared on that missing line; restore
 * it from the upstream file.
 *
 * Transpose helper: loads 8 vectors from input (stride in_pitch) and stores
 * 16 vectors to output (stride out_pitch).  p7..p0 come from
 * TRANSPOSE8x8_UB_UB; q0..q7 are built by the interleave sequence from the
 * left (ILVL) halves of the same 8 input vectors.
 */
1671  uint8_t *output, int32_t out_pitch)
1672 {
1673  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676 
1677  LD_UB8(input, in_pitch,
1678  p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1679  /* 8x8 transpose */
1680  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1681  p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1682  /* 8x8 transpose */
1683  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1684  tmp0, tmp1, tmp2, tmp3);
1685  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1686  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1687  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1688  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
     /* The odd-numbered outputs q1, q3, q5, q7 are derived from q0, q2, q4,
      * q6 with an 8-byte slide.  NOTE(review): exact SLDI_B4_0_UB semantics
      * are defined outside this view - confirm there. */
1689  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
1690 
1691  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1692  output += (8 * out_pitch);
1693  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1694 }
1695 
/*
 * NOTE(review): the first line of this definition (return type, function
 * name and the leading parameters) is missing from this listing - the
 * embedded numbering jumps from 1695 to 1697.  The body reads `input` and
 * `in_pitch`, so they are presumably declared on that missing line; restore
 * it from the upstream file.
 *
 * Transpose helper: loads 16 vectors from input (stride in_pitch), passes
 * them through TRANSPOSE16x8_UB_UB and stores the 8 resulting vectors to
 * output (stride out_pitch).
 */
1697  uint8_t *output, int32_t out_pitch)
1698 {
1699  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1700  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1701 
1702  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1703  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1704  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1705  q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1706  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1707 }
1708 
/*
 * Transpose a 16x16 block of bytes.
 *
 * input      first of 16 source rows, 16 bytes loaded per row
 * in_pitch   distance in bytes between consecutive source rows
 * output     destination for the 16 transposed rows, 16 bytes stored per row
 * out_pitch  distance in bytes between consecutive destination rows
 *
 * Output rows 0..7 (p7..p0) come from TRANSPOSE16x8_UB_UB; output rows
 * 8..15 (q0..q7) are built by the explicit interleave network below.
 */
1709 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1710  uint8_t *output, int32_t out_pitch)
1711 {
1712  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1713  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1714  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1715  v4i32 tmp2, tmp3;
1716  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1717 
1718  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1719  input += (8 * in_pitch);
1720  LD_UB8(input, in_pitch,
1721  row8, row9, row10, row11, row12, row13, row14, row15);
1722 
1723  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1724  row8, row9, row10, row11, row12, row13, row14, row15,
1725  p7, p6, p5, p4, p3, p2, p1, p0);
1726 
1727  /* transpose 16x8 matrix into 8x16 */
1728  /* total 8 intermediate register and 32 instructions */
  /* pair the odd (upper) doublewords of row N and row N+8 in one vector */
1729  q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1730  q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1731  q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1732  q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1733  q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1734  q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1735  q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1736  q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1737 
  /* byte-level interleave of q7..q4; their last reads are the next 3 lines */
1738  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1739  tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1740  tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1741 
  /* q5 and q7 are overwritten here and used as temporaries from now on,
   * so this statement must stay after the reads of q5/q7 above */
1742  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1743  tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1744  tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1745 
  /* halfword then word interleaves produce the final rows q0/q4 and q2/q6 */
1746  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1747  q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1748  q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1749 
1750  tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1751  tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1752  q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1753  q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1754 
  /* same two steps on the odd-byte interleaves give q1/q5 and q3/q7;
   * q5 and q7 receive their final values only here */
1755  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1756  q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1757  q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1758 
1759  tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1760  tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1761  q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1762  q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1763 
1764  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1765  output += (8 * out_pitch);
1766  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1767 }
1768 
1770  uint8_t *src_org, int32_t pitch_org,
1771  int32_t b_limit_ptr,
1772  int32_t limit_ptr,
1773  int32_t thresh_ptr)
1774 {
1775  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1776  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1777  v16u8 flat, mask, hev, thresh, b_limit, limit;
1778  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1779  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1780  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1781  v16i8 zero = { 0 };
1782  v8i16 vec0, vec1, vec2, vec3;
1783 
1784  /* load vector elements */
1785  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1786 
1787  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1788  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1789  limit = (v16u8) __msa_fill_b(limit_ptr);
1790 
1791  /* mask and hev */
1792  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1793  hev, mask, flat);
1794  /* flat4 */
1795  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1796  /* filter4 */
1797  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1798  q1_out);
1799 
1800  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1801 
1802  /* if flat is zero for all pixels, then no need to calculate other filter */
1803  if (__msa_test_bz_v(flat)) {
1804  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1805  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1806  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1807  return 1;
1808  } else {
1809  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1810  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1811  q3_r);
1812  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1813  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1814 
1815  /* convert 16 bit output data into 8 bit */
1816  p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1817  p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1818  p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1819  q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1820  q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1821  q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1822 
1823  /* store pixel values */
1824  p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1825  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1826  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1827  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1828  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1829  q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1830 
1831  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1832  filter48 += (4 * 16);
1833  ST_UB2(q1_out, q2_out, filter48, 16);
1834  filter48 += (2 * 16);
1835  ST_UB(flat, filter48);
1836 
1837  return 0;
1838  }
1839 }
1840 
/*
 * 16-tap (wide) stage of the loop filter on a vertical edge, 8 pixel rows at
 * a time, working on the transposed buffer (row pitch 16).
 *
 * src       points at the q0 row of the transposed buffer; p7..p0 are read
 *           from the 8 rows before it and q0..q7 from src onwards
 * src_org   position of the edge in the real picture
 * pitch     picture stride in bytes
 * filter48  rows 0..5 hold the p2, p1, p0, q0, q1, q2 values and row 6 the
 *           flat mask left by the preceding filter4/filter8 stage
 *
 * Returns 1 when flat2 is zero everywhere: the six filter48 rows have then
 * been written straight to the picture (6 bytes per row, 8 rows, starting
 * 3 bytes left of src_org).
 * Returns 0 when the wide filter ran; rows p6..q6 of the transposed buffer
 * have been updated (8 bytes per row) and still need transposing back.
 */
1841 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1842  uint8_t *filter48)
1843 {
1844  v16i8 zero = { 0 };
1845  v16u8 filter8, flat, flat2;
1846  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1847  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1848  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1849  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1850  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1851  v8u16 tmp0_r, tmp1_r;
1852  v8i16 r_out;
1853 
  /* flat mask saved in row 6 of filter48 */
1854  flat = LD_UB(filter48 + 6 * 16);
1855 
1856  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1857  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1858 
  /* flat2 selects the lanes that get the wide filter */
1859  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1860 
1861  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1862  if (__msa_test_bz_v(flat2)) {
1863  v8i16 vec0, vec1, vec2, vec3, vec4;
1864 
1865  LD_UB4(filter48, 16, p2, p1, p0, q0);
1866  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1867 
  /* interleave p2 p1 p0 q0 (4 bytes) and q1 q2 (2 bytes) per picture row */
1868  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1869  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1870  vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1871 
1872  src_org -= 3;
1873  ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1874  ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1875  src_org += (4 * pitch);
1876  ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1877  ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1878 
1879  return 1;
1880  } else {
  /* rewind to the p6 row; results are written p6..q6, one row per step */
1881  src -= 7 * 16;
1882 
1883  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1884  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1885  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1886  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1887 
  /* p6: tmp1_r = 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0,
   * then a rounding shift right by 4 */
1888  tmp0_r = p7_r_in << 3;
1889  tmp0_r -= p7_r_in;
1890  tmp0_r += p6_r_in;
1891  tmp0_r += q0_r_in;
1892  tmp1_r = p6_r_in + p5_r_in;
1893  tmp1_r += p4_r_in;
1894  tmp1_r += p3_r_in;
1895  tmp1_r += p2_r_in;
1896  tmp1_r += p1_r_in;
1897  tmp1_r += p0_r_in;
1898  tmp1_r += tmp0_r;
1899 
1900  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1901  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
  /* keep the original pixel where flat2 is clear */
1902  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1903  ST_D1(p6, 0, src);
1904  src += 16;
1905 
  /* Every following step updates the running sum tmp1_r in place: tmp0_r
   * holds the taps that enter minus the taps that leave the window. */
1906  /* p5 */
1907  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1908  tmp0_r = p5_r_in - p6_r_in;
1909  tmp0_r += q1_r_in;
1910  tmp0_r -= p7_r_in;
1911  tmp1_r += tmp0_r;
1912  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1913  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1914  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1915  ST_D1(p5, 0, src);
1916  src += 16;
1917 
1918  /* p4 */
1919  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1920  tmp0_r = p4_r_in - p5_r_in;
1921  tmp0_r += q2_r_in;
1922  tmp0_r -= p7_r_in;
1923  tmp1_r += tmp0_r;
1924  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1925  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1926  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1927  ST_D1(p4, 0, src);
1928  src += 16;
1929 
1930  /* p3 */
1931  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1932  tmp0_r = p3_r_in - p4_r_in;
1933  tmp0_r += q3_r_in;
1934  tmp0_r -= p7_r_in;
1935  tmp1_r += tmp0_r;
1936  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1937  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1938  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1939  ST_D1(p3, 0, src);
1940  src += 16;
1941 
  /* From p2 to q2 the value kept where flat2 is clear is the
   * filter4/filter8 result saved in filter48, not the unfiltered pixel. */
1942  /* p2 */
1943  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1944  filter8 = LD_UB(filter48);
1945  tmp0_r = p2_r_in - p3_r_in;
1946  tmp0_r += q4_r_in;
1947  tmp0_r -= p7_r_in;
1948  tmp1_r += tmp0_r;
1949  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1950  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1951  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1952  ST_D1(filter8, 0, src);
1953  src += 16;
1954 
1955  /* p1 */
1956  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1957  filter8 = LD_UB(filter48 + 16);
1958  tmp0_r = p1_r_in - p2_r_in;
1959  tmp0_r += q5_r_in;
1960  tmp0_r -= p7_r_in;
1961  tmp1_r += tmp0_r;
1962  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1963  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1964  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1965  ST_D1(filter8, 0, src);
1966  src += 16;
1967 
1968  /* p0 */
1969  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1970  filter8 = LD_UB(filter48 + 32);
1971  tmp0_r = p0_r_in - p1_r_in;
1972  tmp0_r += q6_r_in;
1973  tmp0_r -= p7_r_in;
1974  tmp1_r += tmp0_r;
1975  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1976  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1977  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1978  ST_D1(filter8, 0, src);
1979  src += 16;
1980 
1981  /* q0 */
1982  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1983  filter8 = LD_UB(filter48 + 48);
1984  tmp0_r = q7_r_in - p0_r_in;
1985  tmp0_r += q0_r_in;
1986  tmp0_r -= p7_r_in;
1987  tmp1_r += tmp0_r;
1988  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1989  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1990  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1991  ST_D1(filter8, 0, src);
1992  src += 16;
1993 
1994  /* q1 */
1995  filter8 = LD_UB(filter48 + 64);
1996  tmp0_r = q7_r_in - q0_r_in;
1997  tmp0_r += q1_r_in;
1998  tmp0_r -= p6_r_in;
1999  tmp1_r += tmp0_r;
2000  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2001  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2002  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2003  ST_D1(filter8, 0, src);
2004  src += 16;
2005 
2006  /* q2 */
2007  filter8 = LD_UB(filter48 + 80);
2008  tmp0_r = q7_r_in - q1_r_in;
2009  tmp0_r += q2_r_in;
2010  tmp0_r -= p5_r_in;
2011  tmp1_r += tmp0_r;
2012  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2013  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2014  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2015  ST_D1(filter8, 0, src);
2016  src += 16;
2017 
2018  /* q3 */
2019  tmp0_r = q7_r_in - q2_r_in;
2020  tmp0_r += q3_r_in;
2021  tmp0_r -= p4_r_in;
2022  tmp1_r += tmp0_r;
2023  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2024  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2025  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2026  ST_D1(q3, 0, src);
2027  src += 16;
2028 
2029  /* q4 */
2030  tmp0_r = q7_r_in - q3_r_in;
2031  tmp0_r += q4_r_in;
2032  tmp0_r -= p3_r_in;
2033  tmp1_r += tmp0_r;
2034  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2035  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2036  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2037  ST_D1(q4, 0, src);
2038  src += 16;
2039 
2040  /* q5 */
2041  tmp0_r = q7_r_in - q4_r_in;
2042  tmp0_r += q5_r_in;
2043  tmp0_r -= p2_r_in;
2044  tmp1_r += tmp0_r;
2045  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2046  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2047  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2048  ST_D1(q5, 0, src);
2049  src += 16;
2050 
2051  /* q6 */
2052  tmp0_r = q7_r_in - q5_r_in;
2053  tmp0_r += q6_r_in;
2054  tmp0_r -= p1_r_in;
2055  tmp1_r += tmp0_r;
2056  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2057  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2058  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2059  ST_D1(q6, 0, src);
2060 
2061  return 0;
2062  }
2063 }
2064 
2065 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2066  int32_t b_limit_ptr,
2067  int32_t limit_ptr,
2068  int32_t thresh_ptr)
2069 {
2070  uint8_t early_exit = 0;
2071  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2072  uint8_t *filter48 = &transposed_input[16 * 16];
2073 
2074  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2075 
2076  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2077  &filter48[0], src, pitch,
2078  b_limit_ptr, limit_ptr, thresh_ptr);
2079 
2080  if (0 == early_exit) {
2081  early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2082  &filter48[0]);
2083 
2084  if (0 == early_exit) {
2085  vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2086  }
2087  }
2088 }
2089 
2091  uint8_t *src_org, ptrdiff_t pitch,
2092  int32_t b_limit_ptr,
2093  int32_t limit_ptr,
2094  int32_t thresh_ptr)
2095 {
2096  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2097  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2098  v16u8 flat, mask, hev, thresh, b_limit, limit;
2099  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2100  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2101  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2102  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2103  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2104  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2105  v16i8 zero = { 0 };
2106  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2107 
2108  /* load vector elements */
2109  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2110 
2111  thresh = (v16u8) __msa_fill_b(thresh_ptr);
2112  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2113  limit = (v16u8) __msa_fill_b(limit_ptr);
2114 
2115  /* mask and hev */
2116  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2117  hev, mask, flat);
2118  /* flat4 */
2119  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2120  /* filter4 */
2121  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2122  q1_out);
2123 
2124  /* if flat is zero for all pixels, then no need to calculate other filter */
2125  if (__msa_test_bz_v(flat)) {
2126  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2127  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2128  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2129  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2130 
2131  src_org -= 2;
2132  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2133  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2134 
2135  return 1;
2136  } else {
2137  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2138  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2139  q3_r);
2140  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2141  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2142  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2143  p0_l);
2144  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2145  q3_l);
2146  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2147  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2148 
2149  /* convert 16 bit output data into 8 bit */
2150  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2151  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2152  p0_filt8_r, q0_filt8_r);
2153  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2154  q2_filt8_r);
2155 
2156  /* store pixel values */
2157  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2158  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2159  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2160  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2161  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2162  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2163 
2164  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2165  filter48 += (4 * 16);
2166  ST_UB2(q1_out, q2_out, filter48, 16);
2167  filter48 += (2 * 16);
2168  ST_UB(flat, filter48);
2169 
2170  return 0;
2171  }
2172 }
2173 
/*
 * 16-tap (wide) stage of the loop filter on a vertical edge, 16 pixel rows
 * at a time, working on the transposed buffer (row pitch 16).
 *
 * src       points at the q0 row of the transposed buffer; p7..p0 are read
 *           from the 8 rows before it and q0..q7 from src onwards
 * src_org   position of the edge in the real picture
 * pitch     picture stride in bytes
 * filter48  rows 0..5 hold the p2, p1, p0, q0, q1, q2 values and row 6 the
 *           flat mask left by the preceding filter4/filter8 stage
 *
 * Returns 1 when flat2 is zero everywhere: the six filter48 rows have then
 * been written straight to the picture (6 bytes per row, 16 rows, starting
 * 3 bytes left of src_org).
 * Returns 0 when the wide filter ran; rows p6..q6 of the transposed buffer
 * have been updated (16 bytes per row) and still need transposing back.
 *
 * The _r variables carry the right (low) 8 lanes and the _l variables the
 * left (high) 8 lanes, both widened to 16 bit.
 */
2174 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2175  uint8_t *filter48)
2176 {
2177  v16u8 flat, flat2, filter8;
2178  v16i8 zero = { 0 };
2179  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2180  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2181  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2182  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2183  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2184  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2185  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2186  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2187  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2188  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2189  v8i16 l_out, r_out;
2190 
  /* flat mask saved in row 6 of filter48 */
2191  flat = LD_UB(filter48 + 6 * 16);
2192 
2193  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2194  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2195 
  /* flat2 selects the lanes that get the wide filter */
2196  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2197 
2198  /* if flat2 is zero for all pixels, then no need to calculate other filter */
2199  if (__msa_test_bz_v(flat2)) {
2200  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2201 
2202  LD_UB4(filter48, 16, p2, p1, p0, q0);
2203  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2204 
  /* interleave p2 p1 p0 q0 (4 bytes) and q1 q2 (2 bytes) per picture row */
2205  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2206  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2207  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2208  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2209  ILVRL_B2_SH(q2, q1, vec2, vec5);
2210 
2211  src_org -= 3;
2212  ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2213  ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2214  src_org += (4 * pitch);
2215  ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2216  ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2217  src_org += (4 * pitch);
2218  ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2219  ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2220  src_org += (4 * pitch);
2221  ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2222  ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2223 
2224  return 1;
2225  } else {
  /* rewind to the p6 row; results are written p6..q6, one row per step */
2226  src -= 7 * 16;
2227 
2228  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2229  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2230  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2231  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2232 
  /* p6: tmp1 = 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0,
   * then a rounding shift right by 4 (right half first, then left half) */
2233  tmp0_r = p7_r_in << 3;
2234  tmp0_r -= p7_r_in;
2235  tmp0_r += p6_r_in;
2236  tmp0_r += q0_r_in;
2237  tmp1_r = p6_r_in + p5_r_in;
2238  tmp1_r += p4_r_in;
2239  tmp1_r += p3_r_in;
2240  tmp1_r += p2_r_in;
2241  tmp1_r += p1_r_in;
2242  tmp1_r += p0_r_in;
2243  tmp1_r += tmp0_r;
2244  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2245 
2246  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2247  p5_l_in, p4_l_in);
2248  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2249  p1_l_in, p0_l_in);
2250  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2251 
2252  tmp0_l = p7_l_in << 3;
2253  tmp0_l -= p7_l_in;
2254  tmp0_l += p6_l_in;
2255  tmp0_l += q0_l_in;
2256  tmp1_l = p6_l_in + p5_l_in;
2257  tmp1_l += p4_l_in;
2258  tmp1_l += p3_l_in;
2259  tmp1_l += p2_l_in;
2260  tmp1_l += p1_l_in;
2261  tmp1_l += p0_l_in;
2262  tmp1_l += tmp0_l;
2263  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2264 
  /* pack both halves back to bytes; keep the original pixel where
   * flat2 is clear */
2265  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2266  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2267  ST_UB(p6, src);
2268  src += 16;
2269 
  /* Every following step updates the running sums tmp1_r / tmp1_l in place:
   * tmp0_* holds the taps that enter minus the taps that leave the window. */
2270  /* p5 */
2271  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2272  tmp0_r = p5_r_in - p6_r_in;
2273  tmp0_r += q1_r_in;
2274  tmp0_r -= p7_r_in;
2275  tmp1_r += tmp0_r;
2276  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2277  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2278  tmp0_l = p5_l_in - p6_l_in;
2279  tmp0_l += q1_l_in;
2280  tmp0_l -= p7_l_in;
2281  tmp1_l += tmp0_l;
2282  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2283  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2284  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2285  ST_UB(p5, src);
2286  src += 16;
2287 
2288  /* p4 */
2289  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2290  tmp0_r = p4_r_in - p5_r_in;
2291  tmp0_r += q2_r_in;
2292  tmp0_r -= p7_r_in;
2293  tmp1_r += tmp0_r;
2294  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2295  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2296  tmp0_l = p4_l_in - p5_l_in;
2297  tmp0_l += q2_l_in;
2298  tmp0_l -= p7_l_in;
2299  tmp1_l += tmp0_l;
2300  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2301  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2302  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2303  ST_UB(p4, src);
2304  src += 16;
2305 
2306  /* p3 */
2307  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2308  tmp0_r = p3_r_in - p4_r_in;
2309  tmp0_r += q3_r_in;
2310  tmp0_r -= p7_r_in;
2311  tmp1_r += tmp0_r;
2312  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2313  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2314  tmp0_l = p3_l_in - p4_l_in;
2315  tmp0_l += q3_l_in;
2316  tmp0_l -= p7_l_in;
2317  tmp1_l += tmp0_l;
2318  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2319  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2320  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2321  ST_UB(p3, src);
2322  src += 16;
2323 
  /* From p2 to q2 the value kept where flat2 is clear is the
   * filter4/filter8 result saved in filter48, not the unfiltered pixel. */
2324  /* p2 */
2325  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2326  filter8 = LD_UB(filter48);
2327  tmp0_r = p2_r_in - p3_r_in;
2328  tmp0_r += q4_r_in;
2329  tmp0_r -= p7_r_in;
2330  tmp1_r += tmp0_r;
2331  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2332  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2333  tmp0_l = p2_l_in - p3_l_in;
2334  tmp0_l += q4_l_in;
2335  tmp0_l -= p7_l_in;
2336  tmp1_l += tmp0_l;
2337  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2338  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2339  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2340  ST_UB(filter8, src);
2341  src += 16;
2342 
2343  /* p1 */
2344  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2345  filter8 = LD_UB(filter48 + 16);
2346  tmp0_r = p1_r_in - p2_r_in;
2347  tmp0_r += q5_r_in;
2348  tmp0_r -= p7_r_in;
2349  tmp1_r += tmp0_r;
2350  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2351  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2352  tmp0_l = p1_l_in - p2_l_in;
2353  tmp0_l += q5_l_in;
2354  tmp0_l -= p7_l_in;
2355  tmp1_l += tmp0_l;
2356  l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2357  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2358  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2359  ST_UB(filter8, src);
2360  src += 16;
2361 
2362  /* p0 */
2363  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2364  filter8 = LD_UB(filter48 + 32);
2365  tmp0_r = p0_r_in - p1_r_in;
2366  tmp0_r += q6_r_in;
2367  tmp0_r -= p7_r_in;
2368  tmp1_r += tmp0_r;
2369  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2370  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2371  tmp0_l = p0_l_in - p1_l_in;
2372  tmp0_l += q6_l_in;
2373  tmp0_l -= p7_l_in;
2374  tmp1_l += tmp0_l;
2375  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2376  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2377  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2378  ST_UB(filter8, src);
2379  src += 16;
2380 
2381  /* q0 */
2382  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2383  filter8 = LD_UB(filter48 + 48);
2384  tmp0_r = q7_r_in - p0_r_in;
2385  tmp0_r += q0_r_in;
2386  tmp0_r -= p7_r_in;
2387  tmp1_r += tmp0_r;
2388  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2389  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2390  tmp0_l = q7_l_in - p0_l_in;
2391  tmp0_l += q0_l_in;
2392  tmp0_l -= p7_l_in;
2393  tmp1_l += tmp0_l;
2394  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2395  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2396  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2397  ST_UB(filter8, src);
2398  src += 16;
2399 
2400  /* q1 */
2401  filter8 = LD_UB(filter48 + 64);
2402  tmp0_r = q7_r_in - q0_r_in;
2403  tmp0_r += q1_r_in;
2404  tmp0_r -= p6_r_in;
2405  tmp1_r += tmp0_r;
2406  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2407  tmp0_l = q7_l_in - q0_l_in;
2408  tmp0_l += q1_l_in;
2409  tmp0_l -= p6_l_in;
2410  tmp1_l += tmp0_l;
2411  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2412  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2413  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2414  ST_UB(filter8, src);
2415  src += 16;
2416 
2417  /* q2 */
2418  filter8 = LD_UB(filter48 + 80);
2419  tmp0_r = q7_r_in - q1_r_in;
2420  tmp0_r += q2_r_in;
2421  tmp0_r -= p5_r_in;
2422  tmp1_r += tmp0_r;
2423  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2424  tmp0_l = q7_l_in - q1_l_in;
2425  tmp0_l += q2_l_in;
2426  tmp0_l -= p5_l_in;
2427  tmp1_l += tmp0_l;
2428  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2429  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2430  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2431  ST_UB(filter8, src);
2432  src += 16;
2433 
2434  /* q3 */
2435  tmp0_r = q7_r_in - q2_r_in;
2436  tmp0_r += q3_r_in;
2437  tmp0_r -= p4_r_in;
2438  tmp1_r += tmp0_r;
2439  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2440  tmp0_l = q7_l_in - q2_l_in;
2441  tmp0_l += q3_l_in;
2442  tmp0_l -= p4_l_in;
2443  tmp1_l += tmp0_l;
2444  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2445  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2446  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2447  ST_UB(q3, src);
2448  src += 16;
2449 
2450  /* q4 */
2451  tmp0_r = q7_r_in - q3_r_in;
2452  tmp0_r += q4_r_in;
2453  tmp0_r -= p3_r_in;
2454  tmp1_r += tmp0_r;
2455  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2456  tmp0_l = q7_l_in - q3_l_in;
2457  tmp0_l += q4_l_in;
2458  tmp0_l -= p3_l_in;
2459  tmp1_l += tmp0_l;
2460  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2461  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2462  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2463  ST_UB(q4, src);
2464  src += 16;
2465 
2466  /* q5 */
2467  tmp0_r = q7_r_in - q4_r_in;
2468  tmp0_r += q5_r_in;
2469  tmp0_r -= p2_r_in;
2470  tmp1_r += tmp0_r;
2471  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2472  tmp0_l = q7_l_in - q4_l_in;
2473  tmp0_l += q5_l_in;
2474  tmp0_l -= p2_l_in;
2475  tmp1_l += tmp0_l;
2476  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2477  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2478  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2479  ST_UB(q5, src);
2480  src += 16;
2481 
2482  /* q6 */
2483  tmp0_r = q7_r_in - q5_r_in;
2484  tmp0_r += q6_r_in;
2485  tmp0_r -= p1_r_in;
2486  tmp1_r += tmp0_r;
2487  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2488  tmp0_l = q7_l_in - q5_l_in;
2489  tmp0_l += q6_l_in;
2490  tmp0_l -= p1_l_in;
2491  tmp1_l += tmp0_l;
2492  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2493  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2494  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2495  ST_UB(q6, src);
2496 
2497  return 0;
2498  }
2499 }
2500 
2501 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2502  int32_t b_limit_ptr,
2503  int32_t limit_ptr,
2504  int32_t thresh_ptr)
2505 {
2506  uint8_t early_exit = 0;
2507  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2508  uint8_t *filter48 = &transposed_input[16 * 16];
2509 
2510  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2511 
2512  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2513  &filter48[0], src, pitch,
2514  b_limit_ptr, limit_ptr, thresh_ptr);
2515 
2516  if (0 == early_exit) {
2517  early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2518  &filter48[0]);
2519 
2520  if (0 == early_exit) {
2521  vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2522  }
2523  }
2524 }
ff_loop_filter_h_88_16_msa
void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1349
vp9_transpose_8x16_to_16x8
static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1696
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:458
q1
static const uint8_t q1[256]
Definition: twofish.c:96
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:336
ALLOC_ALIGNED
#define ALLOC_ALIGNED(align)
Definition: generic_macros_msa.h:33
ff_loop_filter_v_44_16_msa
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:225
ff_loop_filter_h_8_8_msa
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1271
ff_loop_filter_v_4_8_msa
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:196
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
PCKEV_B4_SH
#define PCKEV_B4_SH(...)
Definition: generic_macros_msa.h:1789
output
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, ...
Definition: filter_design.txt:225
ff_loop_filter_h_48_16_msa
void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1565
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
vp9_transpose_16x16
static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1709
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
generic_macros_msa.h
VP9_FLAT5
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, q6_in, q7_in, flat_in, flat2_out)
Definition: vp9_lpf_msa.c:89
src
#define src
Definition: vp8dsp.c:254
ILVR_W2_UB
#define ILVR_W2_UB(...)
Definition: generic_macros_msa.h:1465
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:36
VP9_FLAT4
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)
Definition: vp9_lpf_msa.c:68
ILVEV_B2_SH
#define ILVEV_B2_SH(...)
Definition: generic_macros_msa.h:1240
vp9_hz_lpf_t4_and_t8_16w
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:551
ILVEV_B2_UB
#define ILVEV_B2_UB(...)
Definition: generic_macros_msa.h:1238
ILVR_B8_UH
#define ILVR_B8_UH(...)
Definition: generic_macros_msa.h:1424
mask
static const uint16_t mask[17]
Definition: lzw.c:38
vp9_vt_lpf_t4_and_t8_8w
static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, int32_t pitch_org, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1769
ff_loop_filter_v_48_16_msa
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:479
q0
static const uint8_t q0[256]
Definition: twofish.c:77
vp9dsp_mips.h
int32_t
int32_t
Definition: audio_convert.c:194
vp9_hz_lpf_t16_16w
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:626
ILVL_B2_SH
#define ILVL_B2_SH(...)
Definition: generic_macros_msa.h:1315
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1769
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1557
ff_loop_filter_h_84_16_msa
void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1462
ILVEV_H2_SW
#define ILVEV_H2_SW(...)
Definition: generic_macros_msa.h:1259
ff_loop_filter_v_16_16_msa
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:958
ff_loop_filter_h_4_8_msa
void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1198
TRANSPOSE8x8_UB_UB
#define TRANSPOSE8x8_UB_UB(...)
Definition: generic_macros_msa.h:2446
vp9_vt_lpf_t16_16w
static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:2174
vp9dsp.h
ILVL_W2_UB
#define ILVL_W2_UB(...)
Definition: generic_macros_msa.h:1368
vp9_vt_lpf_t16_8w
static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:1841
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:258
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:298
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1388
ff_loop_filter_h_16_8_msa
void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2065
TRANSPOSE16x8_UB_UB
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7)
Definition: generic_macros_msa.h:2491
ff_loop_filter_v_84_16_msa
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:408
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
vp9_transpose_16x8_to_8x16
static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1670
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1770
ST_H4
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:417
ff_loop_filter_v_16_8_msa
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:974
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
ff_loop_filter_h_44_16_msa
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1225
uint8_t
uint8_t
Definition: audio_convert.c:194
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1313
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:44
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:279
SLDI_B4_0_UB
#define SLDI_B4_0_UB(...)
Definition: generic_macros_msa.h:637
ff_loop_filter_v_8_8_msa
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:255
ff_loop_filter_h_16_16_msa
void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2501
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1324
zero
#define zero
Definition: regdef.h:64
hev
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
Definition: vp8dsp_mmi.c:729
VP9_FILTER8
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, q1_filt8_out, q2_filt8_out)
Definition: vp9_lpf_msa.c:119
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1547
ff_loop_filter_v_88_16_msa
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:329
vp9_vt_lpf_t4_and_t8_16w
static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2090
LPF_MASK_HEV
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, limit_in, b_limit_in, thresh_in, hev_out, mask_out, flat_out)
Definition: vp9_lpf_msa.c:158
flat
static av_always_inline void flat(WaveformContext *s, AVFrame *in, AVFrame *out, int component, int intensity, int offset_y, int offset_x, int column, int mirror, int jobnr, int nb_jobs)
Definition: vf_waveform.c:984
ALIGNMENT
#define ALIGNMENT
Definition: generic_macros_msa.h:32
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1390
ILVL_B4_UH
#define ILVL_B4_UH(...)
Definition: generic_macros_msa.h:1325
SD
#define SD
Definition: ccaption_dec.c:819
VP9_LPF_FILTER4_4W
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, p1_out, p0_out, q0_out, q1_out)
Definition: vp9_lpf_msa.c:25