/*
 * h264dsp_msa.c — H.264 DSP routines optimized with MIPS MSA (SIMD)
 * intrinsics, part of FFmpeg. (Documentation-viewer navigation text
 * replaced with this comment.)
 */
1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h264dsp_mips.h"
23 
24 static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25  int32_t log2_denom, int32_t src_weight,
26  int32_t offset_in)
27 {
28  uint32_t tp0, tp1, offset_val;
29  v16u8 zero = { 0 };
30  v16u8 src0 = { 0 };
31  v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33  offset_val = (unsigned) offset_in << log2_denom;
34 
35  wgt = __msa_fill_h(src_weight);
36  offset = __msa_fill_h(offset_val);
37  denom = __msa_fill_h(log2_denom);
38 
39  LW2(data, stride, tp0, tp1);
40  INSERT_W2_UB(tp0, tp1, src0);
41  src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42  tmp0 = wgt * src0_r;
43  tmp0 = __msa_adds_s_h(tmp0, offset);
44  tmp0 = __msa_maxi_s_h(tmp0, 0);
45  tmp0 = __msa_srlr_h(tmp0, denom);
46  tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47  src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48  ST_W2(src0, 0, 1, data, stride);
49 }
50 
51 static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52  int32_t log2_denom, int32_t src_weight,
53  int32_t offset_in)
54 {
55  uint32_t tp0, tp1, tp2, tp3, offset_val;
56  v16u8 src0 = { 0 };
57  v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58 
59  offset_val = (unsigned) offset_in << log2_denom;
60 
61  wgt = __msa_fill_h(src_weight);
62  offset = __msa_fill_h(offset_val);
63  denom = __msa_fill_h(log2_denom);
64 
65  LW4(data, stride, tp0, tp1, tp2, tp3);
66  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67  UNPCK_UB_SH(src0, src0_r, src1_r);
68  MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69  ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70  MAXI_SH2_SH(tmp0, tmp1, 0);
71  tmp0 = __msa_srlr_h(tmp0, denom);
72  tmp1 = __msa_srlr_h(tmp1, denom);
73  SAT_UH2_SH(tmp0, tmp1, 7);
74  src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75  ST_W4(src0, 0, 1, 2, 3, data, stride);
76 }
77 
78 static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79  int32_t log2_denom, int32_t src_weight,
80  int32_t offset_in)
81 {
82  uint32_t tp0, tp1, tp2, tp3, offset_val;
83  v16u8 src0 = { 0 }, src1 = { 0 };
84  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85  v8i16 wgt, denom, offset;
86 
87  offset_val = (unsigned) offset_in << log2_denom;
88 
89  wgt = __msa_fill_h(src_weight);
90  offset = __msa_fill_h(offset_val);
91  denom = __msa_fill_h(log2_denom);
92 
93  LW4(data, stride, tp0, tp1, tp2, tp3);
94  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95  LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97  UNPCK_UB_SH(src0, src0_r, src1_r);
98  UNPCK_UB_SH(src1, src2_r, src3_r);
99  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100  tmp3);
101  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102  tmp1, tmp2, tmp3);
103  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107  ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109 
110 static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111  int32_t log2_denom, int32_t src_weight,
112  int32_t offset_in)
113 {
114  uint32_t offset_val;
115  uint64_t tp0, tp1, tp2, tp3;
116  v16u8 src0 = { 0 }, src1 = { 0 };
117  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118  v8i16 wgt, denom, offset;
119 
120  offset_val = (unsigned) offset_in << log2_denom;
121 
122  wgt = __msa_fill_h(src_weight);
123  offset = __msa_fill_h(offset_val);
124  denom = __msa_fill_h(log2_denom);
125 
126  LD4(data, stride, tp0, tp1, tp2, tp3);
127  INSERT_D2_UB(tp0, tp1, src0);
128  INSERT_D2_UB(tp2, tp3, src1);
129  UNPCK_UB_SH(src0, src0_r, src1_r);
130  UNPCK_UB_SH(src1, src2_r, src3_r);
131  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132  tmp3);
133  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134  tmp1, tmp2, tmp3);
135  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139  ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140 }
141 
142 static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143  int32_t src_weight, int32_t offset_in)
144 {
145  uint32_t offset_val;
146  uint64_t tp0, tp1, tp2, tp3;
147  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150  v8i16 wgt, denom, offset;
151 
152  offset_val = (unsigned) offset_in << log2_denom;
153 
154  wgt = __msa_fill_h(src_weight);
155  offset = __msa_fill_h(offset_val);
156  denom = __msa_fill_h(log2_denom);
157 
158  LD4(data, stride, tp0, tp1, tp2, tp3);
159  INSERT_D2_UB(tp0, tp1, src0);
160  INSERT_D2_UB(tp2, tp3, src1);
161  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162  INSERT_D2_UB(tp0, tp1, src2);
163  INSERT_D2_UB(tp2, tp3, src3);
164  UNPCK_UB_SH(src0, src0_r, src1_r);
165  UNPCK_UB_SH(src1, src2_r, src3_r);
166  UNPCK_UB_SH(src2, src4_r, src5_r);
167  UNPCK_UB_SH(src3, src6_r, src7_r);
168  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169  tmp3);
170  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171  tmp7);
172  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173  tmp1, tmp2, tmp3);
174  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175  tmp5, tmp6, tmp7);
176  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180  src2, src3);
181  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183 
184 static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185  int32_t log2_denom, int32_t src_weight,
186  int32_t offset_in)
187 {
188  uint32_t offset_val, cnt;
189  uint64_t tp0, tp1, tp2, tp3;
190  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193  v8i16 wgt, denom, offset;
194 
195  offset_val = (unsigned) offset_in << log2_denom;
196 
197  wgt = __msa_fill_h(src_weight);
198  offset = __msa_fill_h(offset_val);
199  denom = __msa_fill_h(log2_denom);
200 
201  for (cnt = 2; cnt--;) {
202  LD4(data, stride, tp0, tp1, tp2, tp3);
203  INSERT_D2_UB(tp0, tp1, src0);
204  INSERT_D2_UB(tp2, tp3, src1);
205  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206  INSERT_D2_UB(tp0, tp1, src2);
207  INSERT_D2_UB(tp2, tp3, src3);
208  UNPCK_UB_SH(src0, src0_r, src1_r);
209  UNPCK_UB_SH(src1, src2_r, src3_r);
210  UNPCK_UB_SH(src2, src4_r, src5_r);
211  UNPCK_UB_SH(src3, src6_r, src7_r);
212  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213  tmp2, tmp3);
214  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215  tmp6, tmp7);
216  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217  tmp0, tmp1, tmp2, tmp3);
218  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219  tmp4, tmp5, tmp6, tmp7);
220  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224  src2, src3);
225  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226  data += 8 * stride;
227  }
228 }
229 
230 static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231  int32_t log2_denom, int32_t src_weight,
232  int32_t dst_weight, int32_t offset_in)
233 {
234  uint32_t tp0, tp1;
235  v16i8 src_wgt, dst_wgt, wgt, vec0;
236  v16u8 src0 = { 0 }, dst0 = { 0 };
237  v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238 
239  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240  offset_in += (128 * (src_weight + dst_weight));
241 
242  src_wgt = __msa_fill_b(src_weight);
243  dst_wgt = __msa_fill_b(dst_weight);
244  offset = __msa_fill_h(offset_in);
245  denom = __msa_fill_h(log2_denom + 1);
246 
247  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248 
249  LW2(src, stride, tp0, tp1);
250  INSERT_W2_UB(tp0, tp1, src0);
251  LW2(dst, stride, tp0, tp1);
252  INSERT_W2_UB(tp0, tp1, dst0);
253  XORI_B2_128_UB(src0, dst0);
254  vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256  tmp0 >>= denom;
257  tmp0 = __msa_maxi_s_h(tmp0, 0);
258  tmp0 = __msa_min_s_h(max255, tmp0);
259  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260  ST_W2(dst0, 0, 1, dst, stride);
261 }
262 
263 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264  int32_t log2_denom, int32_t src_weight,
265  int32_t dst_weight, int32_t offset_in)
266 {
267  uint32_t tp0, tp1, tp2, tp3;
268  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269  v16u8 src0, dst0;
270  v8i16 tmp0, tmp1, denom, offset;
271 
272  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273  offset_in += (128 * (src_weight + dst_weight));
274 
275  src_wgt = __msa_fill_b(src_weight);
276  dst_wgt = __msa_fill_b(dst_weight);
277  offset = __msa_fill_h(offset_in);
278  denom = __msa_fill_h(log2_denom + 1);
279 
280  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281 
282  LW4(src, stride, tp0, tp1, tp2, tp3);
283  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284  LW4(dst, stride, tp0, tp1, tp2, tp3);
285  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286  XORI_B2_128_UB(src0, dst0);
287  ILVRL_B2_SB(dst0, src0, vec0, vec1);
288  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290  tmp0 >>= denom;
291  tmp1 >>= denom;
292  CLIP_SH2_0_255(tmp0, tmp1);
293  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295 }
296 
297 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298  int32_t log2_denom, int32_t src_weight,
299  int32_t dst_weight, int32_t offset_in)
300 {
301  uint32_t tp0, tp1, tp2, tp3;
302  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303  v16u8 src0, src1, dst0, dst1;
304  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305 
306  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307  offset_in += (128 * (src_weight + dst_weight));
308 
309  src_wgt = __msa_fill_b(src_weight);
310  dst_wgt = __msa_fill_b(dst_weight);
311  offset = __msa_fill_h(offset_in);
312  denom = __msa_fill_h(log2_denom + 1);
313  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314 
315  LW4(src, stride, tp0, tp1, tp2, tp3);
316  src += 4 * stride;
317  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318  LW4(src, stride, tp0, tp1, tp2, tp3);
319  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320  LW4(dst, stride, tp0, tp1, tp2, tp3);
321  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324  XORI_B4_128_UB(src0, src1, dst0, dst1);
325  ILVRL_B2_SB(dst0, src0, vec0, vec1);
326  ILVRL_B2_SB(dst1, src1, vec2, vec3);
327  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336 
337 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338  int32_t log2_denom, int32_t src_weight,
339  int32_t dst_weight, int32_t offset_in)
340 {
341  uint64_t tp0, tp1, tp2, tp3;
342  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343  v16u8 src0, src1, dst0, dst1;
344  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345 
346  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347  offset_in += (128 * (src_weight + dst_weight));
348 
349  src_wgt = __msa_fill_b(src_weight);
350  dst_wgt = __msa_fill_b(dst_weight);
351  offset = __msa_fill_h(offset_in);
352  denom = __msa_fill_h(log2_denom + 1);
353 
354  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355 
356  LD4(src, stride, tp0, tp1, tp2, tp3);
357  INSERT_D2_UB(tp0, tp1, src0);
358  INSERT_D2_UB(tp2, tp3, src1);
359  LD4(dst, stride, tp0, tp1, tp2, tp3);
360  INSERT_D2_UB(tp0, tp1, dst0);
361  INSERT_D2_UB(tp2, tp3, dst1);
362  XORI_B4_128_UB(src0, src1, dst0, dst1);
363  ILVRL_B2_SB(dst0, src0, vec0, vec1);
364  ILVRL_B2_SB(dst1, src1, vec2, vec3);
365  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373 }
374 
375 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376  int32_t log2_denom, int32_t src_weight,
377  int32_t dst_weight, int32_t offset_in)
378 {
379  uint64_t tp0, tp1, tp2, tp3;
380  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383 
384  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385  offset_in += (128 * (src_weight + dst_weight));
386 
387  src_wgt = __msa_fill_b(src_weight);
388  dst_wgt = __msa_fill_b(dst_weight);
389  offset = __msa_fill_h(offset_in);
390  denom = __msa_fill_h(log2_denom + 1);
391  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392 
393  LD4(src, stride, tp0, tp1, tp2, tp3);
394  INSERT_D2_UB(tp0, tp1, src0);
395  INSERT_D2_UB(tp2, tp3, src1);
396  LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397  INSERT_D2_UB(tp0, tp1, src2);
398  INSERT_D2_UB(tp2, tp3, src3);
399  LD4(dst, stride, tp0, tp1, tp2, tp3);
400  INSERT_D2_UB(tp0, tp1, dst0);
401  INSERT_D2_UB(tp2, tp3, dst1);
402  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403  INSERT_D2_UB(tp0, tp1, dst2);
404  INSERT_D2_UB(tp2, tp3, dst3);
405  XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406  ILVRL_B2_SB(dst0, src0, vec0, vec1);
407  ILVRL_B2_SB(dst1, src1, vec2, vec3);
408  ILVRL_B2_SB(dst2, src2, vec4, vec5);
409  ILVRL_B2_SB(dst3, src3, vec6, vec7);
410  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425 
426 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427  int32_t log2_denom, int32_t src_weight,
428  int32_t dst_weight, int32_t offset_in)
429 {
430  uint8_t cnt;
431  uint64_t tp0, tp1, tp2, tp3;
432  v16i8 src_wgt, dst_wgt, wgt;
433  v16u8 src0, src1, src2, src3;
434  v16u8 dst0, dst1, dst2, dst3;
435  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436  v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437  v8i16 denom, offset;
438 
439  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440  offset_in += (128 * (src_weight + dst_weight));
441 
442  src_wgt = __msa_fill_b(src_weight);
443  dst_wgt = __msa_fill_b(dst_weight);
444  offset = __msa_fill_h(offset_in);
445  denom = __msa_fill_h(log2_denom + 1);
446  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447 
448  for (cnt = 2; cnt--;) {
449  LD4(src, stride, tp0, tp1, tp2, tp3);
450  src += 4 * stride;
451  INSERT_D2_UB(tp0, tp1, src0);
452  INSERT_D2_UB(tp2, tp3, src1);
453  LD4(src, stride, tp0, tp1, tp2, tp3);
454  src += 4 * stride;
455  INSERT_D2_UB(tp0, tp1, src2);
456  INSERT_D2_UB(tp2, tp3, src3);
457  LD4(dst, stride, tp0, tp1, tp2, tp3);
458  INSERT_D2_UB(tp0, tp1, dst0);
459  INSERT_D2_UB(tp2, tp3, dst1);
460  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461  INSERT_D2_UB(tp0, tp1, dst2);
462  INSERT_D2_UB(tp2, tp3, dst3);
463  XORI_B4_128_UB(src0, src1, src2, src3);
464  XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466  vec0, vec2, vec4, vec6);
467  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468  vec1, vec3, vec5, vec7);
469 
470  temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471  temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472  temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473  temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474  temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475  temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476  temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477  temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478 
479  SRA_4V(temp0, temp1, temp2, temp3, denom);
480  SRA_4V(temp4, temp5, temp6, temp7, denom);
481  CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482  PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483  dst0, dst1, dst2, dst3);
484  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485  dst += 8 * stride;
486  }
487 }
488 
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
490  q3_or_p3_org_in, p1_or_q1_org_in, \
491  p2_or_q2_org_in, q1_or_p1_org_in, \
492  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
493 { \
494  v8i16 threshold; \
495  v8i16 const3 = __msa_ldi_h(3); \
496  \
497  threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
498  threshold += (p1_or_q1_org_in); \
499  \
500  (p0_or_q0_out) = threshold << 1; \
501  (p0_or_q0_out) += (p2_or_q2_org_in); \
502  (p0_or_q0_out) += (q1_or_p1_org_in); \
503  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
504  \
505  (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
506  (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
507  \
508  (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
509  (p2_or_q2_out) += (p3_or_q3_org_in); \
510  (p2_or_q2_out) += (p3_or_q3_org_in); \
511  (p2_or_q2_out) += threshold; \
512  (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
513 }
514 
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
517  p1_or_q1_org_in, p0_or_q0_out) \
518 { \
519  (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
520  (p0_or_q0_out) += (p1_or_q1_org_in); \
521  (p0_or_q0_out) += (p1_or_q1_org_in); \
522  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
523 }
524 
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
526  p1_or_q1_org_in, p2_or_q2_org_in, \
527  negate_tc_in, tc_in, p1_or_q1_out) \
528 { \
529  v8i16 clip3, temp; \
530  \
531  clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
532  (v8u16) q0_or_p0_org_in); \
533  temp = p1_or_q1_org_in << 1; \
534  clip3 = clip3 - temp; \
535  clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
536  CLIP_SH(clip3, negate_tc_in, tc_in); \
537  p1_or_q1_out = p1_or_q1_org_in + clip3; \
538 }
539 
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
541  p1_or_q1_org_in, q1_or_p1_org_in, \
542  negate_threshold_in, threshold_in, \
543  p0_or_q0_out, q0_or_p0_out) \
544 { \
545  v8i16 q0_sub_p0, p1_sub_q1, delta; \
546  \
547  q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
548  p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
549  q0_sub_p0 <<= 2; \
550  p1_sub_q1 += 4; \
551  delta = q0_sub_p0 + p1_sub_q1; \
552  delta >>= 3; \
553  \
554  CLIP_SH(delta, negate_threshold_in, threshold_in); \
555  \
556  p0_or_q0_out = p0_or_q0_org_in + delta; \
557  q0_or_p0_out = q0_or_p0_org_in - delta; \
558  \
559  CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
560 }
561 
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
563 { \
564  uint32_t load0, load1, load2, load3; \
565  v16u8 src0 = { 0 }; \
566  v16u8 src1 = { 0 }; \
567  v16u8 src2 = { 0 }; \
568  v16u8 src3 = { 0 }; \
569  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
570  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
571  v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
572  v8i16 res0_r, res1_r; \
573  v16i8 zeros = { 0 }; \
574  v16u8 res0, res1; \
575  \
576  LW4((src - 2), stride, load0, load1, load2, load3); \
577  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
578  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
579  src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
580  src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
581  \
582  TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
583  \
584  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
585  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
586  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
587  \
588  tc = __msa_fill_h(tc_val); \
589  \
590  is_less_than_alpha = (p0_asub_q0 < alpha); \
591  is_less_than_beta = (p1_asub_p0 < beta); \
592  is_less_than = is_less_than_alpha & is_less_than_beta; \
593  is_less_than_beta = (q1_asub_q0 < beta); \
594  is_less_than = is_less_than_beta & is_less_than; \
595  \
596  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
597  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
598  \
599  q0_sub_p0 <<= 2; \
600  delta = q0_sub_p0 + p1_sub_q1; \
601  delta = __msa_srari_h(delta, 3); \
602  \
603  CLIP_SH(delta, -tc, tc); \
604  \
605  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
606  \
607  res0_r += delta; \
608  res1_r -= delta; \
609  \
610  CLIP_SH2_0_255(res0_r, res1_r); \
611  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
612  \
613  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
614  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
615  \
616  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
617 }
618 
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
620 { \
621  v16i8 zero_m = { 0 }; \
622  \
623  out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
624  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
625  SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
626 }
627 
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
629 { \
630  uint32_t load0, load1; \
631  v16u8 src0 = { 0 }; \
632  v16u8 src1 = { 0 }; \
633  v16u8 src2 = { 0 }; \
634  v16u8 src3 = { 0 }; \
635  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
636  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
637  v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
638  v16i8 zeros = { 0 }; \
639  v16u8 res0, res1; \
640  \
641  load0 = LW(src - 2); \
642  load1 = LW(src - 2 + stride); \
643  \
644  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
645  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
646  \
647  TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
648  \
649  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
650  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
651  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
652  \
653  tc = __msa_fill_h(tc_val); \
654  \
655  is_less_than_alpha = (p0_asub_q0 < alpha); \
656  is_less_than_beta = (p1_asub_p0 < beta); \
657  is_less_than = is_less_than_alpha & is_less_than_beta; \
658  is_less_than_beta = (q1_asub_q0 < beta); \
659  is_less_than = is_less_than_beta & is_less_than; \
660  \
661  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
662  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
663  \
664  q0_sub_p0 <<= 2; \
665  delta = q0_sub_p0 + p1_sub_q1; \
666  delta = __msa_srari_h(delta, 3); \
667  CLIP_SH(delta, -tc, tc); \
668  \
669  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
670  \
671  res0_r += delta; \
672  res1_r -= delta; \
673  \
674  CLIP_SH2_0_255(res0_r, res1_r); \
675  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
676  \
677  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
678  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
679  \
680  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
681 }
682 
684  uint8_t alpha_in,
685  uint8_t beta_in,
686  ptrdiff_t img_width)
687 {
688  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690  v16u8 p1_org, p0_org, q0_org, q1_org;
691 
692  LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693 
694  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697 
698  is_less_than_alpha = (p0_asub_q0 < alpha_in);
699  is_less_than_beta = (p1_asub_p0 < beta_in);
700  is_less_than = is_less_than_beta & is_less_than_alpha;
701  is_less_than_beta = (q1_asub_q0 < beta_in);
702  is_less_than = is_less_than_beta & is_less_than;
703 
704  if (!__msa_test_bz_v(is_less_than)) {
705  v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706  v8i16 p0_r = { 0 };
707  v8i16 q0_r = { 0 };
708  v8i16 p0_l = { 0 };
709  v8i16 q0_l = { 0 };
710  v16i8 zero = { 0 };
711  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713  v16u8 q2_org = LD_UB(data + (2 * img_width));
714  v16u8 p2_org = LD_UB(data - (3 * img_width));
715  v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716 
717  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720 
721  tmp_flag = (p0_asub_q0 < tmp_flag);
722 
723  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724  is_less_than_beta = (p2_asub_p0 < beta_in);
725  is_less_than_beta = is_less_than_beta & tmp_flag;
726  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727  is_less_than_beta = is_less_than_beta & is_less_than;
728  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729 
730  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732 
733  /* combine and store */
734  if (!__msa_test_bz_v(is_less_than_beta)) {
735  v8i16 p3_org_l, p3_org_r;
736  v16u8 p3_org = LD_UB(data - (img_width << 2));
737  v16u8 p2, p1;
738  v8i16 p2_r = { 0 };
739  v8i16 p2_l = { 0 };
740  v8i16 p1_r = { 0 };
741  v8i16 p1_l = { 0 };
742 
743  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745  p2_r, q1_org_r, p0_r, p1_r, p2_r);
746 
747  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749  p2_l, q1_org_l, p0_l, p1_l, p2_l);
750 
751  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752 
753  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756 
757  ST_UB(p1_org, data - (2 * img_width));
758  ST_UB(p2_org, data - (3 * img_width));
759  }
760 
761  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763 
764  /* combine */
765  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767 
768  ST_UB(p0_org, data - img_width);
769 
770  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772  is_less_than_beta = (q2_asub_q0 < beta_in);
773  is_less_than_beta = is_less_than_beta & tmp_flag;
774  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775  is_less_than_beta = is_less_than_beta & is_less_than;
776  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777 
778  /* combine and store */
779  if (!__msa_test_bz_v(is_less_than_beta)) {
780  v8i16 q3_org_r, q3_org_l;
781  v16u8 q3_org = LD_UB(data + (3 * img_width));
782  v16u8 q1, q2;
783  v8i16 q2_r = { 0 };
784  v8i16 q2_l = { 0 };
785  v8i16 q1_r = { 0 };
786  v8i16 q1_l = { 0 };
787 
788  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790  q2_r, p1_org_r, q0_r, q1_r, q2_r);
791 
792  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794  q2_l, p1_org_l, q0_l, q1_l, q2_l);
795 
796  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800 
801  ST_UB(q1_org, data + img_width);
802  ST_UB(q2_org, data + 2 * img_width);
803  }
804 
805  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807 
808  /* combine */
809  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811 
812  ST_UB(q0_org, data);
813  }
814 }
815 
/* NOTE(review): the opening signature line of this function is missing from
 * this extract (the doxygen anchor line was dropped).  The body matches
 * FFmpeg's avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, ...) --
 * intra ("strong") luma deblocking of one 16-pixel VERTICAL edge -- but
 * confirm the exact signature against the full source file. */
817  uint8_t alpha_in,
818  uint8_t beta_in,
819  ptrdiff_t img_width)
820 {
821  uint8_t *src = data - 4;
822  v16u8 alpha, beta, p0_asub_q0;
823  v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825  v16u8 p1_asub_p0, q1_asub_q0;
826 
827 
     /* Load 16 rows straddling the vertical edge and transpose so that
      * p3..q3 each hold one pixel column as a 16-byte row vector. */
828  {
829  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831 
832  LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833  LD_UB8(src + (8 * img_width), img_width,
834  row8, row9, row10, row11, row12, row13, row14, row15);
835 
836  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837  row4, row5, row6, row7,
838  row8, row9, row10, row11,
839  row12, row13, row14, row15,
840  p3_org, p2_org, p1_org, p0_org,
841  q0_org, q1_org, q2_org, q3_org);
842  }
843 
     /* Per-pixel filter enable: |p0-q0| < alpha AND |p1-p0| < beta AND
      * |q1-q0| < beta (all lanes computed in parallel as byte masks). */
844  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847 
848  alpha = (v16u8) __msa_fill_b(alpha_in);
849  beta = (v16u8) __msa_fill_b(beta_in);
850 
851  is_less_than_alpha = (p0_asub_q0 < alpha);
852  is_less_than_beta = (p1_asub_p0 < beta);
853  is_less_than = is_less_than_beta & is_less_than_alpha;
854  is_less_than_beta = (q1_asub_q0 < beta);
855  is_less_than = is_less_than_beta & is_less_than;
856 
857  if (!__msa_test_bz_v(is_less_than)) {
858  v8i16 p0_r = { 0 };
859  v8i16 q0_r = { 0 };
860  v8i16 p0_l = { 0 };
861  v8i16 q0_l = { 0 };
862  v16i8 zero = { 0 };
863  v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864  v16u8 negate_is_less_than_beta;
865  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867 
     /* Widen bytes to 16-bit halves (right/left) for the filter arithmetic. */
868  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871  UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872 
     /* Strong-filter gate: |p0-q0| < (alpha >> 2) + 2. */
873  tmp_flag = alpha >> 2;
874  tmp_flag = tmp_flag + 2;
875  tmp_flag = (p0_asub_q0 < tmp_flag);
876 
877  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878  is_less_than_beta = (p2_asub_p0 < beta);
879  is_less_than_beta = tmp_flag & is_less_than_beta;
880  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881  is_less_than_beta = is_less_than_beta & is_less_than;
882  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883 
     /* P side: strong 3-pixel filter (p0,p1,p2, using p3) where enabled. */
884  if (!__msa_test_bz_v(is_less_than_beta)) {
885  v16u8 p2, p1;
886  v8i16 p3_org_r, p3_org_l;
887  v8i16 p2_l = { 0 };
888  v8i16 p2_r = { 0 };
889  v8i16 p1_l = { 0 };
890  v8i16 p1_r = { 0 };
891 
892  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894  p2_r, q1_org_r, p0_r, p1_r, p2_r);
895 
896  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898  p2_l, q1_org_l, p0_l, p1_l, p2_l);
899 
900  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904  }
905 
     /* Lanes that failed the strong gate get the weak p0-only filter. */
906  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908 
909  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911 
     /* Q side: same strong/weak decision using |q2-q0| < beta. */
912  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913  is_less_than_beta = (q2_asub_q0 < beta);
914 
915  is_less_than_beta = is_less_than_beta & tmp_flag;
916  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917 
918  is_less_than_beta = is_less_than_beta & is_less_than;
919  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920 
921  if (!__msa_test_bz_v(is_less_than_beta)) {
922  v16u8 q1, q2;
923  v8i16 q3_org_r, q3_org_l;
924  v8i16 q1_l = { 0 };
925  v8i16 q1_r = { 0 };
926  v8i16 q2_l = { 0 };
927  v8i16 q2_r = { 0 };
928 
929  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931  q2_r, p1_org_r, q0_r, q1_r, q2_r);
932 
933  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935  q2_l, p1_org_l, q0_l, q1_l, q2_l);
936 
937  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941  }
942 
943  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945 
946  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948 
     /* Transpose the filtered columns back and store 6 pixels (p2..q2)
      * per row: a 4-byte word plus a 2-byte halfword on each side. */
949  {
950  v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951 
952  ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953  ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955 
956  ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957  ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958 
959  src = data - 3;
960  ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961  ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962  src += 4 * img_width;
963  ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964  ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965  src += 4 * img_width;
966 
967  ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968  ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969  src += 4 * img_width;
970  ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971  ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972  }
973  }
974 }
975 
/* NOTE(review): the opening signature line is missing from this extract
 * (doxygen anchor line dropped).  The body matches FFmpeg's
 * avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, ...) -- intra luma
 * deblocking of an 8-row MBAFF vertical edge -- confirm against the repo. */
977  ptrdiff_t stride,
978  int32_t alpha_in,
979  int32_t beta_in)
980 {
981  uint64_t load0, load1;
982  uint32_t out0, out2;
983  uint16_t out1, out3;
984  v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985  v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986  v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987  v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988  v8i16 tmp0, tmp1, tmp2, tmp3;
989  v16u8 alpha, beta;
990  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992  v16u8 is_less_than_beta1, is_less_than_beta2;
993  v16i8 src0 = { 0 };
994  v16i8 src1 = { 0 };
995  v16i8 src2 = { 0 };
996  v16i8 src3 = { 0 };
997  v16i8 src4 = { 0 };
998  v16i8 src5 = { 0 };
999  v16i8 src6 = { 0 };
1000  v16i8 src7 = { 0 };
1001  v16i8 zeros = { 0 };
1002 
      /* Load 8 rows of 8 bytes around the edge (src - 4 .. src + 3). */
1003  load0 = LD(src - 4);
1004  load1 = LD(src + stride - 4);
1005  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006  src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007 
1008  load0 = LD(src + (2 * stride) - 4);
1009  load1 = LD(src + (3 * stride) - 4);
1010  src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011  src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012 
1013  load0 = LD(src + (4 * stride) - 4);
1014  load1 = LD(src + (5 * stride) - 4);
1015  src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016  src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017 
1018  load0 = LD(src + (6 * stride) - 4);
1019  load1 = LD(src + (7 * stride) - 4);
1020  src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021  src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022 
      /* 8x8 byte transpose via interleaves; afterwards src0..src7 hold the
       * pixel columns p3,p2,p1,p0,q0,q1 (and neighbours) as row vectors. */
1023  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024  src0, src1, src2, src3);
1025 
1026  ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027  ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028 
1029  ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030  ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031  SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032  8, src0, src2, src4, src7);
1033 
      /* Filter-enable masks from alpha/beta thresholds. */
1034  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037 
1038  alpha = (v16u8) __msa_fill_b(alpha_in);
1039  beta = (v16u8) __msa_fill_b(beta_in);
1040 
1041  is_less_than_alpha = (p0_asub_q0 < alpha);
1042  is_less_than_beta = (p1_asub_p0 < beta);
1043  is_less_than = is_less_than_alpha & is_less_than_beta;
1044  is_less_than_beta = (q1_asub_q0 < beta);
1045  is_less_than = is_less_than & is_less_than_beta;
1046 
      /* Strong-filter gate: |p0-q0| < (alpha >> 2) + 2. */
1047  alpha >>= 2;
1048  alpha += 2;
1049 
1050  is_less_than_alpha = (p0_asub_q0 < alpha);
1051 
1052  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053  is_less_than_beta1 = (p2_asub_p0 < beta);
1054  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055  is_less_than_beta2 = (q2_asub_q0 < beta);
1056 
      /* Widen to 16-bit and compute both the strong (x) and weak (y)
       * filter candidates for each output pixel. */
1057  ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058  src0_r, src1_r, src2_r, src3_r);
1059  ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060  src4_r, src5_r, src6_r, src7_r);
1061 
1062  dst2_x_r = src1_r + src2_r + src3_r;
1063  dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064  dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065  dst1_r = src0_r + src1_r + src2_r + src3_r;
1066  dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067 
1068  dst0_r = (2 * src6_r) + (3 * src0_r);
1069  dst0_r += src1_r + src2_r + src3_r;
1070  dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073 
1074  PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076 
1077  dst3_x_r = src2_r + src3_r + src4_r;
1078  dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079  dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080  dst4_r = src2_r + src3_r + src4_r + src5_r;
1081  dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082 
1083  dst5_r = (2 * src7_r) + (3 * src5_r);
1084  dst5_r += src4_r + src3_r + src2_r;
1085  dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088 
1089  PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091 
1092  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096 
1097  PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098 
      /* Select per lane: strong vs weak result, then filtered vs original. */
1099  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101  dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102  dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103 
1104  is_less_than = is_less_than_alpha & is_less_than;
1105  dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106  is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107  dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108 
1109  dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110  dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111  dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112  is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113  dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114  dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115  dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116 
      /* Transpose results back to row order for the scattered stores. */
1117  ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118  dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119  ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120  ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121 
1122  ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123  SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124  dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125  dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126  SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127 
      /* Store 6 bytes per row (word + halfword) for all 8 rows. */
1128  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129  out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130  out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131  out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132 
1133  SW(out0, (src - 3));
1134  SH(out1, (src + 1));
1135  src += stride;
1136  SW(out2, (src - 3));
1137  SH(out3, (src + 1));
1138  src += stride;
1139 
1140  out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141  out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142  out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143  out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144 
1145  SW(out0, (src - 3));
1146  SH(out1, (src + 1));
1147  src += stride;
1148  SW(out2, (src - 3));
1149  SH(out3, (src + 1));
1150  src += stride;
1151 
1152  out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153  out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154  out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155  out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156 
1157  SW(out0, (src - 3));
1158  SH(out1, (src + 1));
1159  src += stride;
1160  SW(out2, (src - 3));
1161  SH(out3, (src + 1));
1162  src += stride;
1163 
1164  out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165  out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166  out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167  out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168 
1169  SW(out0, (src - 3));
1170  SH(out1, (src + 1));
1171  src += stride;
1172  SW(out2, (src - 3));
1173  SH(out3, (src + 1));
1174 }
1175 
/* NOTE(review): the opening signature line is missing from this extract
 * (doxygen anchor line dropped).  The body matches FFmpeg's
 * avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, ...) --
 * intra chroma deblocking of one HORIZONTAL edge -- confirm in the repo. */
1177  uint8_t alpha_in,
1178  uint8_t beta_in,
1179  ptrdiff_t img_width)
1180 {
1181  v16u8 alpha, beta;
1182  v16u8 is_less_than;
1183  v8i16 p0_or_q0, q0_or_p0;
1184  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185  v16i8 zero = { 0 };
1186  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187  v16u8 is_less_than_alpha, is_less_than_beta;
1188  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189 
1190  alpha = (v16u8) __msa_fill_b(alpha_in);
1191  beta = (v16u8) __msa_fill_b(beta_in);
1192 
      /* Load p1,p0,q0,q1 rows around the horizontal edge. */
1193  LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194  p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195 
1196  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199 
      /* Enable mask: |p0-q0| < alpha AND |p1-p0| < beta AND |q1-q0| < beta. */
1200  is_less_than_alpha = (p0_asub_q0 < alpha);
1201  is_less_than_beta = (p1_asub_p0 < beta);
1202  is_less_than = is_less_than_beta & is_less_than_alpha;
1203  is_less_than_beta = (q1_asub_q0 < beta);
1204  is_less_than = is_less_than_beta & is_less_than;
1205 
      /* Only 8 chroma pixels are filtered: clear the upper half of the mask. */
1206  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207 
1208  if (!__msa_test_bz_v(is_less_than)) {
1209  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
      /* Weak filter for p0 and q0; results merged back under the mask. */
1211  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214 
1215  p0_or_q0_org =
1216  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217  q0_or_p0_org =
1218  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219 
1220  ST_UB(q0_or_p0_org, data_cb_or_cr);
1221  ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222  }
1223 }
1224 
/* NOTE(review): the opening signature line is missing from this extract
 * (doxygen anchor line dropped).  The body matches FFmpeg's
 * avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, ...) --
 * intra chroma deblocking of one VERTICAL edge -- confirm in the repo. */
1226  uint8_t alpha_in,
1227  uint8_t beta_in,
1228  ptrdiff_t img_width)
1229 {
1230  v8i16 tmp1;
1231  v16u8 alpha, beta, is_less_than;
1232  v8i16 p0_or_q0, q0_or_p0;
1233  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234  v16i8 zero = { 0 };
1235  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236  v16u8 is_less_than_alpha, is_less_than_beta;
1237  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238 
      /* Load 8 rows of 4 pixels straddling the edge and transpose to get
       * the p1,p0,q0,q1 columns as row vectors. */
1239  {
1240  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241 
1242  LD_UB8((data_cb_or_cr - 2), img_width,
1243  row0, row1, row2, row3, row4, row5, row6, row7);
1244 
1245  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246  p1_or_q1_org, p0_or_q0_org,
1247  q0_or_p0_org, q1_or_p1_org);
1248  }
1249 
1250  alpha = (v16u8) __msa_fill_b(alpha_in);
1251  beta = (v16u8) __msa_fill_b(beta_in);
1252 
1253  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256 
      /* Enable mask: |p0-q0| < alpha AND |p1-p0| < beta AND |q1-q0| < beta;
       * only the low 8 lanes carry valid pixels. */
1257  is_less_than_alpha = (p0_asub_q0 < alpha);
1258  is_less_than_beta = (p1_asub_p0 < beta);
1259  is_less_than = is_less_than_beta & is_less_than_alpha;
1260  is_less_than_beta = (q1_asub_q0 < beta);
1261  is_less_than = is_less_than_beta & is_less_than;
1262  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263 
1264  if (!__msa_test_bz_v(is_less_than)) {
1265  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267 
1268  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270 
1271  /* convert 16 bit output into 8 bit output */
1272  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273 
1274  p0_or_q0_org =
1275  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276  q0_or_p0_org =
1277  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
      /* Re-interleave the p0/q0 pair and store 2 bytes per row. */
1278  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279 
1280  data_cb_or_cr -= 1;
1281  ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282  data_cb_or_cr += 4 * img_width;
1283  ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284  }
1285 }
1286 
/* NOTE(review): the opening signature line is missing from this extract
 * (doxygen anchor line dropped).  The body matches FFmpeg's
 * avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data, ...) -- normal
 * (inter, tc-based) luma deblocking of one 16-pixel VERTICAL edge with
 * per-4-pixel boundary strengths bs0..bs3 and clip values tc0..tc3 --
 * confirm the exact signature against the repository source. */
1288  uint8_t bs0, uint8_t bs1,
1289  uint8_t bs2, uint8_t bs3,
1290  uint8_t tc0, uint8_t tc1,
1291  uint8_t tc2, uint8_t tc3,
1292  uint8_t alpha_in,
1293  uint8_t beta_in,
1294  ptrdiff_t img_width)
1295 {
1296  v16u8 tmp_vec, bs = { 0 };
1297 
      /* Broadcast each 4-pixel group's boundary strength into one word lane. */
1298  tmp_vec = (v16u8) __msa_fill_b(bs0);
1299  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1300  tmp_vec = (v16u8) __msa_fill_b(bs1);
1301  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1302  tmp_vec = (v16u8) __msa_fill_b(bs2);
1303  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1304  tmp_vec = (v16u8) __msa_fill_b(bs3);
1305  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1306 
1307  if (!__msa_test_bz_v(bs)) {
1308  uint8_t *src = data - 4;
1309  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
1310  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
1311  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1312  v16u8 is_bs_greater_than0;
1313  v16u8 tc = { 0 };
1314  v16i8 zero = { 0 };
1315 
      /* Same per-group broadcast for the tc clip values. */
1316  tmp_vec = (v16u8) __msa_fill_b(tc0);
1317  tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1318  tmp_vec = (v16u8) __msa_fill_b(tc1);
1319  tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1320  tmp_vec = (v16u8) __msa_fill_b(tc2);
1321  tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1322  tmp_vec = (v16u8) __msa_fill_b(tc3);
1323  tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1324 
1325  is_bs_greater_than0 = (zero < bs);
1326 
      /* Load 16 rows across the vertical edge and transpose to columns. */
1327  {
1328  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1329  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1330 
1331  LD_UB8(src, img_width,
1332  row0, row1, row2, row3, row4, row5, row6, row7);
1333  src += (8 * img_width);
1334  LD_UB8(src, img_width,
1335  row8, row9, row10, row11, row12, row13, row14, row15);
1336 
1337  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1338  row8, row9, row10, row11,
1339  row12, row13, row14, row15,
1340  p3_org, p2_org, p1_org, p0_org,
1341  q0_org, q1_org, q2_org, q3_org);
1342  }
1343 
      /* Enable mask: alpha/beta thresholds AND bs > 0. */
1344  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1345  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1346  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1347 
1348  alpha = (v16u8) __msa_fill_b(alpha_in);
1349  beta = (v16u8) __msa_fill_b(beta_in);
1350 
1351  is_less_than_alpha = (p0_asub_q0 < alpha);
1352  is_less_than_beta = (p1_asub_p0 < beta);
1353  is_less_than = is_less_than_beta & is_less_than_alpha;
1354  is_less_than_beta = (q1_asub_q0 < beta);
1355  is_less_than = is_less_than_beta & is_less_than;
1356  is_less_than = is_less_than & is_bs_greater_than0;
1357 
1358  if (!__msa_test_bz_v(is_less_than)) {
1359  v16i8 negate_tc, sign_negate_tc;
1360  v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
1361  v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
1362  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1363  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1364  v8i16 p0_r, q0_r, p0_l, q0_l;
1365 
      /* Sign-extend -tc to 16 bits for the clamped delta arithmetic. */
1366  negate_tc = zero - (v16i8) tc;
1367  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1368 
1369  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1370 
1371  UNPCK_UB_SH(tc, tc_r, tc_l);
1372  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1373  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1374  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1375 
      /* p1 filtering where |p2-p0| < beta; each filtered lane bumps tc. */
1376  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1377  is_less_than_beta = (p2_asub_p0 < beta);
1378  is_less_than_beta = is_less_than_beta & is_less_than;
1379 
1380  if (!__msa_test_bz_v(is_less_than_beta)) {
1381  v16u8 p1;
1382  v8i16 p1_r = { 0 };
1383  v8i16 p1_l = { 0 };
1384  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1385  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1386 
1387  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1388  negate_tc_r, tc_r, p1_r);
1389  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1390  i16_negatetc_l, tc_l, p1_l);
1391 
1392  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1393  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1394 
1395  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1396  tc = tc + is_less_than_beta;
1397  }
1398 
      /* q1 filtering where |q2-q0| < beta; same tc bump. */
1399  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1400  is_less_than_beta = (q2_asub_q0 < beta);
1401  is_less_than_beta = is_less_than_beta & is_less_than;
1402 
1403  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1404  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1405 
1406  if (!__msa_test_bz_v(is_less_than_beta)) {
1407  v16u8 q1;
1408  v8i16 q1_r = { 0 };
1409  v8i16 q1_l = { 0 };
1410  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1411  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1412 
1413  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1414  negate_tc_r, tc_r, q1_r);
1415  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1416  i16_negatetc_l, tc_l, q1_l);
1417 
1418  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1419  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1420 
1421  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1422  tc = tc + is_less_than_beta;
1423  }
1424 
      /* p0/q0 filtering clipped to the (possibly bumped) +/-tc range. */
1425  {
1426  v8i16 threshold_r, negate_thresh_r;
1427  v8i16 threshold_l, negate_thresh_l;
1428  v16i8 negate_thresh, sign_negate_thresh;
1429 
1430  negate_thresh = zero - (v16i8) tc;
1431  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1432 
1433  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1434  threshold_r, negate_thresh_r);
1435 
1436  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1437  negate_thresh_r, threshold_r, p0_r, q0_r);
1438 
1439  threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
1440  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1441  negate_thresh);
1442 
1443  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1444  negate_thresh_l, threshold_l, p0_l, q0_l);
1445  }
1446 
1447  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1448 
1449  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1450  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1451 
      /* Transpose the filtered columns back and scatter 6 bytes
       * (word + halfword) to each of the 16 rows. */
1452  {
1453  v16i8 tp0, tp1, tp2, tp3;
1454  v8i16 tmp2, tmp5;
1455  v4i32 tmp3, tmp4, tmp6, tmp7;
1456  uint32_t out0, out2;
1457  uint16_t out1, out3;
1458 
1459  src = data - 3;
1460 
1461  ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
1462  ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
1463  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1464 
1465  ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
1466  ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
1467 
1468  out0 = __msa_copy_u_w(tmp3, 0);
1469  out1 = __msa_copy_u_h(tmp2, 0);
1470  out2 = __msa_copy_u_w(tmp3, 1);
1471  out3 = __msa_copy_u_h(tmp2, 1);
1472 
1473  SW(out0, src);
1474  SH(out1, (src + 4));
1475  src += img_width;
1476  SW(out2, src);
1477  SH(out3, (src + 4));
1478 
1479  out0 = __msa_copy_u_w(tmp3, 2);
1480  out1 = __msa_copy_u_h(tmp2, 2);
1481  out2 = __msa_copy_u_w(tmp3, 3);
1482  out3 = __msa_copy_u_h(tmp2, 3);
1483 
1484  src += img_width;
1485  SW(out0, src);
1486  SH(out1, (src + 4));
1487  src += img_width;
1488  SW(out2, src);
1489  SH(out3, (src + 4));
1490 
1491  out0 = __msa_copy_u_w(tmp4, 0);
1492  out1 = __msa_copy_u_h(tmp2, 4);
1493  out2 = __msa_copy_u_w(tmp4, 1);
1494  out3 = __msa_copy_u_h(tmp2, 5);
1495 
1496  src += img_width;
1497  SW(out0, src);
1498  SH(out1, (src + 4));
1499  src += img_width;
1500  SW(out2, src);
1501  SH(out3, (src + 4));
1502 
1503  out0 = __msa_copy_u_w(tmp4, 2);
1504  out1 = __msa_copy_u_h(tmp2, 6);
1505  out2 = __msa_copy_u_w(tmp4, 3);
1506  out3 = __msa_copy_u_h(tmp2, 7);
1507 
1508  src += img_width;
1509  SW(out0, src);
1510  SH(out1, (src + 4));
1511  src += img_width;
1512  SW(out2, src);
1513  SH(out3, (src + 4));
1514 
1515  out0 = __msa_copy_u_w(tmp6, 0);
1516  out1 = __msa_copy_u_h(tmp5, 0);
1517  out2 = __msa_copy_u_w(tmp6, 1);
1518  out3 = __msa_copy_u_h(tmp5, 1);
1519 
1520  src += img_width;
1521  SW(out0, src);
1522  SH(out1, (src + 4));
1523  src += img_width;
1524  SW(out2, src);
1525  SH(out3, (src + 4));
1526 
1527  out0 = __msa_copy_u_w(tmp6, 2);
1528  out1 = __msa_copy_u_h(tmp5, 2);
1529  out2 = __msa_copy_u_w(tmp6, 3);
1530  out3 = __msa_copy_u_h(tmp5, 3);
1531 
1532  src += img_width;
1533  SW(out0, src);
1534  SH(out1, (src + 4));
1535  src += img_width;
1536  SW(out2, src);
1537  SH(out3, (src + 4));
1538 
1539  out0 = __msa_copy_u_w(tmp7, 0);
1540  out1 = __msa_copy_u_h(tmp5, 4);
1541  out2 = __msa_copy_u_w(tmp7, 1);
1542  out3 = __msa_copy_u_h(tmp5, 5);
1543 
1544  src += img_width;
1545  SW(out0, src);
1546  SH(out1, (src + 4));
1547  src += img_width;
1548  SW(out2, src);
1549  SH(out3, (src + 4));
1550 
1551  out0 = __msa_copy_u_w(tmp7, 2);
1552  out1 = __msa_copy_u_h(tmp5, 6);
1553  out2 = __msa_copy_u_w(tmp7, 3);
1554  out3 = __msa_copy_u_h(tmp5, 7);
1555 
1556  src += img_width;
1557  SW(out0, src);
1558  SH(out1, (src + 4));
1559  src += img_width;
1560  SW(out2, src);
1561  SH(out3, (src + 4));
1562  }
1563  }
1564  }
1565 }
1566 
/* NOTE(review): the opening signature line is missing from this extract
 * (doxygen anchor line dropped).  The body matches FFmpeg's
 * avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, ...) -- normal
 * (inter, tc-based) luma deblocking of one 16-pixel HORIZONTAL edge --
 * confirm the exact signature against the repository source. */
1568  uint8_t bs0, uint8_t bs1,
1569  uint8_t bs2, uint8_t bs3,
1570  uint8_t tc0, uint8_t tc1,
1571  uint8_t tc2, uint8_t tc3,
1572  uint8_t alpha_in,
1573  uint8_t beta_in,
1574  ptrdiff_t image_width)
1575 {
1576  v16u8 tmp_vec;
1577  v16u8 bs = { 0 };
1578 
      /* Broadcast each 4-pixel group's boundary strength into one word lane. */
1579  tmp_vec = (v16u8) __msa_fill_b(bs0);
1580  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1581  tmp_vec = (v16u8) __msa_fill_b(bs1);
1582  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1583  tmp_vec = (v16u8) __msa_fill_b(bs2);
1584  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1585  tmp_vec = (v16u8) __msa_fill_b(bs3);
1586  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1587 
1588  if (!__msa_test_bz_v(bs)) {
1589  v16u8 alpha, beta, is_less_than, is_less_than_beta;
1590  v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1591  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1592  v16u8 is_less_than_alpha, is_bs_greater_than0;
1593  v8i16 p0_r, q0_r, p0_l, q0_l;
1594  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1595  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1596  v16i8 zero = { 0 };
1597  v16i8 tc = { 0 };
1598 
      /* Same per-group broadcast for the tc clip values. */
1599  tmp_vec = (v16u8) __msa_fill_b(tc0);
1600  tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1601  tmp_vec = (v16u8) __msa_fill_b(tc1);
1602  tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1603  tmp_vec = (v16u8) __msa_fill_b(tc2);
1604  tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1605  tmp_vec = (v16u8) __msa_fill_b(tc3);
1606  tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1607 
1608  alpha = (v16u8) __msa_fill_b(alpha_in);
1609  beta = (v16u8) __msa_fill_b(beta_in);
1610 
      /* Horizontal edge: rows p2..q1 load directly, no transpose needed. */
1611  LD_UB5(data - (3 * image_width), image_width,
1612  p2_org, p1_org, p0_org, q0_org, q1_org);
1613 
1614  is_bs_greater_than0 = ((v16u8) zero < bs);
1615  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1616  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1617  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1618 
      /* Enable mask: alpha/beta thresholds AND bs > 0. */
1619  is_less_than_alpha = (p0_asub_q0 < alpha);
1620  is_less_than_beta = (p1_asub_p0 < beta);
1621  is_less_than = is_less_than_beta & is_less_than_alpha;
1622  is_less_than_beta = (q1_asub_q0 < beta);
1623  is_less_than = is_less_than_beta & is_less_than;
1624  is_less_than = is_less_than & is_bs_greater_than0;
1625 
1626  if (!__msa_test_bz_v(is_less_than)) {
1627  v16i8 sign_negate_tc, negate_tc;
1628  v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1629  v16u8 p2_asub_p0, q2_asub_q0;
1630 
      /* q2 only needed once some lane passes; load it lazily here. */
1631  q2_org = LD_UB(data + (2 * image_width));
1632  negate_tc = zero - tc;
1633  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1634 
1635  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1636 
1637  UNPCK_UB_SH(tc, tc_r, tc_l);
1638  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1639  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1640  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1641 
      /* p1 filtering where |p2-p0| < beta; each filtered lane bumps tc. */
1642  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1643  is_less_than_beta = (p2_asub_p0 < beta);
1644  is_less_than_beta = is_less_than_beta & is_less_than;
1645 
1646  if (!__msa_test_bz_v(is_less_than_beta)) {
1647  v16u8 p1;
1648  v8i16 p1_r = { 0 };
1649  v8i16 p1_l = { 0 };
1650  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1651  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1652 
1653  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1654  negate_tc_r, tc_r, p1_r);
1655  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1656  i16_negatetc_l, tc_l, p1_l);
1657 
1658  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1659  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1660  ST_UB(p1_org, data - (2 * image_width));
1661 
1662  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1663  tc = tc + (v16i8) is_less_than_beta;
1664  }
1665 
      /* q1 filtering where |q2-q0| < beta; same tc bump. */
1666  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1667  is_less_than_beta = (q2_asub_q0 < beta);
1668  is_less_than_beta = is_less_than_beta & is_less_than;
1669 
1670  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1671  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1672 
1673  if (!__msa_test_bz_v(is_less_than_beta)) {
1674  v16u8 q1;
1675  v8i16 q1_r = { 0 };
1676  v8i16 q1_l = { 0 };
1677  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1678  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1679 
1680  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1681  negate_tc_r, tc_r, q1_r);
1682  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1683  i16_negatetc_l, tc_l, q1_l);
1684 
1685  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1686  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1687  ST_UB(q1_org, data + image_width);
1688 
1689  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1690  tc = tc + (v16i8) is_less_than_beta;
1691  }
      /* p0/q0 filtering clipped to the (possibly bumped) +/-tc range. */
1692  {
1693  v16i8 negate_thresh, sign_negate_thresh;
1694  v8i16 threshold_r, threshold_l;
1695  v8i16 negate_thresh_l, negate_thresh_r;
1696 
1697  negate_thresh = zero - tc;
1698  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1699 
1700  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1701  threshold_r, negate_thresh_r);
1702  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1703  negate_thresh_r, threshold_r, p0_r, q0_r);
1704 
1705  threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1706  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1707  negate_thresh);
1708  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1709  negate_thresh_l, threshold_l, p0_l, q0_l);
1710  }
1711 
1712  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1713 
1714  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1715  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1716 
1717  ST_UB(p0_org, (data - image_width));
1718  ST_UB(q0_org, data);
1719  }
1720  }
1721 }
1722 
1724  int32_t alpha_in, int32_t beta_in,
1725  int8_t *tc0)
1726 {
1727  uint8_t *data = in;
1728  uint32_t out0, out1, out2, out3;
1729  uint64_t load;
1730  uint32_t tc_val;
1731  v16u8 alpha, beta;
1732  v16i8 inp0 = { 0 };
1733  v16i8 inp1 = { 0 };
1734  v16i8 inp2 = { 0 };
1735  v16i8 inp3 = { 0 };
1736  v16i8 inp4 = { 0 };
1737  v16i8 inp5 = { 0 };
1738  v16i8 inp6 = { 0 };
1739  v16i8 inp7 = { 0 };
1740  v16i8 src0, src1, src2, src3;
1741  v8i16 src4, src5, src6, src7;
1742  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1743  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1744  v16u8 is_less_than_beta1, is_less_than_beta2;
1745  v8i16 tc, tc_orig_r, tc_plus1;
1746  v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1747  v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1748  v8i16 src2_r, src3_r;
1749  v8i16 p2_r, p1_r, q2_r, q1_r;
1750  v16u8 p2, q2, p0, q0;
1751  v4i32 dst0, dst1;
1752  v16i8 zeros = { 0 };
1753 
1754  alpha = (v16u8) __msa_fill_b(alpha_in);
1755  beta = (v16u8) __msa_fill_b(beta_in);
1756 
1757  if (tc0[0] < 0) {
1758  data += (2 * stride);
1759  } else {
1760  load = LD(data - 3);
1761  inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1762  load = LD(data - 3 + stride);
1763  inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1764  data += (2 * stride);
1765  }
1766 
1767  if (tc0[1] < 0) {
1768  data += (2 * stride);
1769  } else {
1770  load = LD(data - 3);
1771  inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1772  load = LD(data - 3 + stride);
1773  inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1774  data += (2 * stride);
1775  }
1776 
1777  if (tc0[2] < 0) {
1778  data += (2 * stride);
1779  } else {
1780  load = LD(data - 3);
1781  inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1782  load = LD(data - 3 + stride);
1783  inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1784  data += (2 * stride);
1785  }
1786 
1787  if (tc0[3] < 0) {
1788  data += (2 * stride);
1789  } else {
1790  load = LD(data - 3);
1791  inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1792  load = LD(data - 3 + stride);
1793  inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1794  data += (2 * stride);
1795  }
1796 
1797  ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1798  src0, src1, src2, src3);
1799 
1800  ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1801  ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1802 
1803  src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1804  src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1805  src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1806  src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1807  src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1808  src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1809 
1810  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1811  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1812  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1813  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1814  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1815 
1816  is_less_than_alpha = (p0_asub_q0 < alpha);
1817  is_less_than_beta = (p1_asub_p0 < beta);
1818  is_less_than = is_less_than_alpha & is_less_than_beta;
1819  is_less_than_beta = (q1_asub_q0 < beta);
1820  is_less_than = is_less_than_beta & is_less_than;
1821 
1822  is_less_than_beta1 = (p2_asub_p0 < beta);
1823  is_less_than_beta2 = (q2_asub_q0 < beta);
1824 
1825  p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1826  p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1827  p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1828 
1829  ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1830  p2_r += p0_add_q0;
1831  p2_r >>= 1;
1832  p2_r -= p1_r;
1833  ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1834  q2_r += p0_add_q0;
1835  q2_r >>= 1;
1836  q2_r -= q1_r;
1837 
1838  tc_val = LW(tc0);
1839  tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1840  tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1841  is_tc_orig1 = tc_orig;
1842  is_tc_orig2 = tc_orig;
1843  tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1844  tc = tc_orig_r;
1845 
1846  CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1847  CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1848 
1849  p2_r += p1_r;
1850  q2_r += q1_r;
1851 
1852  PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1853 
1854  is_tc_orig1 = (zeros < is_tc_orig1);
1855  is_tc_orig2 = is_tc_orig1;
1856  is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1857  is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1858  is_tc_orig1 = is_less_than & is_tc_orig1;
1859  is_tc_orig2 = is_less_than & is_tc_orig2;
1860 
1861  p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1862  q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1863 
1864  q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1865  q0_sub_p0 <<= 2;
1866  p1_sub_q1 = p1_r - q1_r;
1867  q0_sub_p0 += p1_sub_q1;
1868  q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1869 
1870  tc_plus1 = tc + 1;
1871  is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1872  (v16i8) is_less_than_beta1);
1873  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1874  tc_plus1 = tc + 1;
1875  is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1876  (v16i8) is_less_than_beta2);
1877  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1878 
1879  CLIP_SH(q0_sub_p0, -tc, tc);
1880 
1881  ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1882  src2_r += q0_sub_p0;
1883  src3_r -= q0_sub_p0;
1884 
1885  CLIP_SH2_0_255(src2_r, src3_r);
1886 
1887  PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1888 
1889  p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1890  q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1891 
1892  ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1893 
1894  ILVRL_H2_SW(q2, p2, dst0, dst1);
1895 
1896  data = in;
1897 
1898  out0 = __msa_copy_u_w(dst0, 0);
1899  out1 = __msa_copy_u_w(dst0, 1);
1900  out2 = __msa_copy_u_w(dst0, 2);
1901  out3 = __msa_copy_u_w(dst0, 3);
1902 
1903  if (tc0[0] < 0) {
1904  data += (2 * stride);
1905  } else {
1906  SW(out0, (data - 2));
1907  data += stride;
1908  SW(out1, (data - 2));
1909  data += stride;
1910  }
1911 
1912  if (tc0[1] < 0) {
1913  data += (2 * stride);
1914  } else {
1915  SW(out2, (data - 2));
1916  data += stride;
1917  SW(out3, (data - 2));
1918  data += stride;
1919  }
1920 
1921  out0 = __msa_copy_u_w(dst1, 0);
1922  out1 = __msa_copy_u_w(dst1, 1);
1923  out2 = __msa_copy_u_w(dst1, 2);
1924  out3 = __msa_copy_u_w(dst1, 3);
1925 
1926  if (tc0[2] < 0) {
1927  data += (2 * stride);
1928  } else {
1929  SW(out0, (data - 2));
1930  data += stride;
1931  SW(out1, (data - 2));
1932  data += stride;
1933  }
1934 
1935  if (tc0[3] >= 0) {
1936  SW(out2, (data - 2));
1937  data += stride;
1938  SW(out3, (data - 2));
1939  }
1940 }
1941 
1943  uint8_t bs0, uint8_t bs1,
1944  uint8_t bs2, uint8_t bs3,
1945  uint8_t tc0, uint8_t tc1,
1946  uint8_t tc2, uint8_t tc3,
1947  uint8_t alpha_in,
1948  uint8_t beta_in,
1949  ptrdiff_t img_width)
1950 {
1951  v16u8 alpha, beta;
1952  v8i16 tmp_vec;
1953  v8i16 bs = { 0 };
1954  v8i16 tc = { 0 };
1955  v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1956  v16u8 is_less_than;
1957  v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1958  v8i16 p0_r, q0_r;
1959  v16u8 p1_org, p0_org, q0_org, q1_org;
1960  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1961  v16i8 negate_tc, sign_negate_tc;
1962  v8i16 tc_r, negate_tc_r;
1963  v16i8 zero = { 0 };
1964 
1965  tmp_vec = (v8i16) __msa_fill_b(bs0);
1966  bs = __msa_insve_h(bs, 0, tmp_vec);
1967  tmp_vec = (v8i16) __msa_fill_b(bs1);
1968  bs = __msa_insve_h(bs, 1, tmp_vec);
1969  tmp_vec = (v8i16) __msa_fill_b(bs2);
1970  bs = __msa_insve_h(bs, 2, tmp_vec);
1971  tmp_vec = (v8i16) __msa_fill_b(bs3);
1972  bs = __msa_insve_h(bs, 3, tmp_vec);
1973 
1974  if (!__msa_test_bz_v((v16u8) bs)) {
1975  tmp_vec = (v8i16) __msa_fill_b(tc0);
1976  tc = __msa_insve_h(tc, 0, tmp_vec);
1977  tmp_vec = (v8i16) __msa_fill_b(tc1);
1978  tc = __msa_insve_h(tc, 1, tmp_vec);
1979  tmp_vec = (v8i16) __msa_fill_b(tc2);
1980  tc = __msa_insve_h(tc, 2, tmp_vec);
1981  tmp_vec = (v8i16) __msa_fill_b(tc3);
1982  tc = __msa_insve_h(tc, 3, tmp_vec);
1983 
1984  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1985 
1986  alpha = (v16u8) __msa_fill_b(alpha_in);
1987  beta = (v16u8) __msa_fill_b(beta_in);
1988 
1989  LD_UB4(data - (img_width << 1), img_width,
1990  p1_org, p0_org, q0_org, q1_org);
1991 
1992  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1993  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1994  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1995 
1996  is_less_than_alpha = (p0_asub_q0 < alpha);
1997  is_less_than_beta = (p1_asub_p0 < beta);
1998  is_less_than = is_less_than_beta & is_less_than_alpha;
1999  is_less_than_beta = (q1_asub_q0 < beta);
2000  is_less_than = is_less_than_beta & is_less_than;
2001  is_less_than = is_less_than & is_bs_greater_than0;
2002 
2003  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2004 
2005  if (!__msa_test_bz_v(is_less_than)) {
2006  negate_tc = zero - (v16i8) tc;
2007  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2008 
2009  ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2010 
2011  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2012  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2013 
2014  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2015  tc_r, p0_r, q0_r);
2016 
2017  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2018 
2019  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2020  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2021 
2022  ST_UB(q0_org, data);
2023  ST_UB(p0_org, (data - img_width));
2024  }
2025  }
2026 }
2027 
2029  uint8_t bs0, uint8_t bs1,
2030  uint8_t bs2, uint8_t bs3,
2031  uint8_t tc0, uint8_t tc1,
2032  uint8_t tc2, uint8_t tc3,
2033  uint8_t alpha_in,
2034  uint8_t beta_in,
2035  ptrdiff_t img_width)
2036 {
2037  uint8_t *src;
2038  v16u8 alpha, beta;
2039  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2040  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2041  v16u8 p0, q0;
2042  v8i16 p0_r = { 0 };
2043  v8i16 q0_r = { 0 };
2044  v16u8 p1_org, p0_org, q0_org, q1_org;
2045  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2046  v16u8 is_bs_greater_than0;
2047  v8i16 tc_r, negate_tc_r;
2048  v16i8 negate_tc, sign_negate_tc;
2049  v16i8 zero = { 0 };
2050  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2051  v8i16 tmp1, tmp_vec, bs = { 0 };
2052  v8i16 tc = { 0 };
2053 
2054  tmp_vec = (v8i16) __msa_fill_b(bs0);
2055  bs = __msa_insve_h(bs, 0, tmp_vec);
2056  tmp_vec = (v8i16) __msa_fill_b(bs1);
2057  bs = __msa_insve_h(bs, 1, tmp_vec);
2058  tmp_vec = (v8i16) __msa_fill_b(bs2);
2059  bs = __msa_insve_h(bs, 2, tmp_vec);
2060  tmp_vec = (v8i16) __msa_fill_b(bs3);
2061  bs = __msa_insve_h(bs, 3, tmp_vec);
2062 
2063  if (!__msa_test_bz_v((v16u8) bs)) {
2064  tmp_vec = (v8i16) __msa_fill_b(tc0);
2065  tc = __msa_insve_h(tc, 0, tmp_vec);
2066  tmp_vec = (v8i16) __msa_fill_b(tc1);
2067  tc = __msa_insve_h(tc, 1, tmp_vec);
2068  tmp_vec = (v8i16) __msa_fill_b(tc2);
2069  tc = __msa_insve_h(tc, 2, tmp_vec);
2070  tmp_vec = (v8i16) __msa_fill_b(tc3);
2071  tc = __msa_insve_h(tc, 3, tmp_vec);
2072 
2073  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2074 
2075  LD_UB8((data - 2), img_width,
2076  row0, row1, row2, row3, row4, row5, row6, row7);
2077 
2078  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2079  row4, row5, row6, row7,
2080  p1_org, p0_org, q0_org, q1_org);
2081 
2082  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2083  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2084  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2085 
2086  alpha = (v16u8) __msa_fill_b(alpha_in);
2087  beta = (v16u8) __msa_fill_b(beta_in);
2088 
2089  is_less_than_alpha = (p0_asub_q0 < alpha);
2090  is_less_than_beta = (p1_asub_p0 < beta);
2091  is_less_than = is_less_than_beta & is_less_than_alpha;
2092  is_less_than_beta = (q1_asub_q0 < beta);
2093  is_less_than = is_less_than_beta & is_less_than;
2094  is_less_than = is_bs_greater_than0 & is_less_than;
2095 
2096  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2097 
2098  if (!__msa_test_bz_v(is_less_than)) {
2099  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2100  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2101 
2102  negate_tc = zero - (v16i8) tc;
2103  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2104 
2105  ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2106 
2107  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2108  tc_r, p0_r, q0_r);
2109 
2110  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2111 
2112  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2113  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2114  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2115  src = data - 1;
2116  ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
2117  src += 4 * img_width;
2118  ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
2119  }
2120  }
2121 }
2122 
2124  int32_t alpha_in, int32_t beta_in,
2125  int8_t *tc0)
2126 {
2127  int32_t col, tc_val;
2128  v16u8 alpha, beta, res;
2129 
2130  alpha = (v16u8) __msa_fill_b(alpha_in);
2131  beta = (v16u8) __msa_fill_b(beta_in);
2132 
2133  for (col = 0; col < 4; col++) {
2134  tc_val = (tc0[col] - 1) + 1;
2135 
2136  if (tc_val <= 0) {
2137  src += (4 * stride);
2138  continue;
2139  }
2140 
2141  AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2142  ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2143  src += (4 * stride);
2144  }
2145 }
2146 
2148  ptrdiff_t stride,
2149  int32_t alpha_in,
2150  int32_t beta_in,
2151  int8_t *tc0)
2152 {
2153  int32_t col, tc_val;
2154  int16_t out0, out1;
2155  v16u8 alpha, beta, res;
2156 
2157  alpha = (v16u8) __msa_fill_b(alpha_in);
2158  beta = (v16u8) __msa_fill_b(beta_in);
2159 
2160  for (col = 0; col < 4; col++) {
2161  tc_val = (tc0[col] - 1) + 1;
2162 
2163  if (tc_val <= 0) {
2164  src += 4 * stride;
2165  continue;
2166  }
2167 
2168  AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2169 
2170  out0 = __msa_copy_s_h((v8i16) res, 0);
2171  out1 = __msa_copy_s_h((v8i16) res, 1);
2172 
2173  SH(out0, (src - 1));
2174  src += stride;
2175  SH(out1, (src - 1));
2176  src += stride;
2177  }
2178 }
2179 
2180 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2181  int alpha, int beta, int8_t *tc)
2182 {
2183  uint8_t bs0 = 1;
2184  uint8_t bs1 = 1;
2185  uint8_t bs2 = 1;
2186  uint8_t bs3 = 1;
2187 
2188  if (tc[0] < 0)
2189  bs0 = 0;
2190  if (tc[1] < 0)
2191  bs1 = 0;
2192  if (tc[2] < 0)
2193  bs2 = 0;
2194  if (tc[3] < 0)
2195  bs3 = 0;
2196 
2197  avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2198  tc[0], tc[1], tc[2], tc[3],
2199  alpha, beta, img_width);
2200 }
2201 
2202 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2203  int alpha, int beta, int8_t *tc)
2204 {
2205 
2206  uint8_t bs0 = 1;
2207  uint8_t bs1 = 1;
2208  uint8_t bs2 = 1;
2209  uint8_t bs3 = 1;
2210 
2211  if (tc[0] < 0)
2212  bs0 = 0;
2213  if (tc[1] < 0)
2214  bs1 = 0;
2215  if (tc[2] < 0)
2216  bs2 = 0;
2217  if (tc[3] < 0)
2218  bs3 = 0;
2219 
2220  avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2221  tc[0], tc[1], tc[2], tc[3],
2222  alpha, beta, img_width);
2223 }
2224 
2225 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2226  int alpha, int beta, int8_t *tc)
2227 {
2228  uint8_t bs0 = 1;
2229  uint8_t bs1 = 1;
2230  uint8_t bs2 = 1;
2231  uint8_t bs3 = 1;
2232 
2233  if (tc[0] < 0)
2234  bs0 = 0;
2235  if (tc[1] < 0)
2236  bs1 = 0;
2237  if (tc[2] < 0)
2238  bs2 = 0;
2239  if (tc[3] < 0)
2240  bs3 = 0;
2241 
2242  avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2243  tc[0], tc[1], tc[2], tc[3],
2244  alpha, beta, img_width);
2245 }
2246 
2247 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2248  int alpha, int beta, int8_t *tc)
2249 {
2250  uint8_t bs0 = 1;
2251  uint8_t bs1 = 1;
2252  uint8_t bs2 = 1;
2253  uint8_t bs3 = 1;
2254 
2255  if (tc[0] < 0)
2256  bs0 = 0;
2257  if (tc[1] < 0)
2258  bs1 = 0;
2259  if (tc[2] < 0)
2260  bs2 = 0;
2261  if (tc[3] < 0)
2262  bs3 = 0;
2263 
2264  avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2265  tc[0], tc[1], tc[2], tc[3],
2266  alpha, beta, img_width);
2267 }
2268 
2269 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2270  int alpha, int beta)
2271 {
2273  (uint8_t) beta,
2274  img_width);
2275 }
2276 
2277 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2278  int alpha, int beta)
2279 {
2281  (uint8_t) beta,
2282  img_width);
2283 }
2284 
2285 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2286  int alpha, int beta)
2287 {
2289  (uint8_t) beta,
2290  img_width);
2291 }
2292 
2293 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2294  int alpha, int beta)
2295 {
2297  (uint8_t) beta,
2298  img_width);
2299 }
2300 
2302  ptrdiff_t ystride,
2303  int32_t alpha, int32_t beta,
2304  int8_t *tc0)
2305 {
2306  avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2307 }
2308 
2310  ptrdiff_t ystride,
2311  int32_t alpha,
2312  int32_t beta,
2313  int8_t *tc0)
2314 {
2315  avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2316 }
2317 
2319  ptrdiff_t ystride,
2320  int32_t alpha,
2321  int32_t beta,
2322  int8_t *tc0)
2323 {
2324  avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2325 }
2326 
2328  ptrdiff_t ystride,
2329  int32_t alpha,
2330  int32_t beta)
2331 {
2332  avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2333 }
2334 
2336  int height, int log2_denom,
2337  int weight_src, int offset_in)
2338 {
2339  uint32_t offset_val;
2340  v16i8 zero = { 0 };
2341  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2342  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2343  v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2344  v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2345  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2346  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2347  v8i16 wgt, denom, offset;
2348 
2349  offset_val = (unsigned) offset_in << log2_denom;
2350 
2351  wgt = __msa_fill_h(weight_src);
2352  offset = __msa_fill_h(offset_val);
2353  denom = __msa_fill_h(log2_denom);
2354 
2355  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2356  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2357  src2_r, src3_r);
2358  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2359  src2_l, src3_l);
2360  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2361  src6_r, src7_r);
2362  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2363  src6_l, src7_l);
2364  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2365  tmp3);
2366  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2367  tmp7);
2368  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2369  tmp11);
2370  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2371  tmp14, tmp15);
2372  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2373  tmp1, tmp2, tmp3);
2374  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2375  tmp5, tmp6, tmp7);
2376  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2377  tmp9, tmp10, tmp11);
2378  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2379  tmp12, tmp13, tmp14, tmp15);
2380  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2381  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2382  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2383  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2384  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2385  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2386  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2387  dst2, dst3);
2388  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2389  dst5, dst6, dst7);
2390  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2391  src += 8 * stride;
2392 
2393  if (16 == height) {
2394  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2395  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2396  src1_r, src2_r, src3_r);
2397  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2398  src1_l, src2_l, src3_l);
2399  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2400  src5_r, src6_r, src7_r);
2401  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2402  src5_l, src6_l, src7_l);
2403  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2404  tmp2, tmp3);
2405  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2406  tmp6, tmp7);
2407  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2408  tmp10, tmp11);
2409  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2410  tmp14, tmp15);
2411  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2412  tmp0, tmp1, tmp2, tmp3);
2413  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2414  tmp4, tmp5, tmp6, tmp7);
2415  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2416  tmp8, tmp9, tmp10, tmp11);
2417  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2418  tmp12, tmp13, tmp14, tmp15);
2419  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2420  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2421  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2422  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2423  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2424  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2425  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2426  dst2, dst3);
2427  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2428  dst5, dst6, dst7);
2429  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2430  }
2431 }
2432 
2434  int height, int log2_denom,
2435  int weight_src, int offset)
2436 {
2437  if (4 == height) {
2438  avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2439  } else if (8 == height) {
2440  avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2441  } else {
2442  avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2443  }
2444 }
2445 
2447  int height, int log2_denom,
2448  int weight_src, int offset)
2449 {
2450  if (2 == height) {
2451  avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2452  } else if (4 == height) {
2453  avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2454  } else {
2455  avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2456  }
2457 }
2458 
2460  ptrdiff_t stride, int height,
2461  int log2_denom, int weight_dst,
2462  int weight_src, int offset_in)
2463 {
2464  v16i8 src_wgt, dst_wgt, wgt;
2465  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2466  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2467  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2468  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2469  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2470  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2471  v8i16 denom, offset;
2472 
2473  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2474  offset_in += (128 * (weight_src + weight_dst));
2475 
2476  src_wgt = __msa_fill_b(weight_src);
2477  dst_wgt = __msa_fill_b(weight_dst);
2478  offset = __msa_fill_h(offset_in);
2479  denom = __msa_fill_h(log2_denom + 1);
2480 
2481  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2482 
2483  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2484  src += 8 * stride;
2485  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2486  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2487  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2488  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2489  vec6);
2490  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2491  vec7);
2492  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2493  vec12, vec14);
2494  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2495  vec13, vec15);
2496  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2497  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2498  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2499  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2500  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2501  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2502  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2503  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2504  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2505  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2506  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2507  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2508  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2509  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2510  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2511  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2512  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2513  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2514  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2515  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2516  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2517  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2518  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2519  dst2, dst3);
2520  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2521  dst5, dst6, dst7);
2522  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2523  dst += 8 * stride;
2524 
2525  if (16 == height) {
2526  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2527  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2528  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2529  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2530  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2531  vec4, vec6);
2532  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2533  vec5, vec7);
2534  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2535  vec12, vec14);
2536  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2537  vec13, vec15);
2538  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2539  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2540  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2541  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2542  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2543  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2544  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2545  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2546  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2547  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2548  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2549  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2550  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2551  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2552  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2553  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2554  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2555  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2556  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2557  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2558  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2559  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2560  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2561  dst2, dst3);
2562  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2563  dst5, dst6, dst7);
2564  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2565  }
2566 }
2567 
2569  ptrdiff_t stride, int height,
2570  int log2_denom, int weight_dst,
2571  int weight_src, int offset)
2572 {
2573  if (4 == height) {
2574  avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2575  offset);
2576  } else if (8 == height) {
2577  avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2578  offset);
2579  } else {
2580  avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2581  offset);
2582  }
2583 }
2584 
2586  ptrdiff_t stride, int height,
2587  int log2_denom, int weight_dst,
2588  int weight_src, int offset)
2589 {
2590  if (2 == height) {
2591  avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2592  offset);
2593  } else if (4 == height) {
2594  avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2595  offset);
2596  } else {
2597  avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2598  offset);
2599  }
2600 }
#define MAXI_SH2_SH(...)
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:375
void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2225
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
#define XORI_B8_128_UB(...)
static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:230
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,p1_or_q1_org_in, p2_or_q2_org_in,negate_tc_in, tc_in, p1_or_q1_out)
Definition: h264dsp_msa.c:525
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:100
#define ILVRL_B2_SH(...)
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2180
#define PCKEV_B2_SH(...)
static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:2028
#define LW(psrc)
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_msa.c:2446
static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:816
#define MUL2(in0, in1, in2, in3, out0, out1)
static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:1176
#define tc
Definition: regdef.h:69
#define ILVRL_H2_SW(...)
#define PCKEV_B3_UB(...)
static const uint8_t q1[256]
Definition: twofish.c:96
#define LD_UB4(...)
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)
Definition: h264dsp_msa.c:628
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define ILVL_H2_SH(...)
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9, in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7)
#define SRA_4V(in0, in1, in2, in3, shift)
#define XORI_B4_128_UB(...)
uint8_t
#define SAT_UH8_SH(...)
#define LD4(psrc, stride, out0, out1, out2, out3)
static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:184
#define UNPCK_UB_SH(in, out0, out1)
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset_in)
Definition: h264dsp_msa.c:2335
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2285
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define XORI_B2_128_UB(...)
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,p1_or_q1_org_in, p0_or_q0_out)
Definition: h264dsp_msa.c:516
#define height
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:263
#define ILVRL_H2_SH(...)
#define CLIP_SH8_0_255(in0, in1, in2, in3,in4, in5, in6, in7)
void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, ptrdiff_t ystride, int32_t alpha, int32_t beta)
Definition: h264dsp_msa.c:2327
#define LD_UB5(...)
#define CLIP_SH2_0_255(in0, in1)
void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2293
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define src
Definition: vp8dsp.c:254
#define INSERT_W2_UB(...)
static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:1225
#define zero
Definition: regdef.h:64
#define LW2(psrc, stride, out0, out1)
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:297
#define ADDS_SH2_SH(...)
#define ILVR_B2_SH(...)
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:426
static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:24
#define ILVR_W2_SB(...)
static const uint8_t q0[256]
Definition: twofish.c:77
#define CLIP_SH4_0_255(in0, in1, in2, in3)
void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, ptrdiff_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2309
#define TRANSPOSE8x4_UB_UB(...)
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:337
#define SLDI_B2_UB(...)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define LD_UB8(...)
void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_msa.c:2585
static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, ptrdiff_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:2147
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
int32_t
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,p1_or_q1_org_in, q1_or_p1_org_in,negate_threshold_in, threshold_in,p0_or_q0_out, q0_or_p0_out)
Definition: h264dsp_msa.c:540
static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:142
#define ILVR_B4_UH(...)
#define ILVL_B4_SH(...)
#define ST_UB(...)
static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:78
#define PCKEV_B4_UB(...)
#define SRLR_H8_SH(...)
#define INSERT_W4_UB(...)
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_msa.c:2568
static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:110
#define ILVL_B2_SH(...)
void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2269
#define ST_UB8(...)
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:2123
static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t image_width)
Definition: h264dsp_msa.c:1567
#define SAT_UH2_SH(...)
#define src1
Definition: h264pred.c:139
void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2247
#define SAT_UH4_SH(...)
#define ILVL_B4_SB(...)
static const int16_t alpha[]
Definition: ilbcdata.h:55
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,q3_or_p3_org_in, p1_or_q1_org_in,p2_or_q2_org_in, q1_or_p1_org_in,p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)
Definition: h264dsp_msa.c:489
void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2301
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
#define MAXI_SH8_SH(...)
#define SRLR_H4_SH(...)
static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, ptrdiff_t stride, int32_t alpha_in, int32_t beta_in)
Definition: h264dsp_msa.c:976
#define ILVR_B4_SH(...)
#define CLIP_SH(in, min, max)
#define src0
Definition: h264pred.c:138
#define LD(psrc)
#define SH(val, pdst)
void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2202
#define SW(val, pdst)
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)
Definition: h264dsp_msa.c:562
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_msa.c:2433
#define ILVR_W2_UB(...)
void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, ptrdiff_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2318
#define ILVL_W2_SB(...)
#define INSERT_D2_UB(...)
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:104
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ILVRL_B2_SB(...)
#define ILVR_H2_SH(...)
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2277
#define LD_UB(...)
static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:683
static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:1723
#define MAXI_SH4_SH(...)
static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:51
static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:1287
#define ILVR_B4_SB(...)
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset_in)
Definition: h264dsp_msa.c:2459
#define stride
#define ST_W2(in, idx0, idx1, pdst, stride)
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, ptrdiff_t img_width)
Definition: h264dsp_msa.c:1942
#define PCKEV_B2_UB(...)
#define ILVR_B2_UB(...)
#define ADDS_SH4_SH(...)
#define SLDI_B4_SB(...)