/* FFmpeg: h264dsp_msa.c — H.264 DSP functions optimized for MIPS MSA (SIMD). */
/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 
22 #include "h264dsp_mips.h"
23 
25  int32_t log2_denom, int32_t src_weight,
26  int32_t offset_in)
27 {
28  uint32_t tp0, tp1, offset_val;
29  v16u8 zero = { 0 };
30  v16u8 src0 = { 0 };
31  v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33  offset_val = (unsigned) offset_in << log2_denom;
34 
35  wgt = __msa_fill_h(src_weight);
36  offset = __msa_fill_h(offset_val);
37  denom = __msa_fill_h(log2_denom);
38 
39  LW2(data, stride, tp0, tp1);
40  INSERT_W2_UB(tp0, tp1, src0);
41  src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42  tmp0 = wgt * src0_r;
43  tmp0 = __msa_adds_s_h(tmp0, offset);
44  tmp0 = __msa_maxi_s_h(tmp0, 0);
45  tmp0 = __msa_srlr_h(tmp0, denom);
46  tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47  src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48  ST_W2(src0, 0, 1, data, stride);
49 }
50 
51 static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
52  int32_t src_weight, int32_t offset_in)
53 {
54  uint32_t tp0, tp1, tp2, tp3, offset_val;
55  v16u8 src0 = { 0 };
56  v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
57 
58  offset_val = (unsigned) offset_in << log2_denom;
59 
60  wgt = __msa_fill_h(src_weight);
61  offset = __msa_fill_h(offset_val);
62  denom = __msa_fill_h(log2_denom);
63 
64  LW4(data, stride, tp0, tp1, tp2, tp3);
65  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
66  UNPCK_UB_SH(src0, src0_r, src1_r);
67  MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
68  ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
69  MAXI_SH2_SH(tmp0, tmp1, 0);
70  tmp0 = __msa_srlr_h(tmp0, denom);
71  tmp1 = __msa_srlr_h(tmp1, denom);
72  SAT_UH2_SH(tmp0, tmp1, 7);
73  src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
74  ST_W4(src0, 0, 1, 2, 3, data, stride);
75 }
76 
77 static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
78  int32_t src_weight, int32_t offset_in)
79 {
80  uint32_t tp0, tp1, tp2, tp3, offset_val;
81  v16u8 src0 = { 0 }, src1 = { 0 };
82  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
83  v8i16 wgt, denom, offset;
84 
85  offset_val = (unsigned) offset_in << log2_denom;
86 
87  wgt = __msa_fill_h(src_weight);
88  offset = __msa_fill_h(offset_val);
89  denom = __msa_fill_h(log2_denom);
90 
91  LW4(data, stride, tp0, tp1, tp2, tp3);
92  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
93  LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
94  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
95  UNPCK_UB_SH(src0, src0_r, src1_r);
96  UNPCK_UB_SH(src1, src2_r, src3_r);
97  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
98  tmp3);
99  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
100  tmp1, tmp2, tmp3);
101  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
102  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
103  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
104  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
105  ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
106 }
107 
108 static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
109  int32_t src_weight, int32_t offset_in)
110 {
111  uint32_t offset_val;
112  uint64_t tp0, tp1, tp2, tp3;
113  v16u8 src0 = { 0 }, src1 = { 0 };
114  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
115  v8i16 wgt, denom, offset;
116 
117  offset_val = (unsigned) offset_in << log2_denom;
118 
119  wgt = __msa_fill_h(src_weight);
120  offset = __msa_fill_h(offset_val);
121  denom = __msa_fill_h(log2_denom);
122 
123  LD4(data, stride, tp0, tp1, tp2, tp3);
124  INSERT_D2_UB(tp0, tp1, src0);
125  INSERT_D2_UB(tp2, tp3, src1);
126  UNPCK_UB_SH(src0, src0_r, src1_r);
127  UNPCK_UB_SH(src1, src2_r, src3_r);
128  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
129  tmp3);
130  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
131  tmp1, tmp2, tmp3);
132  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
133  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
134  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
135  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
136  ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
137 }
138 
139 static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
140  int32_t src_weight, int32_t offset_in)
141 {
142  uint32_t offset_val;
143  uint64_t tp0, tp1, tp2, tp3;
144  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
145  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
146  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
147  v8i16 wgt, denom, offset;
148 
149  offset_val = (unsigned) offset_in << log2_denom;
150 
151  wgt = __msa_fill_h(src_weight);
152  offset = __msa_fill_h(offset_val);
153  denom = __msa_fill_h(log2_denom);
154 
155  LD4(data, stride, tp0, tp1, tp2, tp3);
156  INSERT_D2_UB(tp0, tp1, src0);
157  INSERT_D2_UB(tp2, tp3, src1);
158  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
159  INSERT_D2_UB(tp0, tp1, src2);
160  INSERT_D2_UB(tp2, tp3, src3);
161  UNPCK_UB_SH(src0, src0_r, src1_r);
162  UNPCK_UB_SH(src1, src2_r, src3_r);
163  UNPCK_UB_SH(src2, src4_r, src5_r);
164  UNPCK_UB_SH(src3, src6_r, src7_r);
165  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
166  tmp3);
167  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
168  tmp7);
169  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
170  tmp1, tmp2, tmp3);
171  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
172  tmp5, tmp6, tmp7);
173  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
174  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
175  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
176  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
177  src2, src3);
178  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
179 }
180 
181 static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
182  int32_t src_weight, int32_t offset_in)
183 {
184  uint32_t offset_val, cnt;
185  uint64_t tp0, tp1, tp2, tp3;
186  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
187  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
188  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
189  v8i16 wgt, denom, offset;
190 
191  offset_val = (unsigned) offset_in << log2_denom;
192 
193  wgt = __msa_fill_h(src_weight);
194  offset = __msa_fill_h(offset_val);
195  denom = __msa_fill_h(log2_denom);
196 
197  for (cnt = 2; cnt--;) {
198  LD4(data, stride, tp0, tp1, tp2, tp3);
199  INSERT_D2_UB(tp0, tp1, src0);
200  INSERT_D2_UB(tp2, tp3, src1);
201  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
202  INSERT_D2_UB(tp0, tp1, src2);
203  INSERT_D2_UB(tp2, tp3, src3);
204  UNPCK_UB_SH(src0, src0_r, src1_r);
205  UNPCK_UB_SH(src1, src2_r, src3_r);
206  UNPCK_UB_SH(src2, src4_r, src5_r);
207  UNPCK_UB_SH(src3, src6_r, src7_r);
208  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
209  tmp2, tmp3);
210  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
211  tmp6, tmp7);
212  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
213  tmp0, tmp1, tmp2, tmp3);
214  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
215  tmp4, tmp5, tmp6, tmp7);
216  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
217  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
218  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
219  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
220  src2, src3);
221  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
222  data += 8 * stride;
223  }
224 }
225 
227  int32_t log2_denom, int32_t src_weight,
228  int32_t dst_weight, int32_t offset_in)
229 {
230  uint32_t tp0, tp1;
231  v16i8 src_wgt, dst_wgt, wgt, vec0;
232  v16u8 src0 = { 0 }, dst0 = { 0 };
233  v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
234 
235  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
236  offset_in += (128 * (src_weight + dst_weight));
237 
238  src_wgt = __msa_fill_b(src_weight);
239  dst_wgt = __msa_fill_b(dst_weight);
240  offset = __msa_fill_h(offset_in);
241  denom = __msa_fill_h(log2_denom + 1);
242 
243  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
244 
245  LW2(src, stride, tp0, tp1);
246  INSERT_W2_UB(tp0, tp1, src0);
247  LW2(dst, stride, tp0, tp1);
248  INSERT_W2_UB(tp0, tp1, dst0);
249  XORI_B2_128_UB(src0, dst0);
250  vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
251  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
252  tmp0 >>= denom;
253  tmp0 = __msa_maxi_s_h(tmp0, 0);
254  tmp0 = __msa_min_s_h(max255, tmp0);
255  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
256  ST_W2(dst0, 0, 1, dst, stride);
257 }
258 
260  int32_t log2_denom, int32_t src_weight,
261  int32_t dst_weight, int32_t offset_in)
262 {
263  uint32_t tp0, tp1, tp2, tp3;
264  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
265  v16u8 src0, dst0;
266  v8i16 tmp0, tmp1, denom, offset;
267 
268  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
269  offset_in += (128 * (src_weight + dst_weight));
270 
271  src_wgt = __msa_fill_b(src_weight);
272  dst_wgt = __msa_fill_b(dst_weight);
273  offset = __msa_fill_h(offset_in);
274  denom = __msa_fill_h(log2_denom + 1);
275 
276  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
277 
278  LW4(src, stride, tp0, tp1, tp2, tp3);
279  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
280  LW4(dst, stride, tp0, tp1, tp2, tp3);
281  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
282  XORI_B2_128_UB(src0, dst0);
283  ILVRL_B2_SB(dst0, src0, vec0, vec1);
284  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
285  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
286  tmp0 >>= denom;
287  tmp1 >>= denom;
288  CLIP_SH2_0_255(tmp0, tmp1);
289  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
290  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
291 }
292 
294  int32_t log2_denom, int32_t src_weight,
295  int32_t dst_weight, int32_t offset_in)
296 {
297  uint32_t tp0, tp1, tp2, tp3;
298  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
299  v16u8 src0, src1, dst0, dst1;
300  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
301 
302  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
303  offset_in += (128 * (src_weight + dst_weight));
304 
305  src_wgt = __msa_fill_b(src_weight);
306  dst_wgt = __msa_fill_b(dst_weight);
307  offset = __msa_fill_h(offset_in);
308  denom = __msa_fill_h(log2_denom + 1);
309  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
310 
311  LW4(src, stride, tp0, tp1, tp2, tp3);
312  src += 4 * stride;
313  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
314  LW4(src, stride, tp0, tp1, tp2, tp3);
315  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
316  LW4(dst, stride, tp0, tp1, tp2, tp3);
317  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
318  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
319  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
320  XORI_B4_128_UB(src0, src1, dst0, dst1);
321  ILVRL_B2_SB(dst0, src0, vec0, vec1);
322  ILVRL_B2_SB(dst1, src1, vec2, vec3);
323  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
324  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
325  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
326  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
327  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
328  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
329  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
330  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
331 }
332 
334  int32_t log2_denom, int32_t src_weight,
335  int32_t dst_weight, int32_t offset_in)
336 {
337  uint64_t tp0, tp1, tp2, tp3;
338  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
339  v16u8 src0, src1, dst0, dst1;
340  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
341 
342  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
343  offset_in += (128 * (src_weight + dst_weight));
344 
345  src_wgt = __msa_fill_b(src_weight);
346  dst_wgt = __msa_fill_b(dst_weight);
347  offset = __msa_fill_h(offset_in);
348  denom = __msa_fill_h(log2_denom + 1);
349 
350  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
351 
352  LD4(src, stride, tp0, tp1, tp2, tp3);
353  INSERT_D2_UB(tp0, tp1, src0);
354  INSERT_D2_UB(tp2, tp3, src1);
355  LD4(dst, stride, tp0, tp1, tp2, tp3);
356  INSERT_D2_UB(tp0, tp1, dst0);
357  INSERT_D2_UB(tp2, tp3, dst1);
358  XORI_B4_128_UB(src0, src1, dst0, dst1);
359  ILVRL_B2_SB(dst0, src0, vec0, vec1);
360  ILVRL_B2_SB(dst1, src1, vec2, vec3);
361  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
362  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
363  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
364  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
365  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
366  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
367  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
368  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
369 }
370 
372  int32_t log2_denom, int32_t src_weight,
373  int32_t dst_weight, int32_t offset_in)
374 {
375  uint64_t tp0, tp1, tp2, tp3;
376  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
377  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
378  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
379 
380  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
381  offset_in += (128 * (src_weight + dst_weight));
382 
383  src_wgt = __msa_fill_b(src_weight);
384  dst_wgt = __msa_fill_b(dst_weight);
385  offset = __msa_fill_h(offset_in);
386  denom = __msa_fill_h(log2_denom + 1);
387  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
388 
389  LD4(src, stride, tp0, tp1, tp2, tp3);
390  INSERT_D2_UB(tp0, tp1, src0);
391  INSERT_D2_UB(tp2, tp3, src1);
392  LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
393  INSERT_D2_UB(tp0, tp1, src2);
394  INSERT_D2_UB(tp2, tp3, src3);
395  LD4(dst, stride, tp0, tp1, tp2, tp3);
396  INSERT_D2_UB(tp0, tp1, dst0);
397  INSERT_D2_UB(tp2, tp3, dst1);
398  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
399  INSERT_D2_UB(tp0, tp1, dst2);
400  INSERT_D2_UB(tp2, tp3, dst3);
401  XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
402  ILVRL_B2_SB(dst0, src0, vec0, vec1);
403  ILVRL_B2_SB(dst1, src1, vec2, vec3);
404  ILVRL_B2_SB(dst2, src2, vec4, vec5);
405  ILVRL_B2_SB(dst3, src3, vec6, vec7);
406  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
407  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
408  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
409  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
410  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
411  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
412  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
413  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
414  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
415  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
416  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
417  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
418  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
419  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
420 }
421 
423  int32_t log2_denom, int32_t src_weight,
424  int32_t dst_weight, int32_t offset_in)
425 {
426  uint8_t cnt;
427  uint64_t tp0, tp1, tp2, tp3;
428  v16i8 src_wgt, dst_wgt, wgt;
429  v16u8 src0, src1, src2, src3;
430  v16u8 dst0, dst1, dst2, dst3;
431  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
432  v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
433  v8i16 denom, offset;
434 
435  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
436  offset_in += (128 * (src_weight + dst_weight));
437 
438  src_wgt = __msa_fill_b(src_weight);
439  dst_wgt = __msa_fill_b(dst_weight);
440  offset = __msa_fill_h(offset_in);
441  denom = __msa_fill_h(log2_denom + 1);
442  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
443 
444  for (cnt = 2; cnt--;) {
445  LD4(src, stride, tp0, tp1, tp2, tp3);
446  src += 4 * stride;
447  INSERT_D2_UB(tp0, tp1, src0);
448  INSERT_D2_UB(tp2, tp3, src1);
449  LD4(src, stride, tp0, tp1, tp2, tp3);
450  src += 4 * stride;
451  INSERT_D2_UB(tp0, tp1, src2);
452  INSERT_D2_UB(tp2, tp3, src3);
453  LD4(dst, stride, tp0, tp1, tp2, tp3);
454  INSERT_D2_UB(tp0, tp1, dst0);
455  INSERT_D2_UB(tp2, tp3, dst1);
456  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
457  INSERT_D2_UB(tp0, tp1, dst2);
458  INSERT_D2_UB(tp2, tp3, dst3);
459  XORI_B4_128_UB(src0, src1, src2, src3);
460  XORI_B4_128_UB(dst0, dst1, dst2, dst3);
461  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
462  vec0, vec2, vec4, vec6);
463  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
464  vec1, vec3, vec5, vec7);
465 
466  temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
467  temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
468  temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
469  temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
470  temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
471  temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
472  temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
473  temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
474 
475  SRA_4V(temp0, temp1, temp2, temp3, denom);
476  SRA_4V(temp4, temp5, temp6, temp7, denom);
477  CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
478  PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
479  dst0, dst1, dst2, dst3);
480  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
481  dst += 8 * stride;
482  }
483 }
484 
485 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
486  q3_or_p3_org_in, p1_or_q1_org_in, \
487  p2_or_q2_org_in, q1_or_p1_org_in, \
488  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
489 { \
490  v8i16 threshold; \
491  v8i16 const3 = __msa_ldi_h(3); \
492  \
493  threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
494  threshold += (p1_or_q1_org_in); \
495  \
496  (p0_or_q0_out) = threshold << 1; \
497  (p0_or_q0_out) += (p2_or_q2_org_in); \
498  (p0_or_q0_out) += (q1_or_p1_org_in); \
499  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
500  \
501  (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
502  (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
503  \
504  (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
505  (p2_or_q2_out) += (p3_or_q3_org_in); \
506  (p2_or_q2_out) += (p3_or_q3_org_in); \
507  (p2_or_q2_out) += threshold; \
508  (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
509 }
510 
511 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
512 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
513  p1_or_q1_org_in, p0_or_q0_out) \
514 { \
515  (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
516  (p0_or_q0_out) += (p1_or_q1_org_in); \
517  (p0_or_q0_out) += (p1_or_q1_org_in); \
518  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
519 }
520 
521 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
522  p1_or_q1_org_in, p2_or_q2_org_in, \
523  negate_tc_in, tc_in, p1_or_q1_out) \
524 { \
525  v8i16 clip3, temp; \
526  \
527  clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
528  (v8u16) q0_or_p0_org_in); \
529  temp = p1_or_q1_org_in << 1; \
530  clip3 = clip3 - temp; \
531  clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
532  CLIP_SH(clip3, negate_tc_in, tc_in); \
533  p1_or_q1_out = p1_or_q1_org_in + clip3; \
534 }
535 
536 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
537  p1_or_q1_org_in, q1_or_p1_org_in, \
538  negate_threshold_in, threshold_in, \
539  p0_or_q0_out, q0_or_p0_out) \
540 { \
541  v8i16 q0_sub_p0, p1_sub_q1, delta; \
542  \
543  q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
544  p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
545  q0_sub_p0 <<= 2; \
546  p1_sub_q1 += 4; \
547  delta = q0_sub_p0 + p1_sub_q1; \
548  delta >>= 3; \
549  \
550  CLIP_SH(delta, negate_threshold_in, threshold_in); \
551  \
552  p0_or_q0_out = p0_or_q0_org_in + delta; \
553  q0_or_p0_out = q0_or_p0_org_in - delta; \
554  \
555  CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
556 }
557 
558 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
559 { \
560  uint32_t load0, load1, load2, load3; \
561  v16u8 src0 = { 0 }; \
562  v16u8 src1 = { 0 }; \
563  v16u8 src2 = { 0 }; \
564  v16u8 src3 = { 0 }; \
565  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
566  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
567  v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
568  v8i16 res0_r, res1_r; \
569  v16i8 zeros = { 0 }; \
570  v16u8 res0, res1; \
571  \
572  LW4((src - 2), stride, load0, load1, load2, load3); \
573  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
574  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
575  src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
576  src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
577  \
578  TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
579  \
580  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
581  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
582  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
583  \
584  tc = __msa_fill_h(tc_val); \
585  \
586  is_less_than_alpha = (p0_asub_q0 < alpha); \
587  is_less_than_beta = (p1_asub_p0 < beta); \
588  is_less_than = is_less_than_alpha & is_less_than_beta; \
589  is_less_than_beta = (q1_asub_q0 < beta); \
590  is_less_than = is_less_than_beta & is_less_than; \
591  \
592  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
593  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
594  \
595  q0_sub_p0 <<= 2; \
596  delta = q0_sub_p0 + p1_sub_q1; \
597  delta = __msa_srari_h(delta, 3); \
598  \
599  CLIP_SH(delta, -tc, tc); \
600  \
601  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
602  \
603  res0_r += delta; \
604  res1_r -= delta; \
605  \
606  CLIP_SH2_0_255(res0_r, res1_r); \
607  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
608  \
609  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
610  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
611  \
612  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
613 }
614 
615 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
616 { \
617  v16i8 zero_m = { 0 }; \
618  \
619  out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
620  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
621  SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
622 }
623 
624 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
625 { \
626  uint32_t load0, load1; \
627  v16u8 src0 = { 0 }; \
628  v16u8 src1 = { 0 }; \
629  v16u8 src2 = { 0 }; \
630  v16u8 src3 = { 0 }; \
631  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
632  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
633  v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
634  v16i8 zeros = { 0 }; \
635  v16u8 res0, res1; \
636  \
637  load0 = LW(src - 2); \
638  load1 = LW(src - 2 + stride); \
639  \
640  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
641  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
642  \
643  TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
644  \
645  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
646  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
647  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
648  \
649  tc = __msa_fill_h(tc_val); \
650  \
651  is_less_than_alpha = (p0_asub_q0 < alpha); \
652  is_less_than_beta = (p1_asub_p0 < beta); \
653  is_less_than = is_less_than_alpha & is_less_than_beta; \
654  is_less_than_beta = (q1_asub_q0 < beta); \
655  is_less_than = is_less_than_beta & is_less_than; \
656  \
657  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
658  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
659  \
660  q0_sub_p0 <<= 2; \
661  delta = q0_sub_p0 + p1_sub_q1; \
662  delta = __msa_srari_h(delta, 3); \
663  CLIP_SH(delta, -tc, tc); \
664  \
665  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
666  \
667  res0_r += delta; \
668  res1_r -= delta; \
669  \
670  CLIP_SH2_0_255(res0_r, res1_r); \
671  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
672  \
673  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
674  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
675  \
676  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
677 }
678 
680  uint8_t alpha_in,
681  uint8_t beta_in,
682  uint32_t img_width)
683 {
684  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
685  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
686  v16u8 p1_org, p0_org, q0_org, q1_org;
687 
688  LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
689 
690  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
691  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
692  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
693 
694  is_less_than_alpha = (p0_asub_q0 < alpha_in);
695  is_less_than_beta = (p1_asub_p0 < beta_in);
696  is_less_than = is_less_than_beta & is_less_than_alpha;
697  is_less_than_beta = (q1_asub_q0 < beta_in);
698  is_less_than = is_less_than_beta & is_less_than;
699 
700  if (!__msa_test_bz_v(is_less_than)) {
701  v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
702  v8i16 p0_r = { 0 };
703  v8i16 q0_r = { 0 };
704  v8i16 p0_l = { 0 };
705  v8i16 q0_l = { 0 };
706  v16i8 zero = { 0 };
707  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
708  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
709  v16u8 q2_org = LD_UB(data + (2 * img_width));
710  v16u8 p2_org = LD_UB(data - (3 * img_width));
711  v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
712 
713  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
714  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
715  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
716 
717  tmp_flag = (p0_asub_q0 < tmp_flag);
718 
719  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
720  is_less_than_beta = (p2_asub_p0 < beta_in);
721  is_less_than_beta = is_less_than_beta & tmp_flag;
722  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
723  is_less_than_beta = is_less_than_beta & is_less_than;
724  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
725 
726  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
727  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
728 
729  /* combine and store */
730  if (!__msa_test_bz_v(is_less_than_beta)) {
731  v8i16 p3_org_l, p3_org_r;
732  v16u8 p3_org = LD_UB(data - (img_width << 2));
733  v16u8 p2, p1;
734  v8i16 p2_r = { 0 };
735  v8i16 p2_l = { 0 };
736  v8i16 p1_r = { 0 };
737  v8i16 p1_l = { 0 };
738 
739  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
740  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
741  p2_r, q1_org_r, p0_r, p1_r, p2_r);
742 
743  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
744  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
745  p2_l, q1_org_l, p0_l, p1_l, p2_l);
746 
747  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
748 
749  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
750  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
751  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
752 
753  ST_UB(p1_org, data - (2 * img_width));
754  ST_UB(p2_org, data - (3 * img_width));
755  }
756 
757  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
758  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
759 
760  /* combine */
761  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
762  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
763 
764  ST_UB(p0_org, data - img_width);
765 
766  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
767  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
768  is_less_than_beta = (q2_asub_q0 < beta_in);
769  is_less_than_beta = is_less_than_beta & tmp_flag;
770  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
771  is_less_than_beta = is_less_than_beta & is_less_than;
772  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
773 
774  /* combine and store */
775  if (!__msa_test_bz_v(is_less_than_beta)) {
776  v8i16 q3_org_r, q3_org_l;
777  v16u8 q3_org = LD_UB(data + (3 * img_width));
778  v16u8 q1, q2;
779  v8i16 q2_r = { 0 };
780  v8i16 q2_l = { 0 };
781  v8i16 q1_r = { 0 };
782  v8i16 q1_l = { 0 };
783 
784  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
785  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
786  q2_r, p1_org_r, q0_r, q1_r, q2_r);
787 
788  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
789  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
790  q2_l, p1_org_l, q0_l, q1_l, q2_l);
791 
792  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
793  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
794  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
795  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
796 
797  ST_UB(q1_org, data + img_width);
798  ST_UB(q2_org, data + 2 * img_width);
799  }
800 
801  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
802  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
803 
804  /* combine */
805  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
806  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
807 
808  ST_UB(q0_org, data);
809  }
810 }
811 
 813  uint8_t alpha_in,
 814  uint8_t beta_in,
 815  uint32_t img_width)
 816 {
     /* Strong (intra, bS=4) H.264 luma deblocking filter across a VERTICAL
      * edge: 16 rows, pixel columns p3..q3 straddle the edge at `data`.
      * NOTE(review): the function name/first-parameter line is not visible
      * in this chunk (extraction dropped it); presumably
      * avc_loopfilter_luma_intra_edge_ver_msa(data, alpha_in, beta_in,
      * img_width) -- confirm against upstream FFmpeg. */
 817  uint8_t *src = data - 4;
 818  v16u8 alpha, beta, p0_asub_q0;
 819  v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
 820  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
 821  v16u8 p1_asub_p0, q1_asub_q0;
 822 
 823 
 824  {
 825  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 826  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
 827 
     /* Load 16 rows of 8 bytes (4 left / 4 right of the edge) and
      * transpose, so each of p3..q3 becomes one 16-lane column vector. */
 828  LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
 829  LD_UB8(src + (8 * img_width), img_width,
 830  row8, row9, row10, row11, row12, row13, row14, row15);
 831 
 832  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
 833  row4, row5, row6, row7,
 834  row8, row9, row10, row11,
 835  row12, row13, row14, row15,
 836  p3_org, p2_org, p1_org, p0_org,
 837  q0_org, q1_org, q2_org, q3_org);
 838  }
 839 
     /* Per-lane filter-enable mask:
      * |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta. */
 840  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 841  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 842  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 843 
 844  alpha = (v16u8) __msa_fill_b(alpha_in);
 845  beta = (v16u8) __msa_fill_b(beta_in);
 846 
 847  is_less_than_alpha = (p0_asub_q0 < alpha);
 848  is_less_than_beta = (p1_asub_p0 < beta);
 849  is_less_than = is_less_than_beta & is_less_than_alpha;
 850  is_less_than_beta = (q1_asub_q0 < beta);
 851  is_less_than = is_less_than_beta & is_less_than;
 852 
 853  if (!__msa_test_bz_v(is_less_than)) {
 854  v8i16 p0_r = { 0 };
 855  v8i16 q0_r = { 0 };
 856  v8i16 p0_l = { 0 };
 857  v8i16 q0_l = { 0 };
 858  v16i8 zero = { 0 };
 859  v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
 860  v16u8 negate_is_less_than_beta;
 861  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 862  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 863 
     /* Widen the 8-bit columns to 16-bit (right/left halves) for the
      * filter arithmetic. */
 864  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
 865  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
 866  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
 867  UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
 868 
     /* Strong-filter gate: |p0-q0| < (alpha >> 2) + 2. */
 869  tmp_flag = alpha >> 2;
 870  tmp_flag = tmp_flag + 2;
 871  tmp_flag = (p0_asub_q0 < tmp_flag);
 872 
     /* p side: where additionally |p2-p0| < beta, apply the 3-output
      * strong filter (updates p0,p1,p2); elsewhere fall back to the
      * single-tap p0 filter computed after this branch. */
 873  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 874  is_less_than_beta = (p2_asub_p0 < beta);
 875  is_less_than_beta = tmp_flag & is_less_than_beta;
 876  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 877  is_less_than_beta = is_less_than_beta & is_less_than;
 878  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 879 
 880  if (!__msa_test_bz_v(is_less_than_beta)) {
 881  v16u8 p2, p1;
 882  v8i16 p3_org_r, p3_org_l;
 883  v8i16 p2_l = { 0 };
 884  v8i16 p2_r = { 0 };
 885  v8i16 p1_l = { 0 };
 886  v8i16 p1_r = { 0 };
 887 
 888  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
 889  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
 890  p2_r, q1_org_r, p0_r, p1_r, p2_r);
 891 
 892  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
 893  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
 894  p2_l, q1_org_l, p0_l, p1_l, p2_l);
 895 
     /* Pack back to 8 bit and merge only the lanes that qualified. */
 896  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
 897  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
 898  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 899  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
 900  }
 901 
     /* Fallback p0-only filter for lanes that failed the p-side gate. */
 902  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
 903  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
 904 
 905  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
 906  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
 907 
     /* q side: mirrored logic with |q2-q0| < beta. */
 908  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 909  is_less_than_beta = (q2_asub_q0 < beta);
 910 
 911  is_less_than_beta = is_less_than_beta & tmp_flag;
 912  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 913 
 914  is_less_than_beta = is_less_than_beta & is_less_than;
 915  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 916 
 917  if (!__msa_test_bz_v(is_less_than_beta)) {
 918  v16u8 q1, q2;
 919  v8i16 q3_org_r, q3_org_l;
 920  v8i16 q1_l = { 0 };
 921  v8i16 q1_r = { 0 };
 922  v8i16 q2_l = { 0 };
 923  v8i16 q2_r = { 0 };
 924 
 925  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
 926  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
 927  q2_r, p1_org_r, q0_r, q1_r, q2_r);
 928 
 929  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
 930  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
 931  q2_l, p1_org_l, q0_l, q1_l, q2_l);
 932 
 933  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
 934  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
 935  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 936  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
 937  }
 938 
     /* Fallback q0-only filter for lanes that failed the q-side gate. */
 939  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
 940  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
 941 
 942  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
 943  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
 944 
 945  {
 946  v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 947 
     /* Transpose the six modified columns (p2..q2) back to row order
      * and store 6 bytes per row: a 4-byte store at data-3 plus a
      * 2-byte store at data+1. */
 948  ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
 949  ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
 950  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
 951 
 952  ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
 953  ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
 954 
 955  src = data - 3;
 956  ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
 957  ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
 958  src += 4 * img_width;
 959  ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
 960  ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
 961  src += 4 * img_width;
 962 
 963  ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
 964  ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
 965  src += 4 * img_width;
 966  ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
 967  ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
 968  }
 969  }
 970 }
971 
 973  int32_t alpha_in,
 974  int32_t beta_in)
 975 {
     /* Strong (intra) H.264 luma deblock across a vertical edge for an
      * 8-row (MBAFF) block, fully branchless: both the strong and the
      * fallback results are computed for all lanes and selected by mask.
      * NOTE(review): name/first-parameter line not visible; presumably
      * avc_h_loop_filter_luma_mbaff_intra_msa(src, stride, ...). */
 976  uint64_t load0, load1;
 977  uint32_t out0, out2;
 978  uint16_t out1, out3;
 979  v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
 980  v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
 981  v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
 982  v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
 983  v8i16 tmp0, tmp1, tmp2, tmp3;
 984  v16u8 alpha, beta;
 985  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
 986  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
 987  v16u8 is_less_than_beta1, is_less_than_beta2;
 988  v16i8 src0 = { 0 };
 989  v16i8 src1 = { 0 };
 990  v16i8 src2 = { 0 };
 991  v16i8 src3 = { 0 };
 992  v16i8 src4 = { 0 };
 993  v16i8 src5 = { 0 };
 994  v16i8 src6 = { 0 };
 995  v16i8 src7 = { 0 };
 996  v16i8 zeros = { 0 };
 997 
     /* Load 8 rows of 8 bytes (4 left / 4 right of the edge). */
 998  load0 = LD(src - 4);
 999  load1 = LD(src + stride - 4);
 1000  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
 1001  src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
 1002 
 1003  load0 = LD(src + (2 * stride) - 4);
 1004  load1 = LD(src + (3 * stride) - 4);
 1005  src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
 1006  src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
 1007 
 1008  load0 = LD(src + (4 * stride) - 4);
 1009  load1 = LD(src + (5 * stride) - 4);
 1010  src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
 1011  src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
 1012 
 1013  load0 = LD(src + (6 * stride) - 4);
 1014  load1 = LD(src + (7 * stride) - 4);
 1015  src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
 1016  src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
 1017 
     /* In-register 8x8 transpose via interleaves + shifts. From the
      * arithmetic below the resulting column mapping is:
      * src6=p3, src0=p2, src1=p1, src2=p0, src3=q0, src4=q1, src5=q2,
      * src7=q3. */
 1018  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
 1019  src0, src1, src2, src3);
 1020 
 1021  ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
 1022  ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
 1023 
 1024  ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
 1025  ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
 1026  SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
 1027  8, src0, src2, src4, src7);
 1028 
     /* Filter-enable mask: |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta. */
 1029  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
 1030  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
 1031  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
 1032 
 1033  alpha = (v16u8) __msa_fill_b(alpha_in);
 1034  beta = (v16u8) __msa_fill_b(beta_in);
 1035 
 1036  is_less_than_alpha = (p0_asub_q0 < alpha);
 1037  is_less_than_beta = (p1_asub_p0 < beta);
 1038  is_less_than = is_less_than_alpha & is_less_than_beta;
 1039  is_less_than_beta = (q1_asub_q0 < beta);
 1040  is_less_than = is_less_than & is_less_than_beta;
 1041 
     /* Strong-filter gate: |p0-q0| < (alpha >> 2) + 2 (alpha reused). */
 1042  alpha >>= 2;
 1043  alpha += 2;
 1044 
 1045  is_less_than_alpha = (p0_asub_q0 < alpha);
 1046 
 1047  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
 1048  is_less_than_beta1 = (p2_asub_p0 < beta);
 1049  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
 1050  is_less_than_beta2 = (q2_asub_q0 < beta);
 1051 
     /* Widen all eight columns to 16 bit for the filter sums. */
 1052  ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
 1053  src0_r, src1_r, src2_r, src3_r);
 1054  ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
 1055  src4_r, src5_r, src6_r, src7_r);
 1056 
     /* Strong p side: p0'=(p2+2p1+2p0+2q0+q1+4)>>3,
      * p1'=(p2+p1+p0+q0+2)>>2, p2'=(2p3+3p2+p1+p0+q0+4)>>3;
      * weak fallback p0''=(2p1+p0+q1+2)>>2. */
 1057  dst2_x_r = src1_r + src2_r + src3_r;
 1058  dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
 1059  dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
 1060  dst1_r = src0_r + src1_r + src2_r + src3_r;
 1061  dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
 1062 
 1063  dst0_r = (2 * src6_r) + (3 * src0_r);
 1064  dst0_r += src1_r + src2_r + src3_r;
 1065  dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
 1066  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
 1067  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
 1068 
     /* Select strong vs. weak p0 by the |p2-p0|<beta mask. */
 1069  PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
 1070  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
 1071 
     /* Strong q side (mirror of the p side). */
 1072  dst3_x_r = src2_r + src3_r + src4_r;
 1073  dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
 1074  dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
 1075  dst4_r = src2_r + src3_r + src4_r + src5_r;
 1076  dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
 1077 
 1078  dst5_r = (2 * src7_r) + (3 * src5_r);
 1079  dst5_r += src4_r + src3_r + src2_r;
 1080  dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
 1081  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
 1082  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
 1083 
 1084  PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
 1085  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
 1086 
     /* Weak fallbacks recomputed for the strong-gate selection below. */
 1087  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
 1088  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
 1089  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
 1090  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
 1091 
 1092  PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
 1093 
     /* Cascade of selects: strong gate, then overall enable mask. */
 1094  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
 1095  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
 1096  dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
 1097  dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
 1098 
     /* p1/p2 and q1/q2 only change where both the strong gate and the
      * respective beta condition hold. */
 1099  is_less_than = is_less_than_alpha & is_less_than;
 1100  dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
 1101  is_less_than_beta1 = is_less_than_beta1 & is_less_than;
 1102  dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
 1103 
 1104  dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
 1105  dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
 1106  dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
 1107  is_less_than_beta2 = is_less_than_beta2 & is_less_than;
 1108  dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
 1109  dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
 1110  dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
 1111 
     /* Transpose the six result columns back to row order. */
 1112  ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
 1113  dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
 1114  ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
 1115  ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
 1116 
 1117  ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
 1118  SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
 1119  dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
 1120  dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
 1121  SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
 1122 
     /* Scatter 6 bytes per row: 4-byte store at src-3, 2-byte at src+1. */
 1123  out0 = __msa_copy_u_w((v4i32) dst0, 0);
 1124  out1 = __msa_copy_u_h((v8i16) dst0, 2);
 1125  out2 = __msa_copy_u_w((v4i32) dst1, 0);
 1126  out3 = __msa_copy_u_h((v8i16) dst1, 2);
 1127 
 1128  SW(out0, (src - 3));
 1129  SH(out1, (src + 1));
 1130  src += stride;
 1131  SW(out2, (src - 3));
 1132  SH(out3, (src + 1));
 1133  src += stride;
 1134 
 1135  out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
 1136  out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
 1137  out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
 1138  out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
 1139 
 1140  SW(out0, (src - 3));
 1141  SH(out1, (src + 1));
 1142  src += stride;
 1143  SW(out2, (src - 3));
 1144  SH(out3, (src + 1));
 1145  src += stride;
 1146 
 1147  out0 = __msa_copy_u_w((v4i32) dst4, 0);
 1148  out1 = __msa_copy_u_h((v8i16) dst4, 2);
 1149  out2 = __msa_copy_u_w((v4i32) dst5, 0);
 1150  out3 = __msa_copy_u_h((v8i16) dst5, 2);
 1151 
 1152  SW(out0, (src - 3));
 1153  SH(out1, (src + 1));
 1154  src += stride;
 1155  SW(out2, (src - 3));
 1156  SH(out3, (src + 1));
 1157  src += stride;
 1158 
 1159  out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
 1160  out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
 1161  out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
 1162  out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
 1163 
 1164  SW(out0, (src - 3));
 1165  SH(out1, (src + 1));
 1166  src += stride;
 1167  SW(out2, (src - 3));
 1168  SH(out3, (src + 1));
 1169 }
1170 
 1172  uint8_t alpha_in,
 1173  uint8_t beta_in,
 1174  uint32_t img_width)
 1175 {
     /* Intra chroma (Cb or Cr) deblock across a HORIZONTAL edge: only p0
      * and q0 are modified. NOTE(review): name/first-parameter line not
      * visible; presumably
      * avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data_cb_or_cr, ...). */
 1176  v16u8 alpha, beta;
 1177  v16u8 is_less_than;
 1178  v8i16 p0_or_q0, q0_or_p0;
 1179  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
 1180  v16i8 zero = { 0 };
 1181  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 1182  v16u8 is_less_than_alpha, is_less_than_beta;
 1183  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 1184 
 1185  alpha = (v16u8) __msa_fill_b(alpha_in);
 1186  beta = (v16u8) __msa_fill_b(beta_in);
 1187 
     /* Four rows straddling the edge: p1, p0 above, q0, q1 below. */
 1188  LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
 1189  p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
 1190 
 1191  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
 1192  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
 1193  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
 1194 
 1195  is_less_than_alpha = (p0_asub_q0 < alpha);
 1196  is_less_than_beta = (p1_asub_p0 < beta);
 1197  is_less_than = is_less_than_beta & is_less_than_alpha;
 1198  is_less_than_beta = (q1_asub_q0 < beta);
 1199  is_less_than = is_less_than_beta & is_less_than;
 1200 
     /* Keep only the low 8 mask lanes (high half zeroed) -- presumably
      * because the chroma edge is 8 pixels wide; confirm with callers. */
 1201  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 1202 
 1203  if (!__msa_test_bz_v(is_less_than)) {
 1204  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
 1205  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 1206  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
 1207  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
 1208  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
 1209 
     /* Merge filtered p0/q0 only where the mask is set, then store. */
 1210  p0_or_q0_org =
 1211  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
 1212  q0_or_p0_org =
 1213  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
 1214 
 1215  ST_UB(q0_or_p0_org, data_cb_or_cr);
 1216  ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
 1217  }
 1218 }
1219 
 1221  uint8_t alpha_in,
 1222  uint8_t beta_in,
 1223  uint32_t img_width)
 1224 {
     /* Intra chroma (Cb or Cr) deblock across a VERTICAL edge: p1..q1
      * are gathered by transposing 8 rows of 4 bytes; only p0 and q0 are
      * modified. NOTE(review): name/first-parameter line not visible;
      * presumably avc_loopfilter_cb_or_cr_intra_edge_ver_msa. */
 1225  v8i16 tmp1;
 1226  v16u8 alpha, beta, is_less_than;
 1227  v8i16 p0_or_q0, q0_or_p0;
 1228  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
 1229  v16i8 zero = { 0 };
 1230  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 1231  v16u8 is_less_than_alpha, is_less_than_beta;
 1232  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 1233 
 1234  {
 1235  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 1236 
     /* 8 rows, 2 pixels each side of the edge; transpose to get the
      * four column vectors p1, p0, q0, q1. */
 1237  LD_UB8((data_cb_or_cr - 2), img_width,
 1238  row0, row1, row2, row3, row4, row5, row6, row7);
 1239 
 1240  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
 1241  p1_or_q1_org, p0_or_q0_org,
 1242  q0_or_p0_org, q1_or_p1_org);
 1243  }
 1244 
 1245  alpha = (v16u8) __msa_fill_b(alpha_in);
 1246  beta = (v16u8) __msa_fill_b(beta_in);
 1247 
 1248  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
 1249  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
 1250  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
 1251 
     /* Enable mask: |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta,
      * restricted to the low 8 lanes (8 transposed rows). */
 1252  is_less_than_alpha = (p0_asub_q0 < alpha);
 1253  is_less_than_beta = (p1_asub_p0 < beta);
 1254  is_less_than = is_less_than_beta & is_less_than_alpha;
 1255  is_less_than_beta = (q1_asub_q0 < beta);
 1256  is_less_than = is_less_than_beta & is_less_than;
 1257  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 1258 
 1259  if (!__msa_test_bz_v(is_less_than)) {
 1260  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
 1261  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 1262 
 1263  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
 1264  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
 1265 
 1266  /* convert 16 bit output into 8 bit output */
 1267  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
 1268 
 1269  p0_or_q0_org =
 1270  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
 1271  q0_or_p0_org =
 1272  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
     /* Re-interleave the p0/q0 pair and store 2 bytes per row back at
      * data-1 (the two columns adjacent to the edge). */
 1273  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
 1274 
 1275  data_cb_or_cr -= 1;
 1276  ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
 1277  data_cb_or_cr += 4 * img_width;
 1278  ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
 1279  }
 1280 }
1281 
 1283  uint8_t bs0, uint8_t bs1,
 1284  uint8_t bs2, uint8_t bs3,
 1285  uint8_t tc0, uint8_t tc1,
 1286  uint8_t tc2, uint8_t tc3,
 1287  uint8_t alpha_in,
 1288  uint8_t beta_in,
 1289  uint32_t img_width)
 1290 {
     /* Normal (inter, bS<4) H.264 luma deblock across a VERTICAL edge.
      * bs0..bs3 / tc0..tc3 are the per-4-row boundary strengths and clip
      * thresholds. NOTE(review): name/first-parameter line not visible;
      * presumably avc_loopfilter_luma_inter_edge_ver_msa(data, ...). */
 1291  v16u8 tmp_vec, bs = { 0 };
 1292 
     /* Replicate each 4-row bS value into its 4-byte group of `bs`. */
 1293  tmp_vec = (v16u8) __msa_fill_b(bs0);
 1294  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
 1295  tmp_vec = (v16u8) __msa_fill_b(bs1);
 1296  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
 1297  tmp_vec = (v16u8) __msa_fill_b(bs2);
 1298  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
 1299  tmp_vec = (v16u8) __msa_fill_b(bs3);
 1300  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
 1301 
     /* All-zero bS => nothing to filter. */
 1302  if (!__msa_test_bz_v(bs)) {
 1303  uint8_t *src = data - 4;
 1304  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
 1305  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
 1306  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
 1307  v16u8 is_bs_greater_than0;
 1308  v16u8 tc = { 0 };
 1309  v16i8 zero = { 0 };
 1310 
     /* Same 4-byte-group replication for the tc clip values. */
 1311  tmp_vec = (v16u8) __msa_fill_b(tc0);
 1312  tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
 1313  tmp_vec = (v16u8) __msa_fill_b(tc1);
 1314  tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
 1315  tmp_vec = (v16u8) __msa_fill_b(tc2);
 1316  tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
 1317  tmp_vec = (v16u8) __msa_fill_b(tc3);
 1318  tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
 1319 
 1320  is_bs_greater_than0 = (zero < bs);
 1321 
 1322  {
 1323  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 1324  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
 1325 
     /* Gather 16 rows x 8 bytes around the edge and transpose into
      * the column vectors p3..q3. */
 1326  LD_UB8(src, img_width,
 1327  row0, row1, row2, row3, row4, row5, row6, row7);
 1328  src += (8 * img_width);
 1329  LD_UB8(src, img_width,
 1330  row8, row9, row10, row11, row12, row13, row14, row15);
 1331 
 1332  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
 1333  row8, row9, row10, row11,
 1334  row12, row13, row14, row15,
 1335  p3_org, p2_org, p1_org, p0_org,
 1336  q0_org, q1_org, q2_org, q3_org);
 1337  }
 1338 
     /* Enable mask: bS>0 && |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta. */
 1339  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 1340  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 1341  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 1342 
 1343  alpha = (v16u8) __msa_fill_b(alpha_in);
 1344  beta = (v16u8) __msa_fill_b(beta_in);
 1345 
 1346  is_less_than_alpha = (p0_asub_q0 < alpha);
 1347  is_less_than_beta = (p1_asub_p0 < beta);
 1348  is_less_than = is_less_than_beta & is_less_than_alpha;
 1349  is_less_than_beta = (q1_asub_q0 < beta);
 1350  is_less_than = is_less_than_beta & is_less_than;
 1351  is_less_than = is_less_than & is_bs_greater_than0;
 1352 
 1353  if (!__msa_test_bz_v(is_less_than)) {
 1354  v16i8 negate_tc, sign_negate_tc;
 1355  v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
 1356  v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
 1357  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 1358  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 1359  v8i16 p0_r, q0_r, p0_l, q0_l;
 1360 
     /* Sign-extend -tc to 16 bit for the clip bounds. */
 1361  negate_tc = zero - (v16i8) tc;
 1362  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
 1363 
 1364  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
 1365 
 1366  UNPCK_UB_SH(tc, tc_r, tc_l);
 1367  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
 1368  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
 1369  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
 1370 
     /* p1 update where additionally |p2-p0| < beta; each lane that
      * updates p1 also gets tc incremented by 1 (per the H.264 rule). */
 1371  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 1372  is_less_than_beta = (p2_asub_p0 < beta);
 1373  is_less_than_beta = is_less_than_beta & is_less_than;
 1374 
 1375  if (!__msa_test_bz_v(is_less_than_beta)) {
 1376  v16u8 p1;
 1377  v8i16 p1_r = { 0 };
 1378  v8i16 p1_l = { 0 };
 1379  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
 1380  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
 1381 
 1382  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
 1383  negate_tc_r, tc_r, p1_r);
 1384  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
 1385  i16_negatetc_l, tc_l, p1_l);
 1386 
 1387  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
 1388  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 1389 
 1390  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 1391  tc = tc + is_less_than_beta;
 1392  }
 1393 
     /* Mirrored q1 update where |q2-q0| < beta; also bumps tc. */
 1394  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 1395  is_less_than_beta = (q2_asub_q0 < beta);
 1396  is_less_than_beta = is_less_than_beta & is_less_than;
 1397 
 1398  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
 1399  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
 1400 
 1401  if (!__msa_test_bz_v(is_less_than_beta)) {
 1402  v16u8 q1;
 1403  v8i16 q1_r = { 0 };
 1404  v8i16 q1_l = { 0 };
 1405  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
 1406  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
 1407 
 1408  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
 1409  negate_tc_r, tc_r, q1_r);
 1410  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
 1411  i16_negatetc_l, tc_l, q1_l);
 1412 
 1413  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
 1414  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 1415 
 1416  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 1417  tc = tc + is_less_than_beta;
 1418  }
 1419 
 1420  {
 1421  v8i16 threshold_r, negate_thresh_r;
 1422  v8i16 threshold_l, negate_thresh_l;
 1423  v16i8 negate_thresh, sign_negate_thresh;
 1424 
     /* p0/q0 delta computed and clamped to [-tc, tc] with the
      * (possibly incremented) tc. */
 1425  negate_thresh = zero - (v16i8) tc;
 1426  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
 1427 
 1428  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
 1429  threshold_r, negate_thresh_r);
 1430 
 1431  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
 1432  negate_thresh_r, threshold_r, p0_r, q0_r);
 1433 
 1434  threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
 1435  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
 1436  negate_thresh);
 1437 
 1438  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
 1439  negate_thresh_l, threshold_l, p0_l, q0_l);
 1440  }
 1441 
 1442  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
 1443 
 1444  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
 1445  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
 1446 
 1447  {
 1448  v16i8 tp0, tp1, tp2, tp3;
 1449  v8i16 tmp2, tmp5;
 1450  v4i32 tmp3, tmp4, tmp6, tmp7;
 1451  uint32_t out0, out2;
 1452  uint16_t out1, out3;
 1453 
     /* Transpose the six modified columns (p2..q2) back and scatter
      * 6 bytes per row: 4-byte store at data-3 plus 2-byte store. */
 1454  src = data - 3;
 1455 
 1456  ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
 1457  ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
 1458  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
 1459 
 1460  ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
 1461  ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
 1462 
 1463  out0 = __msa_copy_u_w(tmp3, 0);
 1464  out1 = __msa_copy_u_h(tmp2, 0);
 1465  out2 = __msa_copy_u_w(tmp3, 1);
 1466  out3 = __msa_copy_u_h(tmp2, 1);
 1467 
 1468  SW(out0, src);
 1469  SH(out1, (src + 4));
 1470  src += img_width;
 1471  SW(out2, src);
 1472  SH(out3, (src + 4));
 1473 
 1474  out0 = __msa_copy_u_w(tmp3, 2);
 1475  out1 = __msa_copy_u_h(tmp2, 2);
 1476  out2 = __msa_copy_u_w(tmp3, 3);
 1477  out3 = __msa_copy_u_h(tmp2, 3);
 1478 
 1479  src += img_width;
 1480  SW(out0, src);
 1481  SH(out1, (src + 4));
 1482  src += img_width;
 1483  SW(out2, src);
 1484  SH(out3, (src + 4));
 1485 
 1486  out0 = __msa_copy_u_w(tmp4, 0);
 1487  out1 = __msa_copy_u_h(tmp2, 4);
 1488  out2 = __msa_copy_u_w(tmp4, 1);
 1489  out3 = __msa_copy_u_h(tmp2, 5);
 1490 
 1491  src += img_width;
 1492  SW(out0, src);
 1493  SH(out1, (src + 4));
 1494  src += img_width;
 1495  SW(out2, src);
 1496  SH(out3, (src + 4));
 1497 
 1498  out0 = __msa_copy_u_w(tmp4, 2);
 1499  out1 = __msa_copy_u_h(tmp2, 6);
 1500  out2 = __msa_copy_u_w(tmp4, 3);
 1501  out3 = __msa_copy_u_h(tmp2, 7);
 1502 
 1503  src += img_width;
 1504  SW(out0, src);
 1505  SH(out1, (src + 4));
 1506  src += img_width;
 1507  SW(out2, src);
 1508  SH(out3, (src + 4));
 1509 
 1510  out0 = __msa_copy_u_w(tmp6, 0);
 1511  out1 = __msa_copy_u_h(tmp5, 0);
 1512  out2 = __msa_copy_u_w(tmp6, 1);
 1513  out3 = __msa_copy_u_h(tmp5, 1);
 1514 
 1515  src += img_width;
 1516  SW(out0, src);
 1517  SH(out1, (src + 4));
 1518  src += img_width;
 1519  SW(out2, src);
 1520  SH(out3, (src + 4));
 1521 
 1522  out0 = __msa_copy_u_w(tmp6, 2);
 1523  out1 = __msa_copy_u_h(tmp5, 2);
 1524  out2 = __msa_copy_u_w(tmp6, 3);
 1525  out3 = __msa_copy_u_h(tmp5, 3);
 1526 
 1527  src += img_width;
 1528  SW(out0, src);
 1529  SH(out1, (src + 4));
 1530  src += img_width;
 1531  SW(out2, src);
 1532  SH(out3, (src + 4));
 1533 
 1534  out0 = __msa_copy_u_w(tmp7, 0);
 1535  out1 = __msa_copy_u_h(tmp5, 4);
 1536  out2 = __msa_copy_u_w(tmp7, 1);
 1537  out3 = __msa_copy_u_h(tmp5, 5);
 1538 
 1539  src += img_width;
 1540  SW(out0, src);
 1541  SH(out1, (src + 4));
 1542  src += img_width;
 1543  SW(out2, src);
 1544  SH(out3, (src + 4));
 1545 
 1546  out0 = __msa_copy_u_w(tmp7, 2);
 1547  out1 = __msa_copy_u_h(tmp5, 6);
 1548  out2 = __msa_copy_u_w(tmp7, 3);
 1549  out3 = __msa_copy_u_h(tmp5, 7);
 1550 
 1551  src += img_width;
 1552  SW(out0, src);
 1553  SH(out1, (src + 4));
 1554  src += img_width;
 1555  SW(out2, src);
 1556  SH(out3, (src + 4));
 1557  }
 1558  }
 1559  }
 1560 }
1561 
 1563  uint8_t bs0, uint8_t bs1,
 1564  uint8_t bs2, uint8_t bs3,
 1565  uint8_t tc0, uint8_t tc1,
 1566  uint8_t tc2, uint8_t tc3,
 1567  uint8_t alpha_in,
 1568  uint8_t beta_in,
 1569  uint32_t image_width)
 1570 {
     /* Normal (inter, bS<4) H.264 luma deblock across a HORIZONTAL edge:
      * rows are loaded/stored directly, no transpose needed.
      * NOTE(review): name/first-parameter line not visible; presumably
      * avc_loopfilter_luma_inter_edge_hor_msa(data, ...). */
 1571  v16u8 tmp_vec;
 1572  v16u8 bs = { 0 };
 1573 
     /* Replicate each 4-column bS value into its 4-byte group. */
 1574  tmp_vec = (v16u8) __msa_fill_b(bs0);
 1575  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
 1576  tmp_vec = (v16u8) __msa_fill_b(bs1);
 1577  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
 1578  tmp_vec = (v16u8) __msa_fill_b(bs2);
 1579  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
 1580  tmp_vec = (v16u8) __msa_fill_b(bs3);
 1581  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
 1582 
     /* All-zero bS => nothing to filter. */
 1583  if (!__msa_test_bz_v(bs)) {
 1584  v16u8 alpha, beta, is_less_than, is_less_than_beta;
 1585  v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
 1586  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 1587  v16u8 is_less_than_alpha, is_bs_greater_than0;
 1588  v8i16 p0_r, q0_r, p0_l, q0_l;
 1589  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 1590  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 1591  v16i8 zero = { 0 };
 1592  v16i8 tc = { 0 };
 1593 
     /* Same 4-byte-group replication for the tc clip values. */
 1594  tmp_vec = (v16u8) __msa_fill_b(tc0);
 1595  tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
 1596  tmp_vec = (v16u8) __msa_fill_b(tc1);
 1597  tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
 1598  tmp_vec = (v16u8) __msa_fill_b(tc2);
 1599  tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
 1600  tmp_vec = (v16u8) __msa_fill_b(tc3);
 1601  tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
 1602 
 1603  alpha = (v16u8) __msa_fill_b(alpha_in);
 1604  beta = (v16u8) __msa_fill_b(beta_in);
 1605 
     /* Rows p2..q1 straddling the edge; q2 is loaded lazily below. */
 1606  LD_UB5(data - (3 * image_width), image_width,
 1607  p2_org, p1_org, p0_org, q0_org, q1_org);
 1608 
     /* Enable mask: bS>0 && |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta. */
 1609  is_bs_greater_than0 = ((v16u8) zero < bs);
 1610  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 1611  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 1612  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 1613 
 1614  is_less_than_alpha = (p0_asub_q0 < alpha);
 1615  is_less_than_beta = (p1_asub_p0 < beta);
 1616  is_less_than = is_less_than_beta & is_less_than_alpha;
 1617  is_less_than_beta = (q1_asub_q0 < beta);
 1618  is_less_than = is_less_than_beta & is_less_than;
 1619  is_less_than = is_less_than & is_bs_greater_than0;
 1620 
 1621  if (!__msa_test_bz_v(is_less_than)) {
 1622  v16i8 sign_negate_tc, negate_tc;
 1623  v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
 1624  v16u8 p2_asub_p0, q2_asub_q0;
 1625 
     /* q2 only needed once we know some lane filters. */
 1626  q2_org = LD_UB(data + (2 * image_width));
 1627  negate_tc = zero - tc;
 1628  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
 1629 
 1630  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
 1631 
 1632  UNPCK_UB_SH(tc, tc_r, tc_l);
 1633  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
 1634  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
 1635  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
 1636 
     /* p1 update where additionally |p2-p0| < beta; each updating lane
      * gets tc incremented by 1 (H.264 rule), and p1 is stored here. */
 1637  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 1638  is_less_than_beta = (p2_asub_p0 < beta);
 1639  is_less_than_beta = is_less_than_beta & is_less_than;
 1640 
 1641  if (!__msa_test_bz_v(is_less_than_beta)) {
 1642  v16u8 p1;
 1643  v8i16 p1_r = { 0 };
 1644  v8i16 p1_l = { 0 };
 1645  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
 1646  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
 1647 
 1648  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
 1649  negate_tc_r, tc_r, p1_r);
 1650  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
 1651  i16_negatetc_l, tc_l, p1_l);
 1652 
 1653  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
 1654  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 1655  ST_UB(p1_org, data - (2 * image_width));
 1656 
 1657  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 1658  tc = tc + (v16i8) is_less_than_beta;
 1659  }
 1660 
     /* Mirrored q1 update where |q2-q0| < beta; also bumps tc. */
 1661  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 1662  is_less_than_beta = (q2_asub_q0 < beta);
 1663  is_less_than_beta = is_less_than_beta & is_less_than;
 1664 
 1665  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
 1666  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
 1667 
 1668  if (!__msa_test_bz_v(is_less_than_beta)) {
 1669  v16u8 q1;
 1670  v8i16 q1_r = { 0 };
 1671  v8i16 q1_l = { 0 };
 1672  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
 1673  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
 1674 
 1675  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
 1676  negate_tc_r, tc_r, q1_r);
 1677  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
 1678  i16_negatetc_l, tc_l, q1_l);
 1679 
 1680  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
 1681  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 1682  ST_UB(q1_org, data + image_width);
 1683 
 1684  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 1685  tc = tc + (v16i8) is_less_than_beta;
 1686  }
 1687  {
 1688  v16i8 negate_thresh, sign_negate_thresh;
 1689  v8i16 threshold_r, threshold_l;
 1690  v8i16 negate_thresh_l, negate_thresh_r;
 1691 
     /* p0/q0 delta clamped to [-tc, tc] with the (possibly
      * incremented) tc. */
 1692  negate_thresh = zero - tc;
 1693  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
 1694 
 1695  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
 1696  threshold_r, negate_thresh_r);
 1697  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
 1698  negate_thresh_r, threshold_r, p0_r, q0_r);
 1699 
 1700  threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
 1701  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
 1702  negate_thresh);
 1703  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
 1704  negate_thresh_l, threshold_l, p0_l, q0_l);
 1705  }
 1706 
 1707  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
 1708 
     /* Merge filtered p0/q0 only in enabled lanes and store the rows. */
 1709  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
 1710  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
 1711 
 1712  ST_UB(p0_org, (data - image_width));
 1713  ST_UB(q0_org, data);
 1714  }
 1715  }
 1716 }
1717 
1719  int32_t alpha_in, int32_t beta_in,
1720  int8_t *tc0)
1721 {
1722  uint8_t *data = in;
1723  uint32_t out0, out1, out2, out3;
1724  uint64_t load;
1725  uint32_t tc_val;
1726  v16u8 alpha, beta;
1727  v16i8 inp0 = { 0 };
1728  v16i8 inp1 = { 0 };
1729  v16i8 inp2 = { 0 };
1730  v16i8 inp3 = { 0 };
1731  v16i8 inp4 = { 0 };
1732  v16i8 inp5 = { 0 };
1733  v16i8 inp6 = { 0 };
1734  v16i8 inp7 = { 0 };
1735  v16i8 src0, src1, src2, src3;
1736  v8i16 src4, src5, src6, src7;
1737  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1738  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1739  v16u8 is_less_than_beta1, is_less_than_beta2;
1740  v8i16 tc, tc_orig_r, tc_plus1;
1741  v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1742  v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1743  v8i16 src2_r, src3_r;
1744  v8i16 p2_r, p1_r, q2_r, q1_r;
1745  v16u8 p2, q2, p0, q0;
1746  v4i32 dst0, dst1;
1747  v16i8 zeros = { 0 };
1748 
1749  alpha = (v16u8) __msa_fill_b(alpha_in);
1750  beta = (v16u8) __msa_fill_b(beta_in);
1751 
1752  if (tc0[0] < 0) {
1753  data += (2 * stride);
1754  } else {
1755  load = LD(data - 3);
1756  inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1757  load = LD(data - 3 + stride);
1758  inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1759  data += (2 * stride);
1760  }
1761 
1762  if (tc0[1] < 0) {
1763  data += (2 * stride);
1764  } else {
1765  load = LD(data - 3);
1766  inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1767  load = LD(data - 3 + stride);
1768  inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1769  data += (2 * stride);
1770  }
1771 
1772  if (tc0[2] < 0) {
1773  data += (2 * stride);
1774  } else {
1775  load = LD(data - 3);
1776  inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1777  load = LD(data - 3 + stride);
1778  inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1779  data += (2 * stride);
1780  }
1781 
1782  if (tc0[3] < 0) {
1783  data += (2 * stride);
1784  } else {
1785  load = LD(data - 3);
1786  inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1787  load = LD(data - 3 + stride);
1788  inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1789  data += (2 * stride);
1790  }
1791 
1792  ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1793  src0, src1, src2, src3);
1794 
1795  ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1796  ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1797 
1798  src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1799  src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1800  src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1801  src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1802  src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1803  src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1804 
1805  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1806  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1807  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1808  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1809  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1810 
1811  is_less_than_alpha = (p0_asub_q0 < alpha);
1812  is_less_than_beta = (p1_asub_p0 < beta);
1813  is_less_than = is_less_than_alpha & is_less_than_beta;
1814  is_less_than_beta = (q1_asub_q0 < beta);
1815  is_less_than = is_less_than_beta & is_less_than;
1816 
1817  is_less_than_beta1 = (p2_asub_p0 < beta);
1818  is_less_than_beta2 = (q2_asub_q0 < beta);
1819 
1820  p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1821  p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1822  p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1823 
1824  ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1825  p2_r += p0_add_q0;
1826  p2_r >>= 1;
1827  p2_r -= p1_r;
1828  ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1829  q2_r += p0_add_q0;
1830  q2_r >>= 1;
1831  q2_r -= q1_r;
1832 
1833  tc_val = LW(tc0);
1834  tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1835  tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1836  is_tc_orig1 = tc_orig;
1837  is_tc_orig2 = tc_orig;
1838  tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1839  tc = tc_orig_r;
1840 
1841  CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1842  CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1843 
1844  p2_r += p1_r;
1845  q2_r += q1_r;
1846 
1847  PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1848 
1849  is_tc_orig1 = (zeros < is_tc_orig1);
1850  is_tc_orig2 = is_tc_orig1;
1851  is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1852  is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1853  is_tc_orig1 = is_less_than & is_tc_orig1;
1854  is_tc_orig2 = is_less_than & is_tc_orig2;
1855 
1856  p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1857  q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1858 
1859  q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1860  q0_sub_p0 <<= 2;
1861  p1_sub_q1 = p1_r - q1_r;
1862  q0_sub_p0 += p1_sub_q1;
1863  q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1864 
1865  tc_plus1 = tc + 1;
1866  is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1867  (v16i8) is_less_than_beta1);
1868  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1869  tc_plus1 = tc + 1;
1870  is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1871  (v16i8) is_less_than_beta2);
1872  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1873 
1874  CLIP_SH(q0_sub_p0, -tc, tc);
1875 
1876  ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1877  src2_r += q0_sub_p0;
1878  src3_r -= q0_sub_p0;
1879 
1880  CLIP_SH2_0_255(src2_r, src3_r);
1881 
1882  PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1883 
1884  p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1885  q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1886 
1887  ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1888 
1889  ILVRL_H2_SW(q2, p2, dst0, dst1);
1890 
1891  data = in;
1892 
1893  out0 = __msa_copy_u_w(dst0, 0);
1894  out1 = __msa_copy_u_w(dst0, 1);
1895  out2 = __msa_copy_u_w(dst0, 2);
1896  out3 = __msa_copy_u_w(dst0, 3);
1897 
1898  if (tc0[0] < 0) {
1899  data += (2 * stride);
1900  } else {
1901  SW(out0, (data - 2));
1902  data += stride;
1903  SW(out1, (data - 2));
1904  data += stride;
1905  }
1906 
1907  if (tc0[1] < 0) {
1908  data += (2 * stride);
1909  } else {
1910  SW(out2, (data - 2));
1911  data += stride;
1912  SW(out3, (data - 2));
1913  data += stride;
1914  }
1915 
1916  out0 = __msa_copy_u_w(dst1, 0);
1917  out1 = __msa_copy_u_w(dst1, 1);
1918  out2 = __msa_copy_u_w(dst1, 2);
1919  out3 = __msa_copy_u_w(dst1, 3);
1920 
1921  if (tc0[2] < 0) {
1922  data += (2 * stride);
1923  } else {
1924  SW(out0, (data - 2));
1925  data += stride;
1926  SW(out1, (data - 2));
1927  data += stride;
1928  }
1929 
1930  if (tc0[3] >= 0) {
1931  SW(out2, (data - 2));
1932  data += stride;
1933  SW(out3, (data - 2));
1934  }
1935 }
1936 
1938  uint8_t bs0, uint8_t bs1,
1939  uint8_t bs2, uint8_t bs3,
1940  uint8_t tc0, uint8_t tc1,
1941  uint8_t tc2, uint8_t tc3,
1942  uint8_t alpha_in,
1943  uint8_t beta_in,
1944  uint32_t img_width)
1945 {
1946  v16u8 alpha, beta;
1947  v8i16 tmp_vec;
1948  v8i16 bs = { 0 };
1949  v8i16 tc = { 0 };
1950  v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1951  v16u8 is_less_than;
1952  v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1953  v8i16 p0_r, q0_r;
1954  v16u8 p1_org, p0_org, q0_org, q1_org;
1955  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1956  v16i8 negate_tc, sign_negate_tc;
1957  v8i16 tc_r, negate_tc_r;
1958  v16i8 zero = { 0 };
1959 
1960  tmp_vec = (v8i16) __msa_fill_b(bs0);
1961  bs = __msa_insve_h(bs, 0, tmp_vec);
1962  tmp_vec = (v8i16) __msa_fill_b(bs1);
1963  bs = __msa_insve_h(bs, 1, tmp_vec);
1964  tmp_vec = (v8i16) __msa_fill_b(bs2);
1965  bs = __msa_insve_h(bs, 2, tmp_vec);
1966  tmp_vec = (v8i16) __msa_fill_b(bs3);
1967  bs = __msa_insve_h(bs, 3, tmp_vec);
1968 
1969  if (!__msa_test_bz_v((v16u8) bs)) {
1970  tmp_vec = (v8i16) __msa_fill_b(tc0);
1971  tc = __msa_insve_h(tc, 0, tmp_vec);
1972  tmp_vec = (v8i16) __msa_fill_b(tc1);
1973  tc = __msa_insve_h(tc, 1, tmp_vec);
1974  tmp_vec = (v8i16) __msa_fill_b(tc2);
1975  tc = __msa_insve_h(tc, 2, tmp_vec);
1976  tmp_vec = (v8i16) __msa_fill_b(tc3);
1977  tc = __msa_insve_h(tc, 3, tmp_vec);
1978 
1979  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1980 
1981  alpha = (v16u8) __msa_fill_b(alpha_in);
1982  beta = (v16u8) __msa_fill_b(beta_in);
1983 
1984  LD_UB4(data - (img_width << 1), img_width,
1985  p1_org, p0_org, q0_org, q1_org);
1986 
1987  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1988  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1989  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1990 
1991  is_less_than_alpha = (p0_asub_q0 < alpha);
1992  is_less_than_beta = (p1_asub_p0 < beta);
1993  is_less_than = is_less_than_beta & is_less_than_alpha;
1994  is_less_than_beta = (q1_asub_q0 < beta);
1995  is_less_than = is_less_than_beta & is_less_than;
1996  is_less_than = is_less_than & is_bs_greater_than0;
1997 
1998  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1999 
2000  if (!__msa_test_bz_v(is_less_than)) {
2001  negate_tc = zero - (v16i8) tc;
2002  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2003 
2004  ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2005 
2006  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2007  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2008 
2009  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2010  tc_r, p0_r, q0_r);
2011 
2012  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2013 
2014  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2015  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2016 
2017  ST_UB(q0_org, data);
2018  ST_UB(p0_org, (data - img_width));
2019  }
2020  }
2021 }
2022 
2024  uint8_t bs0, uint8_t bs1,
2025  uint8_t bs2, uint8_t bs3,
2026  uint8_t tc0, uint8_t tc1,
2027  uint8_t tc2, uint8_t tc3,
2028  uint8_t alpha_in,
2029  uint8_t beta_in,
2030  uint32_t img_width)
2031 {
2032  uint8_t *src;
2033  v16u8 alpha, beta;
2034  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2035  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2036  v16u8 p0, q0;
2037  v8i16 p0_r = { 0 };
2038  v8i16 q0_r = { 0 };
2039  v16u8 p1_org, p0_org, q0_org, q1_org;
2040  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2041  v16u8 is_bs_greater_than0;
2042  v8i16 tc_r, negate_tc_r;
2043  v16i8 negate_tc, sign_negate_tc;
2044  v16i8 zero = { 0 };
2045  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2046  v8i16 tmp1, tmp_vec, bs = { 0 };
2047  v8i16 tc = { 0 };
2048 
2049  tmp_vec = (v8i16) __msa_fill_b(bs0);
2050  bs = __msa_insve_h(bs, 0, tmp_vec);
2051  tmp_vec = (v8i16) __msa_fill_b(bs1);
2052  bs = __msa_insve_h(bs, 1, tmp_vec);
2053  tmp_vec = (v8i16) __msa_fill_b(bs2);
2054  bs = __msa_insve_h(bs, 2, tmp_vec);
2055  tmp_vec = (v8i16) __msa_fill_b(bs3);
2056  bs = __msa_insve_h(bs, 3, tmp_vec);
2057 
2058  if (!__msa_test_bz_v((v16u8) bs)) {
2059  tmp_vec = (v8i16) __msa_fill_b(tc0);
2060  tc = __msa_insve_h(tc, 0, tmp_vec);
2061  tmp_vec = (v8i16) __msa_fill_b(tc1);
2062  tc = __msa_insve_h(tc, 1, tmp_vec);
2063  tmp_vec = (v8i16) __msa_fill_b(tc2);
2064  tc = __msa_insve_h(tc, 2, tmp_vec);
2065  tmp_vec = (v8i16) __msa_fill_b(tc3);
2066  tc = __msa_insve_h(tc, 3, tmp_vec);
2067 
2068  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2069 
2070  LD_UB8((data - 2), img_width,
2071  row0, row1, row2, row3, row4, row5, row6, row7);
2072 
2073  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2074  row4, row5, row6, row7,
2075  p1_org, p0_org, q0_org, q1_org);
2076 
2077  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2078  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2079  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2080 
2081  alpha = (v16u8) __msa_fill_b(alpha_in);
2082  beta = (v16u8) __msa_fill_b(beta_in);
2083 
2084  is_less_than_alpha = (p0_asub_q0 < alpha);
2085  is_less_than_beta = (p1_asub_p0 < beta);
2086  is_less_than = is_less_than_beta & is_less_than_alpha;
2087  is_less_than_beta = (q1_asub_q0 < beta);
2088  is_less_than = is_less_than_beta & is_less_than;
2089  is_less_than = is_bs_greater_than0 & is_less_than;
2090 
2091  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2092 
2093  if (!__msa_test_bz_v(is_less_than)) {
2094  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2095  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2096 
2097  negate_tc = zero - (v16i8) tc;
2098  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2099 
2100  ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2101 
2102  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2103  tc_r, p0_r, q0_r);
2104 
2105  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2106 
2107  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2108  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2109  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2110  src = data - 1;
2111  ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
2112  src += 4 * img_width;
2113  ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
2114  }
2115  }
2116 }
2117 
2119  int32_t alpha_in, int32_t beta_in,
2120  int8_t *tc0)
2121 {
2122  int32_t col, tc_val;
2123  v16u8 alpha, beta, res;
2124 
2125  alpha = (v16u8) __msa_fill_b(alpha_in);
2126  beta = (v16u8) __msa_fill_b(beta_in);
2127 
2128  for (col = 0; col < 4; col++) {
2129  tc_val = (tc0[col] - 1) + 1;
2130 
2131  if (tc_val <= 0) {
2132  src += (4 * stride);
2133  continue;
2134  }
2135 
2136  AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2137  ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2138  src += (4 * stride);
2139  }
2140 }
2141 
2143  int32_t alpha_in,
2144  int32_t beta_in,
2145  int8_t *tc0)
2146 {
2147  int32_t col, tc_val;
2148  int16_t out0, out1;
2149  v16u8 alpha, beta, res;
2150 
2151  alpha = (v16u8) __msa_fill_b(alpha_in);
2152  beta = (v16u8) __msa_fill_b(beta_in);
2153 
2154  for (col = 0; col < 4; col++) {
2155  tc_val = (tc0[col] - 1) + 1;
2156 
2157  if (tc_val <= 0) {
2158  src += 4 * stride;
2159  continue;
2160  }
2161 
2162  AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2163 
2164  out0 = __msa_copy_s_h((v8i16) res, 0);
2165  out1 = __msa_copy_s_h((v8i16) res, 1);
2166 
2167  SH(out0, (src - 1));
2168  src += stride;
2169  SH(out1, (src - 1));
2170  src += stride;
2171  }
2172 }
2173 
/* Public entry: vertical-edge luma inter deblock. A negative tc entry marks
 * a group with boundary strength 0 (skip); bs is 1 otherwise. */
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}
2195 
/* Public entry: horizontal-edge luma inter deblock (see h variant). */
void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                           tc[0], tc[1], tc[2], tc[3],
                                           alpha, beta, img_width);
}
2218 
/* Public entry: vertical-edge chroma inter deblock. */
void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}
2240 
/* Public entry: horizontal-edge chroma inter deblock. */
void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
                                    int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
                                               tc[0], tc[1], tc[2], tc[3],
                                               alpha, beta, img_width);
}
2262 
/* Public entry: vertical-edge luma intra (strong) deblock. */
void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}
2270 
/* Public entry: horizontal-edge luma intra (strong) deblock. */
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
                                  int alpha, int beta)
{
    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
                                           (uint8_t) beta,
                                           (unsigned int) img_width);
}
2278 
/* Public entry: vertical-edge chroma intra deblock. */
void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}
2286 
/* Public entry: horizontal-edge chroma intra deblock. */
void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
                                    int alpha, int beta)
{
    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
                                               (uint8_t) beta,
                                               (unsigned int) img_width);
}
2294 
/* Public entry: horizontal chroma 4:2:2 deblock (thin forwarder). */
void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
                                         int32_t ystride,
                                         int32_t alpha, int32_t beta,
                                         int8_t *tc0)
{
    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
}
2302 
/* Public entry: horizontal chroma 4:2:2 MBAFF deblock (thin forwarder). */
void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
                                               int32_t ystride,
                                               int32_t alpha,
                                               int32_t beta,
                                               int8_t *tc0)
{
    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
}
2311 
/* Public entry: horizontal luma MBAFF inter deblock (thin forwarder). */
void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
                                          int32_t ystride,
                                          int32_t alpha,
                                          int32_t beta,
                                          int8_t *tc0)
{
    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
}
2320 
/* Public entry: horizontal luma MBAFF intra deblock (thin forwarder). */
void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                int32_t ystride,
                                                int32_t alpha,
                                                int32_t beta)
{
    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
}
2328 
2330  int height, int log2_denom,
2331  int weight_src, int offset_in)
2332 {
2333  uint32_t offset_val;
2334  v16i8 zero = { 0 };
2335  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2336  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2337  v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2338  v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2339  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2340  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2341  v8i16 wgt, denom, offset;
2342 
2343  offset_val = (unsigned) offset_in << log2_denom;
2344 
2345  wgt = __msa_fill_h(weight_src);
2346  offset = __msa_fill_h(offset_val);
2347  denom = __msa_fill_h(log2_denom);
2348 
2349  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2350  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2351  src2_r, src3_r);
2352  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2353  src2_l, src3_l);
2354  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2355  src6_r, src7_r);
2356  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2357  src6_l, src7_l);
2358  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2359  tmp3);
2360  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2361  tmp7);
2362  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2363  tmp11);
2364  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2365  tmp14, tmp15);
2366  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2367  tmp1, tmp2, tmp3);
2368  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2369  tmp5, tmp6, tmp7);
2370  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2371  tmp9, tmp10, tmp11);
2372  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2373  tmp12, tmp13, tmp14, tmp15);
2374  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2375  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2376  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2377  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2378  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2379  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2380  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2381  dst2, dst3);
2382  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2383  dst5, dst6, dst7);
2384  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2385  src += 8 * stride;
2386 
2387  if (16 == height) {
2388  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2389  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2390  src1_r, src2_r, src3_r);
2391  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2392  src1_l, src2_l, src3_l);
2393  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2394  src5_r, src6_r, src7_r);
2395  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2396  src5_l, src6_l, src7_l);
2397  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2398  tmp2, tmp3);
2399  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2400  tmp6, tmp7);
2401  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2402  tmp10, tmp11);
2403  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2404  tmp14, tmp15);
2405  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2406  tmp0, tmp1, tmp2, tmp3);
2407  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2408  tmp4, tmp5, tmp6, tmp7);
2409  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2410  tmp8, tmp9, tmp10, tmp11);
2411  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2412  tmp12, tmp13, tmp14, tmp15);
2413  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2414  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2415  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2416  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2417  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2418  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2419  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2420  dst2, dst3);
2421  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2422  dst5, dst6, dst7);
2423  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2424  }
2425 }
2426 
/* Weighted prediction, 8-wide block: dispatch by height (4, 8 or 16). */
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
    }
}
2439 
/* Weighted prediction, 4-wide block: dispatch by height (2, 4 or 8). */
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
                                  int height, int log2_denom,
                                  int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
    }
}
2452 
2454  ptrdiff_t stride, int height,
2455  int log2_denom, int weight_dst,
2456  int weight_src, int offset_in)
2457 {
2458  v16i8 src_wgt, dst_wgt, wgt;
2459  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2460  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2461  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2462  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2463  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2464  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2465  v8i16 denom, offset;
2466 
2467  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2468  offset_in += (128 * (weight_src + weight_dst));
2469 
2470  src_wgt = __msa_fill_b(weight_src);
2471  dst_wgt = __msa_fill_b(weight_dst);
2472  offset = __msa_fill_h(offset_in);
2473  denom = __msa_fill_h(log2_denom + 1);
2474 
2475  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2476 
2477  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2478  src += 8 * stride;
2479  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2480  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2481  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2482  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2483  vec6);
2484  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2485  vec7);
2486  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2487  vec12, vec14);
2488  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2489  vec13, vec15);
2490  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2491  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2492  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2493  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2494  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2495  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2496  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2497  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2498  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2499  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2500  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2501  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2502  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2503  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2504  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2505  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2506  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2507  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2508  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2509  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2510  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2511  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2512  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2513  dst2, dst3);
2514  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2515  dst5, dst6, dst7);
2516  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2517  dst += 8 * stride;
2518 
2519  if (16 == height) {
2520  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2521  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2522  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2523  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2524  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2525  vec4, vec6);
2526  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2527  vec5, vec7);
2528  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2529  vec12, vec14);
2530  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2531  vec13, vec15);
2532  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2533  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2534  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2535  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2536  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2537  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2538  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2539  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2540  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2541  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2542  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2543  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2544  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2545  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2546  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2547  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2548  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2549  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2550  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2551  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2552  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2553  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2554  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2555  dst2, dst3);
2556  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2557  dst5, dst6, dst7);
2558  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2559  }
2560 }
2561 
/* Bi-weighted prediction, 8-wide block: dispatch by height (4, 8 or 16). */
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (8 == height) {
        avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    }
}
2578 
/* Bi-weighted prediction, 4-wide block: dispatch by height (2, 4 or 8). */
void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (4 == height) {
        avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    }
}
#define MAXI_SH2_SH(...)
void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2312
static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:1220
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
#define XORI_B8_128_UB(...)
static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:2023
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,p1_or_q1_org_in, p2_or_q2_org_in,negate_tc_in, tc_in, p1_or_q1_out)
Definition: h264dsp_msa.c:521
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:100
#define ILVRL_B2_SH(...)
static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:2142
static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:1718
#define PCKEV_B2_SH(...)
static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:1171
#define LW(psrc)
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_msa.c:2440
#define MUL2(in0, in1, in2, in3, out0, out1)
#define tc
Definition: regdef.h:69
#define ILVRL_H2_SW(...)
#define PCKEV_B3_UB(...)
void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2219
static const uint8_t q1[256]
Definition: twofish.c:96
#define LD_UB4(...)
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)
Definition: h264dsp_msa.c:624
#define src
Definition: vp8dsp.c:254
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define ILVL_H2_SH(...)
static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:181
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9, in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7)
static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:226
#define SRA_4V(in0, in1, in2, in3, shift)
void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2263
#define XORI_B4_128_UB(...)
static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:77
uint8_t
#define SAT_UH8_SH(...)
#define LD4(psrc, stride, out0, out1, out2, out3)
static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:139
#define UNPCK_UB_SH(in, out0, out1)
void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2241
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset_in)
Definition: h264dsp_msa.c:2329
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2271
static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, uint32_t image_width)
Definition: h264dsp_msa.c:1562
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define XORI_B2_128_UB(...)
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,p1_or_q1_org_in, p0_or_q0_out)
Definition: h264dsp_msa.c:512
#define height
static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:812
#define ILVRL_H2_SH(...)
#define CLIP_SH8_0_255(in0, in1, in2, in3,in4, in5, in6, in7)
#define LD_UB5(...)
#define CLIP_SH2_0_255(in0, in1)
static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:679
void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2279
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define INSERT_W2_UB(...)
static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:24
#define zero
Definition: regdef.h:64
#define LW2(psrc, stride, out0, out1)
void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2295
#define ADDS_SH2_SH(...)
#define ILVR_B2_SH(...)
#define ILVR_W2_SB(...)
static const uint8_t q0[256]
Definition: twofish.c:77
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define TRANSPOSE8x4_UB_UB(...)
static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:1937
#define SLDI_B2_UB(...)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:293
#define LD_UB8(...)
static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data, uint8_t bs0, uint8_t bs1, uint8_t bs2, uint8_t bs3, uint8_t tc0, uint8_t tc1, uint8_t tc2, uint8_t tc3, uint8_t alpha_in, uint8_t beta_in, uint32_t img_width)
Definition: h264dsp_msa.c:1282
void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_msa.c:2579
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
int32_t
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,p1_or_q1_org_in, q1_or_p1_org_in,negate_threshold_in, threshold_in,p0_or_q0_out, q0_or_p0_out)
Definition: h264dsp_msa.c:536
#define ILVR_B4_UH(...)
#define ILVL_B4_SH(...)
#define ST_UB(...)
void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t ystride, int32_t alpha, int32_t beta)
Definition: h264dsp_msa.c:2321
#define PCKEV_B4_UB(...)
#define SRLR_H8_SH(...)
#define INSERT_W4_UB(...)
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_msa.c:2562
#define ILVL_B2_SH(...)
#define ST_UB8(...)
#define SAT_UH2_SH(...)
#define src1
Definition: h264pred.c:139
#define SAT_UH4_SH(...)
#define ILVL_B4_SB(...)
void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width, int alpha, int beta)
Definition: h264dsp_msa.c:2287
static const int16_t alpha[]
Definition: ilbcdata.h:55
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,q3_or_p3_org_in, p1_or_q1_org_in,p2_or_q2_org_in, q1_or_p1_org_in,p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)
Definition: h264dsp_msa.c:485
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
#define MAXI_SH8_SH(...)
#define SRLR_H4_SH(...)
#define ILVR_B4_SH(...)
#define CLIP_SH(in, min, max)
static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:51
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:259
#define src0
Definition: h264pred.c:138
#define LD(psrc)
#define SH(val, pdst)
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:371
static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, int32_t alpha_in, int32_t beta_in)
Definition: h264dsp_msa.c:972
#define SW(val, pdst)
void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2196
void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width, int alpha, int beta, int8_t *tc)
Definition: h264dsp_msa.c:2174
#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)
Definition: h264dsp_msa.c:558
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_msa.c:2427
#define ILVR_W2_UB(...)
#define ILVL_W2_SB(...)
#define INSERT_D2_UB(...)
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:104
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ILVRL_B2_SB(...)
#define ILVR_H2_SH(...)
void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t ystride, int32_t alpha, int32_t beta, int8_t *tc0)
Definition: h264dsp_msa.c:2303
#define LD_UB(...)
#define MAXI_SH4_SH(...)
static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride, int32_t alpha_in, int32_t beta_in, int8_t *tc0)
Definition: h264dsp_msa.c:2118
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:333
#define ILVR_B4_SB(...)
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset_in)
Definition: h264dsp_msa.c:2453
#define stride
#define ST_W2(in, idx0, idx1, pdst, stride)
#define PCKEV_B2_UB(...)
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t dst_weight, int32_t offset_in)
Definition: h264dsp_msa.c:422
#define ILVR_B2_UB(...)
#define ADDS_SH4_SH(...)
static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_msa.c:108
#define SLDI_B4_SB(...)