h264dsp_msa.c
1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
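/* The avc_wgt_WxH_msa() functions below implement H.264 unidirectional
 * weighted prediction on a W x H block.  Each pixel is effectively
 * replaced by
 *     clip_uint8(((pixel * src_weight + round) >> log2_denom) + offset_in);
 * the offset is pre-scaled by << log2_denom so that a single rounding
 * shift (srlr) handles both terms, and the result is clamped to
 * [0, 255] before being packed back to bytes. */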
24 static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25  int32_t log2_denom, int32_t src_weight,
26  int32_t offset_in)
27 {
28  uint32_t tp0, tp1, offset_val;
29  v16u8 zero = { 0 };
30  v16u8 src0 = { 0 };
31  v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33  offset_val = (unsigned) offset_in << log2_denom;
34 
35  wgt = __msa_fill_h(src_weight);
36  offset = __msa_fill_h(offset_val);
37  denom = __msa_fill_h(log2_denom);
38 
39  LW2(data, stride, tp0, tp1);
40  INSERT_W2_UB(tp0, tp1, src0);
41  src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42  tmp0 = wgt * src0_r;
43  tmp0 = __msa_adds_s_h(tmp0, offset);
44  tmp0 = __msa_maxi_s_h(tmp0, 0);
45  tmp0 = __msa_srlr_h(tmp0, denom);
46  tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47  src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48  ST_W2(src0, 0, 1, data, stride);
49 }
50 
51 static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52  int32_t log2_denom, int32_t src_weight,
53  int32_t offset_in)
54 {
55  uint32_t tp0, tp1, tp2, tp3, offset_val;
56  v16u8 src0 = { 0 };
57  v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58 
59  offset_val = (unsigned) offset_in << log2_denom;
60 
61  wgt = __msa_fill_h(src_weight);
62  offset = __msa_fill_h(offset_val);
63  denom = __msa_fill_h(log2_denom);
64 
65  LW4(data, stride, tp0, tp1, tp2, tp3);
66  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67  UNPCK_UB_SH(src0, src0_r, src1_r);
68  MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69  ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70  MAXI_SH2_SH(tmp0, tmp1, 0);
71  tmp0 = __msa_srlr_h(tmp0, denom);
72  tmp1 = __msa_srlr_h(tmp1, denom);
73  SAT_UH2_SH(tmp0, tmp1, 7);
74  src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75  ST_W4(src0, 0, 1, 2, 3, data, stride);
76 }
77 
78 static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79  int32_t log2_denom, int32_t src_weight,
80  int32_t offset_in)
81 {
82  uint32_t tp0, tp1, tp2, tp3, offset_val;
83  v16u8 src0 = { 0 }, src1 = { 0 };
84  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85  v8i16 wgt, denom, offset;
86 
87  offset_val = (unsigned) offset_in << log2_denom;
88 
89  wgt = __msa_fill_h(src_weight);
90  offset = __msa_fill_h(offset_val);
91  denom = __msa_fill_h(log2_denom);
92 
93  LW4(data, stride, tp0, tp1, tp2, tp3);
94  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95  LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97  UNPCK_UB_SH(src0, src0_r, src1_r);
98  UNPCK_UB_SH(src1, src2_r, src3_r);
99  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100  tmp3);
101  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102  tmp1, tmp2, tmp3);
103  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107  ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109 
110 static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111  int32_t log2_denom, int32_t src_weight,
112  int32_t offset_in)
113 {
114  uint32_t offset_val;
115  uint64_t tp0, tp1, tp2, tp3;
116  v16u8 src0 = { 0 }, src1 = { 0 };
117  v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118  v8i16 wgt, denom, offset;
119 
120  offset_val = (unsigned) offset_in << log2_denom;
121 
122  wgt = __msa_fill_h(src_weight);
123  offset = __msa_fill_h(offset_val);
124  denom = __msa_fill_h(log2_denom);
125 
126  LD4(data, stride, tp0, tp1, tp2, tp3);
127  INSERT_D2_UB(tp0, tp1, src0);
128  INSERT_D2_UB(tp2, tp3, src1);
129  UNPCK_UB_SH(src0, src0_r, src1_r);
130  UNPCK_UB_SH(src1, src2_r, src3_r);
131  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132  tmp3);
133  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134  tmp1, tmp2, tmp3);
135  MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136  SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137  SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139  ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140 }
141 
142 static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143  int32_t src_weight, int32_t offset_in)
144 {
145  uint32_t offset_val;
146  uint64_t tp0, tp1, tp2, tp3;
147  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150  v8i16 wgt, denom, offset;
151 
152  offset_val = (unsigned) offset_in << log2_denom;
153 
154  wgt = __msa_fill_h(src_weight);
155  offset = __msa_fill_h(offset_val);
156  denom = __msa_fill_h(log2_denom);
157 
158  LD4(data, stride, tp0, tp1, tp2, tp3);
159  INSERT_D2_UB(tp0, tp1, src0);
160  INSERT_D2_UB(tp2, tp3, src1);
161  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162  INSERT_D2_UB(tp0, tp1, src2);
163  INSERT_D2_UB(tp2, tp3, src3);
164  UNPCK_UB_SH(src0, src0_r, src1_r);
165  UNPCK_UB_SH(src1, src2_r, src3_r);
166  UNPCK_UB_SH(src2, src4_r, src5_r);
167  UNPCK_UB_SH(src3, src6_r, src7_r);
168  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169  tmp3);
170  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171  tmp7);
172  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173  tmp1, tmp2, tmp3);
174  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175  tmp5, tmp6, tmp7);
176  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180  src2, src3);
181  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183 
184 static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185  int32_t log2_denom, int32_t src_weight,
186  int32_t offset_in)
187 {
188  uint32_t offset_val, cnt;
189  uint64_t tp0, tp1, tp2, tp3;
190  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193  v8i16 wgt, denom, offset;
194 
195  offset_val = (unsigned) offset_in << log2_denom;
196 
197  wgt = __msa_fill_h(src_weight);
198  offset = __msa_fill_h(offset_val);
199  denom = __msa_fill_h(log2_denom);
200 
201  for (cnt = 2; cnt--;) {
202  LD4(data, stride, tp0, tp1, tp2, tp3);
203  INSERT_D2_UB(tp0, tp1, src0);
204  INSERT_D2_UB(tp2, tp3, src1);
205  LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206  INSERT_D2_UB(tp0, tp1, src2);
207  INSERT_D2_UB(tp2, tp3, src3);
208  UNPCK_UB_SH(src0, src0_r, src1_r);
209  UNPCK_UB_SH(src1, src2_r, src3_r);
210  UNPCK_UB_SH(src2, src4_r, src5_r);
211  UNPCK_UB_SH(src3, src6_r, src7_r);
212  MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213  tmp2, tmp3);
214  MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215  tmp6, tmp7);
216  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217  tmp0, tmp1, tmp2, tmp3);
218  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219  tmp4, tmp5, tmp6, tmp7);
220  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224  src2, src3);
225  ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226  data += 8 * stride;
227  }
228 }
229 
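/* The avc_biwgt_WxH_msa() functions implement H.264 bidirectional
 * (explicit) weighted prediction and write the result back over dst:
 *     dst = clip_uint8((src * src_weight + dst * dst_weight + offset)
 *                      >> (log2_denom + 1))
 * where offset = ((offset_in + 1) | 1) << log2_denom carries the
 * rounding term.  Pixels are biased by -128 (XORI ... 128) so that
 * signed 8-bit dot products (dpadd_s.h) can be used; the extra
 * 128 * (src_weight + dst_weight) folded into the offset undoes that
 * bias. */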
230 static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231  int32_t log2_denom, int32_t src_weight,
232  int32_t dst_weight, int32_t offset_in)
233 {
234  uint32_t tp0, tp1;
235  v16i8 src_wgt, dst_wgt, wgt, vec0;
236  v16u8 src0 = { 0 }, dst0 = { 0 };
237  v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238 
239  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240  offset_in += (128 * (src_weight + dst_weight));
241 
242  src_wgt = __msa_fill_b(src_weight);
243  dst_wgt = __msa_fill_b(dst_weight);
244  offset = __msa_fill_h(offset_in);
245  denom = __msa_fill_h(log2_denom + 1);
246 
247  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248 
249  LW2(src, stride, tp0, tp1);
250  INSERT_W2_UB(tp0, tp1, src0);
251  LW2(dst, stride, tp0, tp1);
252  INSERT_W2_UB(tp0, tp1, dst0);
253  XORI_B2_128_UB(src0, dst0);
254  vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256  tmp0 >>= denom;
257  tmp0 = __msa_maxi_s_h(tmp0, 0);
258  tmp0 = __msa_min_s_h(max255, tmp0);
259  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260  ST_W2(dst0, 0, 1, dst, stride);
261 }
262 
263 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264  int32_t log2_denom, int32_t src_weight,
265  int32_t dst_weight, int32_t offset_in)
266 {
267  uint32_t tp0, tp1, tp2, tp3;
268  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269  v16u8 src0, dst0;
270  v8i16 tmp0, tmp1, denom, offset;
271 
272  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273  offset_in += (128 * (src_weight + dst_weight));
274 
275  src_wgt = __msa_fill_b(src_weight);
276  dst_wgt = __msa_fill_b(dst_weight);
277  offset = __msa_fill_h(offset_in);
278  denom = __msa_fill_h(log2_denom + 1);
279 
280  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281 
282  LW4(src, stride, tp0, tp1, tp2, tp3);
283  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284  LW4(dst, stride, tp0, tp1, tp2, tp3);
285  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286  XORI_B2_128_UB(src0, dst0);
287  ILVRL_B2_SB(dst0, src0, vec0, vec1);
288  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290  tmp0 >>= denom;
291  tmp1 >>= denom;
292  CLIP_SH2_0_255(tmp0, tmp1);
293  dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295 }
296 
297 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298  int32_t log2_denom, int32_t src_weight,
299  int32_t dst_weight, int32_t offset_in)
300 {
301  uint32_t tp0, tp1, tp2, tp3;
302  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303  v16u8 src0, src1, dst0, dst1;
304  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305 
306  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307  offset_in += (128 * (src_weight + dst_weight));
308 
309  src_wgt = __msa_fill_b(src_weight);
310  dst_wgt = __msa_fill_b(dst_weight);
311  offset = __msa_fill_h(offset_in);
312  denom = __msa_fill_h(log2_denom + 1);
313  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314 
315  LW4(src, stride, tp0, tp1, tp2, tp3);
316  src += 4 * stride;
317  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318  LW4(src, stride, tp0, tp1, tp2, tp3);
319  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320  LW4(dst, stride, tp0, tp1, tp2, tp3);
321  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324  XORI_B4_128_UB(src0, src1, dst0, dst1);
325  ILVRL_B2_SB(dst0, src0, vec0, vec1);
326  ILVRL_B2_SB(dst1, src1, vec2, vec3);
327  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336 
337 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338  int32_t log2_denom, int32_t src_weight,
339  int32_t dst_weight, int32_t offset_in)
340 {
341  uint64_t tp0, tp1, tp2, tp3;
342  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343  v16u8 src0, src1, dst0, dst1;
344  v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345 
346  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347  offset_in += (128 * (src_weight + dst_weight));
348 
349  src_wgt = __msa_fill_b(src_weight);
350  dst_wgt = __msa_fill_b(dst_weight);
351  offset = __msa_fill_h(offset_in);
352  denom = __msa_fill_h(log2_denom + 1);
353 
354  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355 
356  LD4(src, stride, tp0, tp1, tp2, tp3);
357  INSERT_D2_UB(tp0, tp1, src0);
358  INSERT_D2_UB(tp2, tp3, src1);
359  LD4(dst, stride, tp0, tp1, tp2, tp3);
360  INSERT_D2_UB(tp0, tp1, dst0);
361  INSERT_D2_UB(tp2, tp3, dst1);
362  XORI_B4_128_UB(src0, src1, dst0, dst1);
363  ILVRL_B2_SB(dst0, src0, vec0, vec1);
364  ILVRL_B2_SB(dst1, src1, vec2, vec3);
365  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373 }
374 
375 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376  int32_t log2_denom, int32_t src_weight,
377  int32_t dst_weight, int32_t offset_in)
378 {
379  uint64_t tp0, tp1, tp2, tp3;
380  v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383 
384  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385  offset_in += (128 * (src_weight + dst_weight));
386 
387  src_wgt = __msa_fill_b(src_weight);
388  dst_wgt = __msa_fill_b(dst_weight);
389  offset = __msa_fill_h(offset_in);
390  denom = __msa_fill_h(log2_denom + 1);
391  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392 
393  LD4(src, stride, tp0, tp1, tp2, tp3);
394  INSERT_D2_UB(tp0, tp1, src0);
395  INSERT_D2_UB(tp2, tp3, src1);
396  LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397  INSERT_D2_UB(tp0, tp1, src2);
398  INSERT_D2_UB(tp2, tp3, src3);
399  LD4(dst, stride, tp0, tp1, tp2, tp3);
400  INSERT_D2_UB(tp0, tp1, dst0);
401  INSERT_D2_UB(tp2, tp3, dst1);
402  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403  INSERT_D2_UB(tp0, tp1, dst2);
404  INSERT_D2_UB(tp2, tp3, dst3);
405  XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406  ILVRL_B2_SB(dst0, src0, vec0, vec1);
407  ILVRL_B2_SB(dst1, src1, vec2, vec3);
408  ILVRL_B2_SB(dst2, src2, vec4, vec5);
409  ILVRL_B2_SB(dst3, src3, vec6, vec7);
410  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425 
426 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427  int32_t log2_denom, int32_t src_weight,
428  int32_t dst_weight, int32_t offset_in)
429 {
430  uint8_t cnt;
431  uint64_t tp0, tp1, tp2, tp3;
432  v16i8 src_wgt, dst_wgt, wgt;
433  v16u8 src0, src1, src2, src3;
434  v16u8 dst0, dst1, dst2, dst3;
435  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436  v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437  v8i16 denom, offset;
438 
439  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440  offset_in += (128 * (src_weight + dst_weight));
441 
442  src_wgt = __msa_fill_b(src_weight);
443  dst_wgt = __msa_fill_b(dst_weight);
444  offset = __msa_fill_h(offset_in);
445  denom = __msa_fill_h(log2_denom + 1);
446  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447 
448  for (cnt = 2; cnt--;) {
449  LD4(src, stride, tp0, tp1, tp2, tp3);
450  src += 4 * stride;
451  INSERT_D2_UB(tp0, tp1, src0);
452  INSERT_D2_UB(tp2, tp3, src1);
453  LD4(src, stride, tp0, tp1, tp2, tp3);
454  src += 4 * stride;
455  INSERT_D2_UB(tp0, tp1, src2);
456  INSERT_D2_UB(tp2, tp3, src3);
457  LD4(dst, stride, tp0, tp1, tp2, tp3);
458  INSERT_D2_UB(tp0, tp1, dst0);
459  INSERT_D2_UB(tp2, tp3, dst1);
460  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461  INSERT_D2_UB(tp0, tp1, dst2);
462  INSERT_D2_UB(tp2, tp3, dst3);
463  XORI_B4_128_UB(src0, src1, src2, src3);
464  XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466  vec0, vec2, vec4, vec6);
467  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468  vec1, vec3, vec5, vec7);
469 
470  temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471  temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472  temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473  temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474  temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475  temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476  temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477  temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478 
479  SRA_4V(temp0, temp1, temp2, temp3, denom);
480  SRA_4V(temp4, temp5, temp6, temp7, denom);
481  CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482  PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483  dst0, dst1, dst2, dst3);
484  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485  dst += 8 * stride;
486  }
487 }
488 
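/* Strong (bS = 4) luma filtering of one edge side.  Given p3..q1 (or
 * the mirrored q3..p1) this computes the H.264 strong-filter outputs:
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 */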
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
490  q3_or_p3_org_in, p1_or_q1_org_in, \
491  p2_or_q2_org_in, q1_or_p1_org_in, \
492  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
493 { \
494  v8i16 threshold; \
495  v8i16 const3 = __msa_ldi_h(3); \
496  \
497  threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
498  threshold += (p1_or_q1_org_in); \
499  \
500  (p0_or_q0_out) = threshold << 1; \
501  (p0_or_q0_out) += (p2_or_q2_org_in); \
502  (p0_or_q0_out) += (q1_or_p1_org_in); \
503  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
504  \
505  (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
506  (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
507  \
508  (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
509  (p2_or_q2_out) += (p3_or_q3_org_in); \
510  (p2_or_q2_out) += (p3_or_q3_org_in); \
511  (p2_or_q2_out) += threshold; \
512  (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
513 }
514 
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
517  p1_or_q1_org_in, p0_or_q0_out) \
518 { \
519  (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
520  (p0_or_q0_out) += (p1_or_q1_org_in); \
521  (p0_or_q0_out) += (p1_or_q1_org_in); \
522  (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
523 }
524 
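/* Normal (bS < 4) filtering of p1 (or q1):
 *     p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
 */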
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
526  p1_or_q1_org_in, p2_or_q2_org_in, \
527  negate_tc_in, tc_in, p1_or_q1_out) \
528 { \
529  v8i16 clip3, temp; \
530  \
531  clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
532  (v8u16) q0_or_p0_org_in); \
533  temp = p1_or_q1_org_in << 1; \
534  clip3 = clip3 - temp; \
535  clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
536  CLIP_SH(clip3, negate_tc_in, tc_in); \
537  p1_or_q1_out = p1_or_q1_org_in + clip3; \
538 }
539 
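/* Normal (bS < 4) filtering of the p0/q0 pair:
 *     delta = clip3(-tc, tc, (4 * (q0 - p0) + (p1 - q1) + 4) >> 3)
 *     p0'   = clip_uint8(p0 + delta)
 *     q0'   = clip_uint8(q0 - delta)
 */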
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
541  p1_or_q1_org_in, q1_or_p1_org_in, \
542  negate_threshold_in, threshold_in, \
543  p0_or_q0_out, q0_or_p0_out) \
544 { \
545  v8i16 q0_sub_p0, p1_sub_q1, delta; \
546  \
547  q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
548  p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
549  q0_sub_p0 <<= 2; \
550  p1_sub_q1 += 4; \
551  delta = q0_sub_p0 + p1_sub_q1; \
552  delta >>= 3; \
553  \
554  CLIP_SH(delta, negate_threshold_in, threshold_in); \
555  \
556  p0_or_q0_out = p0_or_q0_org_in + delta; \
557  q0_or_p0_out = q0_or_p0_org_in - delta; \
558  \
559  CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
560 }
561 
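/* Horizontal loop filter of a vertical chroma 4:2:2 edge: four rows of
 * [p1 p0 q0 q1] are loaded around the edge and transposed, the normal
 * p0/q0 delta is applied where |p0 - q0| < alpha and |p1 - p0|,
 * |q1 - q0| < beta, and the filtered p0/q0 pairs are returned
 * interleaved in 'res'. */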
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
563 { \
564  uint32_t load0, load1, load2, load3; \
565  v16u8 src0 = { 0 }; \
566  v16u8 src1 = { 0 }; \
567  v16u8 src2 = { 0 }; \
568  v16u8 src3 = { 0 }; \
569  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
570  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
571  v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
572  v8i16 res0_r, res1_r; \
573  v16i8 zeros = { 0 }; \
574  v16u8 res0, res1; \
575  \
576  LW4((src - 2), stride, load0, load1, load2, load3); \
577  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
578  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
579  src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
580  src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
581  \
582  TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
583  \
584  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
585  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
586  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
587  \
588  tc = __msa_fill_h(tc_val); \
589  \
590  is_less_than_alpha = (p0_asub_q0 < alpha); \
591  is_less_than_beta = (p1_asub_p0 < beta); \
592  is_less_than = is_less_than_alpha & is_less_than_beta; \
593  is_less_than_beta = (q1_asub_q0 < beta); \
594  is_less_than = is_less_than_beta & is_less_than; \
595  \
596  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
597  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
598  \
599  q0_sub_p0 <<= 2; \
600  delta = q0_sub_p0 + p1_sub_q1; \
601  delta = __msa_srari_h(delta, 3); \
602  \
603  CLIP_SH(delta, -tc, tc); \
604  \
605  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
606  \
607  res0_r += delta; \
608  res1_r -= delta; \
609  \
610  CLIP_SH2_0_255(res0_r, res1_r); \
611  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
612  \
613  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
614  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
615  \
616  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
617 }
618 
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
620 { \
621  v16i8 zero_m = { 0 }; \
622  \
623  out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
624  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
625  SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
626 }
627 
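/* Two-row variant of AVC_LPF_H_CHROMA_422: only two lines are loaded
 * and rearranged with TRANSPOSE2x4_B_UB before the same p0/q0
 * filtering is applied. */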
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
629 { \
630  uint32_t load0, load1; \
631  v16u8 src0 = { 0 }; \
632  v16u8 src1 = { 0 }; \
633  v16u8 src2 = { 0 }; \
634  v16u8 src3 = { 0 }; \
635  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
636  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
637  v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
638  v16i8 zeros = { 0 }; \
639  v16u8 res0, res1; \
640  \
641  load0 = LW(src - 2); \
642  load1 = LW(src - 2 + stride); \
643  \
644  src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
645  src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
646  \
647  TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
648  \
649  p0_asub_q0 = __msa_asub_u_b(src2, src1); \
650  p1_asub_p0 = __msa_asub_u_b(src1, src0); \
651  q1_asub_q0 = __msa_asub_u_b(src2, src3); \
652  \
653  tc = __msa_fill_h(tc_val); \
654  \
655  is_less_than_alpha = (p0_asub_q0 < alpha); \
656  is_less_than_beta = (p1_asub_p0 < beta); \
657  is_less_than = is_less_than_alpha & is_less_than_beta; \
658  is_less_than_beta = (q1_asub_q0 < beta); \
659  is_less_than = is_less_than_beta & is_less_than; \
660  \
661  ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
662  HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
663  \
664  q0_sub_p0 <<= 2; \
665  delta = q0_sub_p0 + p1_sub_q1; \
666  delta = __msa_srari_h(delta, 3); \
667  CLIP_SH(delta, -tc, tc); \
668  \
669  ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
670  \
671  res0_r += delta; \
672  res1_r -= delta; \
673  \
674  CLIP_SH2_0_255(res0_r, res1_r); \
675  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
676  \
677  res0 = __msa_bmnz_v(src1, res0, is_less_than); \
678  res1 = __msa_bmnz_v(src2, res1, is_less_than); \
679  \
680  res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
681 }
682 
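/* Intra (bS = 4) luma deblocking of a horizontal edge.  Filtering is
 * enabled where |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta;
 * the strong p1/p2 (resp. q1/q2) updates are additionally gated on
 * |p0 - q0| < (alpha >> 2) + 2 and |p2 - p0| < beta (resp.
 * |q2 - q0| < beta), otherwise only the short (2*p1 + p0 + q1 + 2) >> 2
 * filter is applied to p0/q0. */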
683 static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
684  uint8_t alpha_in,
685  uint8_t beta_in,
686  ptrdiff_t img_width)
687 {
688  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690  v16u8 p1_org, p0_org, q0_org, q1_org;
691 
692  LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693 
694  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697 
698  is_less_than_alpha = (p0_asub_q0 < alpha_in);
699  is_less_than_beta = (p1_asub_p0 < beta_in);
700  is_less_than = is_less_than_beta & is_less_than_alpha;
701  is_less_than_beta = (q1_asub_q0 < beta_in);
702  is_less_than = is_less_than_beta & is_less_than;
703 
704  if (!__msa_test_bz_v(is_less_than)) {
705  v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706  v8i16 p0_r = { 0 };
707  v8i16 q0_r = { 0 };
708  v8i16 p0_l = { 0 };
709  v8i16 q0_l = { 0 };
710  v16i8 zero = { 0 };
711  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713  v16u8 q2_org = LD_UB(data + (2 * img_width));
714  v16u8 p2_org = LD_UB(data - (3 * img_width));
715  v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716 
717  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720 
721  tmp_flag = (p0_asub_q0 < tmp_flag);
722 
723  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724  is_less_than_beta = (p2_asub_p0 < beta_in);
725  is_less_than_beta = is_less_than_beta & tmp_flag;
726  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727  is_less_than_beta = is_less_than_beta & is_less_than;
728  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729 
730  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732 
733  /* combine and store */
734  if (!__msa_test_bz_v(is_less_than_beta)) {
735  v8i16 p3_org_l, p3_org_r;
736  v16u8 p3_org = LD_UB(data - (img_width << 2));
737  v16u8 p2, p1;
738  v8i16 p2_r = { 0 };
739  v8i16 p2_l = { 0 };
740  v8i16 p1_r = { 0 };
741  v8i16 p1_l = { 0 };
742 
743  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745  p2_r, q1_org_r, p0_r, p1_r, p2_r);
746 
747  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749  p2_l, q1_org_l, p0_l, p1_l, p2_l);
750 
751  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752 
753  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756 
757  ST_UB(p1_org, data - (2 * img_width));
758  ST_UB(p2_org, data - (3 * img_width));
759  }
760 
761  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763 
764  /* combine */
765  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767 
768  ST_UB(p0_org, data - img_width);
769 
770  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772  is_less_than_beta = (q2_asub_q0 < beta_in);
773  is_less_than_beta = is_less_than_beta & tmp_flag;
774  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775  is_less_than_beta = is_less_than_beta & is_less_than;
776  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777 
778  /* combine and store */
779  if (!__msa_test_bz_v(is_less_than_beta)) {
780  v8i16 q3_org_r, q3_org_l;
781  v16u8 q3_org = LD_UB(data + (3 * img_width));
782  v16u8 q1, q2;
783  v8i16 q2_r = { 0 };
784  v8i16 q2_l = { 0 };
785  v8i16 q1_r = { 0 };
786  v8i16 q1_l = { 0 };
787 
788  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790  q2_r, p1_org_r, q0_r, q1_r, q2_r);
791 
792  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794  q2_l, p1_org_l, q0_l, q1_l, q2_l);
795 
796  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800 
801  ST_UB(q1_org, data + img_width);
802  ST_UB(q2_org, data + 2 * img_width);
803  }
804 
805  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807 
808  /* combine */
809  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811 
812  ST_UB(q0_org, data);
813  }
814 }
815 
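/* Vertical-edge counterpart of the intra luma filter: 16 rows are
 * loaded and transposed so the horizontal-edge arithmetic can be
 * reused, then the filtered p2..q2 columns are transposed back and
 * stored row by row. */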
816 static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
817  uint8_t alpha_in,
818  uint8_t beta_in,
819  ptrdiff_t img_width)
820 {
821  uint8_t *src = data - 4;
822  v16u8 alpha, beta, p0_asub_q0;
823  v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824  v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825  v16u8 p1_asub_p0, q1_asub_q0;
826 
827 
828  {
829  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831 
832  LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833  LD_UB8(src + (8 * img_width), img_width,
834  row8, row9, row10, row11, row12, row13, row14, row15);
835 
836  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837  row4, row5, row6, row7,
838  row8, row9, row10, row11,
839  row12, row13, row14, row15,
840  p3_org, p2_org, p1_org, p0_org,
841  q0_org, q1_org, q2_org, q3_org);
842  }
843 
844  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847 
848  alpha = (v16u8) __msa_fill_b(alpha_in);
849  beta = (v16u8) __msa_fill_b(beta_in);
850 
851  is_less_than_alpha = (p0_asub_q0 < alpha);
852  is_less_than_beta = (p1_asub_p0 < beta);
853  is_less_than = is_less_than_beta & is_less_than_alpha;
854  is_less_than_beta = (q1_asub_q0 < beta);
855  is_less_than = is_less_than_beta & is_less_than;
856 
857  if (!__msa_test_bz_v(is_less_than)) {
858  v8i16 p0_r = { 0 };
859  v8i16 q0_r = { 0 };
860  v8i16 p0_l = { 0 };
861  v8i16 q0_l = { 0 };
862  v16i8 zero = { 0 };
863  v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864  v16u8 negate_is_less_than_beta;
865  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867 
868  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871  UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872 
873  tmp_flag = alpha >> 2;
874  tmp_flag = tmp_flag + 2;
875  tmp_flag = (p0_asub_q0 < tmp_flag);
876 
877  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878  is_less_than_beta = (p2_asub_p0 < beta);
879  is_less_than_beta = tmp_flag & is_less_than_beta;
880  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881  is_less_than_beta = is_less_than_beta & is_less_than;
882  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883 
884  if (!__msa_test_bz_v(is_less_than_beta)) {
885  v16u8 p2, p1;
886  v8i16 p3_org_r, p3_org_l;
887  v8i16 p2_l = { 0 };
888  v8i16 p2_r = { 0 };
889  v8i16 p1_l = { 0 };
890  v8i16 p1_r = { 0 };
891 
892  ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894  p2_r, q1_org_r, p0_r, p1_r, p2_r);
895 
896  ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898  p2_l, q1_org_l, p0_l, p1_l, p2_l);
899 
900  PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903  p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904  }
905 
906  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907  AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908 
909  p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910  p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911 
912  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913  is_less_than_beta = (q2_asub_q0 < beta);
914 
915  is_less_than_beta = is_less_than_beta & tmp_flag;
916  negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917 
918  is_less_than_beta = is_less_than_beta & is_less_than;
919  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920 
921  if (!__msa_test_bz_v(is_less_than_beta)) {
922  v16u8 q1, q2;
923  v8i16 q3_org_r, q3_org_l;
924  v8i16 q1_l = { 0 };
925  v8i16 q1_r = { 0 };
926  v8i16 q2_l = { 0 };
927  v8i16 q2_r = { 0 };
928 
929  ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931  q2_r, p1_org_r, q0_r, q1_r, q2_r);
932 
933  ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935  q2_l, p1_org_l, q0_l, q1_l, q2_l);
936 
937  PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940  q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941  }
942 
943  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944  AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945 
946  q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947  q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948 
949  {
950  v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951 
952  ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953  ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954  ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955 
956  ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957  ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958 
959  src = data - 3;
960  ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961  ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962  src += 4 * img_width;
963  ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964  ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965  src += 4 * img_width;
966 
967  ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968  ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969  src += 4 * img_width;
970  ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971  ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972  }
973  }
974 }
975 
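/* MBAFF intra luma filtering of a vertical edge across 8 rows: strong
 * and normal filter results are computed for p2..q2 and selected per
 * pixel according to the alpha/beta comparisons, then 6 bytes per row
 * are written back. */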
976 static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
977  ptrdiff_t stride,
978  int32_t alpha_in,
979  int32_t beta_in)
980 {
981  uint64_t load0, load1;
982  uint32_t out0, out2;
983  uint16_t out1, out3;
984  v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985  v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986  v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987  v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988  v8i16 tmp0, tmp1, tmp2, tmp3;
989  v16u8 alpha, beta;
990  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992  v16u8 is_less_than_beta1, is_less_than_beta2;
993  v16i8 src0 = { 0 };
994  v16i8 src1 = { 0 };
995  v16i8 src2 = { 0 };
996  v16i8 src3 = { 0 };
997  v16i8 src4 = { 0 };
998  v16i8 src5 = { 0 };
999  v16i8 src6 = { 0 };
1000  v16i8 src7 = { 0 };
1001  v16i8 zeros = { 0 };
1002 
1003  load0 = LD(src - 4);
1004  load1 = LD(src + stride - 4);
1005  src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006  src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007 
1008  load0 = LD(src + (2 * stride) - 4);
1009  load1 = LD(src + (3 * stride) - 4);
1010  src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011  src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012 
1013  load0 = LD(src + (4 * stride) - 4);
1014  load1 = LD(src + (5 * stride) - 4);
1015  src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016  src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017 
1018  load0 = LD(src + (6 * stride) - 4);
1019  load1 = LD(src + (7 * stride) - 4);
1020  src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021  src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022 
1023  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024  src0, src1, src2, src3);
1025 
1026  ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027  ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028 
1029  ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030  ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031  SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032  8, src0, src2, src4, src7);
1033 
1034  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037 
1038  alpha = (v16u8) __msa_fill_b(alpha_in);
1039  beta = (v16u8) __msa_fill_b(beta_in);
1040 
1041  is_less_than_alpha = (p0_asub_q0 < alpha);
1042  is_less_than_beta = (p1_asub_p0 < beta);
1043  is_less_than = is_less_than_alpha & is_less_than_beta;
1044  is_less_than_beta = (q1_asub_q0 < beta);
1045  is_less_than = is_less_than & is_less_than_beta;
1046 
1047  alpha >>= 2;
1048  alpha += 2;
1049 
1050  is_less_than_alpha = (p0_asub_q0 < alpha);
1051 
1052  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053  is_less_than_beta1 = (p2_asub_p0 < beta);
1054  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055  is_less_than_beta2 = (q2_asub_q0 < beta);
1056 
1057  ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058  src0_r, src1_r, src2_r, src3_r);
1059  ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060  src4_r, src5_r, src6_r, src7_r);
1061 
1062  dst2_x_r = src1_r + src2_r + src3_r;
1063  dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064  dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065  dst1_r = src0_r + src1_r + src2_r + src3_r;
1066  dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067 
1068  dst0_r = (2 * src6_r) + (3 * src0_r);
1069  dst0_r += src1_r + src2_r + src3_r;
1070  dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073 
1074  PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076 
1077  dst3_x_r = src2_r + src3_r + src4_r;
1078  dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079  dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080  dst4_r = src2_r + src3_r + src4_r + src5_r;
1081  dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082 
1083  dst5_r = (2 * src7_r) + (3 * src5_r);
1084  dst5_r += src4_r + src3_r + src2_r;
1085  dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088 
1089  PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091 
1092  dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093  dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094  dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095  dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096 
1097  PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098 
1099  dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100  dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101  dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102  dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103 
1104  is_less_than = is_less_than_alpha & is_less_than;
1105  dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106  is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107  dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108 
1109  dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110  dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111  dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112  is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113  dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114  dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115  dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116 
1117  ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118  dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119  ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120  ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121 
1122  ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123  SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124  dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125  dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126  SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127 
1128  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129  out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130  out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131  out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132 
1133  SW(out0, (src - 3));
1134  SH(out1, (src + 1));
1135  src += stride;
1136  SW(out2, (src - 3));
1137  SH(out3, (src + 1));
1138  src += stride;
1139 
1140  out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141  out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142  out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143  out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144 
1145  SW(out0, (src - 3));
1146  SH(out1, (src + 1));
1147  src += stride;
1148  SW(out2, (src - 3));
1149  SH(out3, (src + 1));
1150  src += stride;
1151 
1152  out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153  out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154  out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155  out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156 
1157  SW(out0, (src - 3));
1158  SH(out1, (src + 1));
1159  src += stride;
1160  SW(out2, (src - 3));
1161  SH(out3, (src + 1));
1162  src += stride;
1163 
1164  out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165  out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166  out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167  out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168 
1169  SW(out0, (src - 3));
1170  SH(out1, (src + 1));
1171  src += stride;
1172  SW(out2, (src - 3));
1173  SH(out3, (src + 1));
1174 }
1175 
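/* Intra (bS = 4) chroma filtering of a horizontal edge, applied to
 * either the Cb or the Cr plane:
 *     p0' = (2*p1 + p0 + q1 + 2) >> 2
 *     q0' = (2*q1 + q0 + p1 + 2) >> 2
 * wherever |p0 - q0| < alpha and |p1 - p0|, |q1 - q0| < beta. */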
1176 static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1177  uint8_t alpha_in,
1178  uint8_t beta_in,
1179  ptrdiff_t img_width)
1180 {
1181  v16u8 alpha, beta;
1182  v16u8 is_less_than;
1183  v8i16 p0_or_q0, q0_or_p0;
1184  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185  v16i8 zero = { 0 };
1186  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187  v16u8 is_less_than_alpha, is_less_than_beta;
1188  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189 
1190  alpha = (v16u8) __msa_fill_b(alpha_in);
1191  beta = (v16u8) __msa_fill_b(beta_in);
1192 
1193  LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194  p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195 
1196  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199 
1200  is_less_than_alpha = (p0_asub_q0 < alpha);
1201  is_less_than_beta = (p1_asub_p0 < beta);
1202  is_less_than = is_less_than_beta & is_less_than_alpha;
1203  is_less_than_beta = (q1_asub_q0 < beta);
1204  is_less_than = is_less_than_beta & is_less_than;
1205 
1206  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207 
1208  if (!__msa_test_bz_v(is_less_than)) {
1209  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214 
1215  p0_or_q0_org =
1216  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217  q0_or_p0_org =
1218  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219 
1220  ST_UB(q0_or_p0_org, data_cb_or_cr);
1221  ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222  }
1223 }
1224 
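/* Vertical-edge counterpart of the intra chroma filter: 8 rows of
 * [p1 p0 q0 q1] are transposed, p0/q0 are filtered as above, and the
 * two filtered bytes are stored back per row. */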
1225 static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1226  uint8_t alpha_in,
1227  uint8_t beta_in,
1228  ptrdiff_t img_width)
1229 {
1230  v8i16 tmp1;
1231  v16u8 alpha, beta, is_less_than;
1232  v8i16 p0_or_q0, q0_or_p0;
1233  v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234  v16i8 zero = { 0 };
1235  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236  v16u8 is_less_than_alpha, is_less_than_beta;
1237  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238 
1239  {
1240  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241 
1242  LD_UB8((data_cb_or_cr - 2), img_width,
1243  row0, row1, row2, row3, row4, row5, row6, row7);
1244 
1245  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246  p1_or_q1_org, p0_or_q0_org,
1247  q0_or_p0_org, q1_or_p1_org);
1248  }
1249 
1250  alpha = (v16u8) __msa_fill_b(alpha_in);
1251  beta = (v16u8) __msa_fill_b(beta_in);
1252 
1253  p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254  p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255  q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256 
1257  is_less_than_alpha = (p0_asub_q0 < alpha);
1258  is_less_than_beta = (p1_asub_p0 < beta);
1259  is_less_than = is_less_than_beta & is_less_than_alpha;
1260  is_less_than_beta = (q1_asub_q0 < beta);
1261  is_less_than = is_less_than_beta & is_less_than;
1262  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263 
1264  if (!__msa_test_bz_v(is_less_than)) {
1265  ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266  zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267 
1268  AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269  AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270 
1271  /* convert 16-bit output into 8-bit output */
1272  PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273 
1274  p0_or_q0_org =
1275  __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276  q0_or_p0_org =
1277  __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279 
1280  data_cb_or_cr -= 1;
1281  ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282  data_cb_or_cr += 4 * img_width;
1283  ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284  }
1285 }
1286 
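/* Inter (bS < 4) luma deblocking of a vertical edge.  16 rows are
 * transposed into p2..q2 vectors, tc is expanded from the per-4x4-block
 * pTc[] values, and the normal-filter equations (the same math as
 * AVC_LPF_P1_OR_Q1 / AVC_LPF_P0Q0 above) are evaluated on the low and
 * high halves in 16 bits before the results are transposed back. */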
1287 static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride,
1288  uint8_t iAlpha, uint8_t iBeta,
1289  uint8_t* pTc)
1290 {
1291  v16u8 p0, p1, p2, q0, q1, q2;
1292  v16i8 iTc, negiTc, negTc, flags, f;
1293  v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
1294  v8i16 tc_l, tc_r, negTc_l, negTc_r;
1295  v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
1296  // Temporary variables
1297  v8i16 t0, t1, t2, t3;
1298  v16u8 alpha, beta;
1299  v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
1300  v16i8 const_1_b = __msa_ldi_b(1);
1301  v8i16 const_1_h = __msa_ldi_h(1);
1302  v8i16 const_4_h = __msa_ldi_h(4);
1303  v8i16 const_not_255_h = __msa_ldi_h(~255);
1304  v16i8 zero = { 0 };
1305  v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2],
1306  pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2],
1307  pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2],
1308  pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
1309  negTc = zero - tc;
1310  iTc = tc;
1311 
1312  // Load data from pPix
1313  LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
1314  LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
1315  p2_l, p2_r, q0_l, q0_r);
1316  TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r,
1317  p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
1318  p2, p1, p0, q0, q1, q2, alpha, beta);
1319 
1320  alpha = (v16u8)__msa_fill_b(iAlpha);
1321  beta = (v16u8)__msa_fill_b(iBeta);
1322 
1323  bDetaP0Q0 = __msa_asub_u_b(p0, q0);
1324  bDetaP1P0 = __msa_asub_u_b(p1, p0);
1325  bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
1326  bDetaP2P0 = __msa_asub_u_b(p2, p0);
1327  bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
1328  bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
1329  bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
1330  bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
1331  bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
1332  bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
1333 
1334  // Zero-extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
1335  ILVRL_B2_SH(zero, p0, p0_r, p0_l);
1336  ILVRL_B2_SH(zero, p1, p1_r, p1_l);
1337  ILVRL_B2_SH(zero, p2, p2_r, p2_l);
1338  ILVRL_B2_SH(zero, q0, q0_r, q0_l);
1339  ILVRL_B2_SH(zero, q1, q1_r, q1_l);
1340  ILVRL_B2_SH(zero, q2, q2_r, q2_l);
1341  // Sign-extend tc, negTc from 8 bits to 16 bits
1342  flags = __msa_clt_s_b(tc, zero);
1343  ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
1344  flags = __msa_clt_s_b(negTc, zero);
1345  ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);
1346 
1347  f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
1348  flags = f & (v16i8)bDetaP2P0;
1349  flags = __msa_ceq_b(flags, zero);
1350  iTc += ((~flags) & const_1_b);
1351  flags = f & (v16i8)bDetaQ2Q0;
1352  flags = __msa_ceq_b(flags, zero);
1353  iTc += ((~flags) & const_1_b);
1354  negiTc = zero - iTc;
1355  // Sign-extend iTc, negiTc from 8 bits to 16 bits
1356  flags = __msa_clt_s_b(iTc, zero);
1357  ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
1358  flags = __msa_clt_s_b(negiTc, zero);
1359  ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);
1360 
1361  // Calculate the left part
1362  // p1
1363  t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
1364  t0 = __msa_max_s_h(negTc_l, t0);
1365  t0 = __msa_min_s_h(tc_l, t0);
1366  t1 = p1_l + t0;
1367  // q1
1368  t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
1369  t0 = __msa_max_s_h(negTc_l, t0);
1370  t0 = __msa_min_s_h(tc_l, t0);
1371  t2 = q1_l + t0;
1372  // iDeta
1373  t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
1374  t0 = __msa_max_s_h(negiTc_l, t0);
1375  t0 = __msa_min_s_h(iTc_l, t0);
1376  p1_l = t1;
1377  q1_l = t2;
1378  // p0
1379  t1 = p0_l + t0;
1380  t2 = t1 & const_not_255_h;
1381  t3 = __msa_cle_s_h((v8i16)zero, t1);
1382  flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1383  p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1384  // q0
1385  t1 = q0_l - t0;
1386  t2 = t1 & const_not_255_h;
1387  t3 = __msa_cle_s_h((v8i16)zero, t1);
1388  flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1389  q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1390 
1391  // Calculate the right part
1392  // p1
1393  t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
1394  t0 = __msa_max_s_h(negTc_r, t0);
1395  t0 = __msa_min_s_h(tc_r, t0);
1396  t1 = p1_r + t0;
1397  // q1
1398  t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
1399  t0 = __msa_max_s_h(negTc_r, t0);
1400  t0 = __msa_min_s_h(tc_r, t0);
1401  t2 = q1_r + t0;
1402  // iDeta
1403  t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
1404  t0 = __msa_max_s_h(negiTc_r, t0);
1405  t0 = __msa_min_s_h(iTc_r, t0);
1406  p1_r = t1;
1407  q1_r = t2;
1408  // p0
1409  t1 = p0_r + t0;
1410  t2 = t1 & const_not_255_h;
1411  t3 = __msa_cle_s_h((v8i16)zero, t1);
1412  flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1413  p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1414  // q0
1415  t1 = q0_r - t0;
1416  t2 = t1 & const_not_255_h;
1417  t3 = __msa_cle_s_h((v8i16)zero, t1);
1418  flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1419  q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1420 
1421  // Combine the left and right halves
1422  PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
1423  t0, t1, t2, t3);
1424  flags = (v16i8)__msa_cle_s_b(zero, tc);
1425  flags &= f;
1426  p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
1427  q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
1428  // Using t1, t2 as temporary flags
1429  t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
1430  p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
1431  t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
1432  q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));
1433 
1434  ILVRL_B2_SH(p0, p1, t0, t1);
1435  ILVRL_B2_SH(q1, q0, t2, t3);
1436  ILVRL_H2_UB(t2, t0, p1, p0);
1437  ILVRL_H2_UB(t3, t1, q0, q1);
1438  // Store data to pPix
1439  ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
1440  ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
1441 }
1442 
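/* Inter (bS < 4) luma deblocking of a horizontal edge: bs0..bs3 and
 * tc0..tc3 hold the boundary strength and clipping threshold of each
 * 4-pixel group; pixels are filtered only where bs > 0 and the
 * alpha/beta conditions hold, with p1/q1 additionally updated (and tc
 * widened) where |p2 - p0| or |q2 - q0| < beta. */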
1443 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1444  uint8_t bs0, uint8_t bs1,
1445  uint8_t bs2, uint8_t bs3,
1446  uint8_t tc0, uint8_t tc1,
1447  uint8_t tc2, uint8_t tc3,
1448  uint8_t alpha_in,
1449  uint8_t beta_in,
1450  ptrdiff_t image_width)
1451 {
1452  v16u8 tmp_vec;
1453  v16u8 bs = { 0 };
1454 
1455  tmp_vec = (v16u8) __msa_fill_b(bs0);
1456  bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1457  tmp_vec = (v16u8) __msa_fill_b(bs1);
1458  bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1459  tmp_vec = (v16u8) __msa_fill_b(bs2);
1460  bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1461  tmp_vec = (v16u8) __msa_fill_b(bs3);
1462  bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1463 
1464  if (!__msa_test_bz_v(bs)) {
1465  v16u8 alpha, beta, is_less_than, is_less_than_beta;
1466  v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1467  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1468  v16u8 is_less_than_alpha, is_bs_greater_than0;
1469  v8i16 p0_r, q0_r, p0_l, q0_l;
1470  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1471  v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1472  v16i8 zero = { 0 };
1473  v16i8 tc = { 0 };
1474 
1475  tmp_vec = (v16u8) __msa_fill_b(tc0);
1476  tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1477  tmp_vec = (v16u8) __msa_fill_b(tc1);
1478  tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1479  tmp_vec = (v16u8) __msa_fill_b(tc2);
1480  tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1481  tmp_vec = (v16u8) __msa_fill_b(tc3);
1482  tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1483 
1484  alpha = (v16u8) __msa_fill_b(alpha_in);
1485  beta = (v16u8) __msa_fill_b(beta_in);
1486 
1487  LD_UB5(data - (3 * image_width), image_width,
1488  p2_org, p1_org, p0_org, q0_org, q1_org);
1489 
1490  is_bs_greater_than0 = ((v16u8) zero < bs);
1491  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1492  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1493  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1494 
1495  is_less_than_alpha = (p0_asub_q0 < alpha);
1496  is_less_than_beta = (p1_asub_p0 < beta);
1497  is_less_than = is_less_than_beta & is_less_than_alpha;
1498  is_less_than_beta = (q1_asub_q0 < beta);
1499  is_less_than = is_less_than_beta & is_less_than;
1500  is_less_than = is_less_than & is_bs_greater_than0;
1501 
1502  if (!__msa_test_bz_v(is_less_than)) {
1503  v16i8 sign_negate_tc, negate_tc;
1504  v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1505  v16u8 p2_asub_p0, q2_asub_q0;
1506 
1507  q2_org = LD_UB(data + (2 * image_width));
1508  negate_tc = zero - tc;
1509  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1510 
1511  ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1512 
1513  UNPCK_UB_SH(tc, tc_r, tc_l);
1514  UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1515  UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1516  UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1517 
1518  p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1519  is_less_than_beta = (p2_asub_p0 < beta);
1520  is_less_than_beta = is_less_than_beta & is_less_than;
1521 
1522  if (!__msa_test_bz_v(is_less_than_beta)) {
1523  v16u8 p1;
1524  v8i16 p1_r = { 0 };
1525  v8i16 p1_l = { 0 };
1526  v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1527  v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1528 
1529  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1530  negate_tc_r, tc_r, p1_r);
1531  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1532  i16_negatetc_l, tc_l, p1_l);
1533 
1534  p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1535  p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1536  ST_UB(p1_org, data - (2 * image_width));
1537 
1538  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1539  tc = tc + (v16i8) is_less_than_beta;
1540  }
1541 
1542  q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1543  is_less_than_beta = (q2_asub_q0 < beta);
1544  is_less_than_beta = is_less_than_beta & is_less_than;
1545 
1546  q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1547  q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1548 
1549  if (!__msa_test_bz_v(is_less_than_beta)) {
1550  v16u8 q1;
1551  v8i16 q1_r = { 0 };
1552  v8i16 q1_l = { 0 };
1553  v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1554  v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1555 
1556  AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1557  negate_tc_r, tc_r, q1_r);
1558  AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1559  i16_negatetc_l, tc_l, q1_l);
1560 
1561  q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1562  q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1563  ST_UB(q1_org, data + image_width);
1564 
1565  is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1566  tc = tc + (v16i8) is_less_than_beta;
1567  }
1568  {
1569  v16i8 negate_thresh, sign_negate_thresh;
1570  v8i16 threshold_r, threshold_l;
1571  v8i16 negate_thresh_l, negate_thresh_r;
1572 
1573  negate_thresh = zero - tc;
1574  sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1575 
1576  ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1577  threshold_r, negate_thresh_r);
1578  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1579  negate_thresh_r, threshold_r, p0_r, q0_r);
1580 
1581  threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1582  negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1583  negate_thresh);
1584  AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1585  negate_thresh_l, threshold_l, p0_l, q0_l);
1586  }
1587 
1588  PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1589 
1590  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1591  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1592 
1593  ST_UB(p0_org, (data - image_width));
1594  ST_UB(q0_org, data);
1595  }
1596  }
1597 }
1598 
1599 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1600  int32_t alpha_in, int32_t beta_in,
1601  int8_t *tc0)
1602 {
1603  uint8_t *data = in;
1604  uint32_t out0, out1, out2, out3;
1605  uint64_t load;
1606  uint32_t tc_val;
1607  v16u8 alpha, beta;
1608  v16i8 inp0 = { 0 };
1609  v16i8 inp1 = { 0 };
1610  v16i8 inp2 = { 0 };
1611  v16i8 inp3 = { 0 };
1612  v16i8 inp4 = { 0 };
1613  v16i8 inp5 = { 0 };
1614  v16i8 inp6 = { 0 };
1615  v16i8 inp7 = { 0 };
1616  v16i8 src0, src1, src2, src3;
1617  v8i16 src4, src5, src6, src7;
1618  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1619  v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1620  v16u8 is_less_than_beta1, is_less_than_beta2;
1621  v8i16 tc, tc_orig_r, tc_plus1;
1622  v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1623  v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1624  v8i16 src2_r, src3_r;
1625  v8i16 p2_r, p1_r, q2_r, q1_r;
1626  v16u8 p2, q2, p0, q0;
1627  v4i32 dst0, dst1;
1628  v16i8 zeros = { 0 };
1629 
1630  alpha = (v16u8) __msa_fill_b(alpha_in);
1631  beta = (v16u8) __msa_fill_b(beta_in);
1632 
1633  if (tc0[0] < 0) {
1634  data += (2 * stride);
1635  } else {
1636  load = LD(data - 3);
1637  inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1638  load = LD(data - 3 + stride);
1639  inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1640  data += (2 * stride);
1641  }
1642 
1643  if (tc0[1] < 0) {
1644  data += (2 * stride);
1645  } else {
1646  load = LD(data - 3);
1647  inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1648  load = LD(data - 3 + stride);
1649  inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1650  data += (2 * stride);
1651  }
1652 
1653  if (tc0[2] < 0) {
1654  data += (2 * stride);
1655  } else {
1656  load = LD(data - 3);
1657  inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1658  load = LD(data - 3 + stride);
1659  inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1660  data += (2 * stride);
1661  }
1662 
1663  if (tc0[3] < 0) {
1664  data += (2 * stride);
1665  } else {
1666  load = LD(data - 3);
1667  inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1668  load = LD(data - 3 + stride);
1669  inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1670  data += (2 * stride);
1671  }
1672 
1673  ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1674  src0, src1, src2, src3);
1675 
1676  ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1677  ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1678 
1679  src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1680  src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1681  src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1682  src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1683  src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1684  src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1685 
1686  p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1687  p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1688  q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1689  p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1690  q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1691 
1692  is_less_than_alpha = (p0_asub_q0 < alpha);
1693  is_less_than_beta = (p1_asub_p0 < beta);
1694  is_less_than = is_less_than_alpha & is_less_than_beta;
1695  is_less_than_beta = (q1_asub_q0 < beta);
1696  is_less_than = is_less_than_beta & is_less_than;
1697 
1698  is_less_than_beta1 = (p2_asub_p0 < beta);
1699  is_less_than_beta2 = (q2_asub_q0 < beta);
1700 
1701  p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1702  p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1703  p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1704 
1705  ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1706  p2_r += p0_add_q0;
1707  p2_r >>= 1;
1708  p2_r -= p1_r;
1709  ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1710  q2_r += p0_add_q0;
1711  q2_r >>= 1;
1712  q2_r -= q1_r;
1713 
1714  tc_val = LW(tc0);
1715  tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1716  tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1717  is_tc_orig1 = tc_orig;
1718  is_tc_orig2 = tc_orig;
1719  tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1720  tc = tc_orig_r;
1721 
1722  CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1723  CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1724 
1725  p2_r += p1_r;
1726  q2_r += q1_r;
1727 
1728  PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1729 
1730  is_tc_orig1 = (zeros < is_tc_orig1);
1731  is_tc_orig2 = is_tc_orig1;
1732  is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1733  is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1734  is_tc_orig1 = is_less_than & is_tc_orig1;
1735  is_tc_orig2 = is_less_than & is_tc_orig2;
1736 
1737  p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1738  q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1739 
1740  q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1741  q0_sub_p0 <<= 2;
1742  p1_sub_q1 = p1_r - q1_r;
1743  q0_sub_p0 += p1_sub_q1;
1744  q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1745 
1746  tc_plus1 = tc + 1;
1747  is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1748  (v16i8) is_less_than_beta1);
1749  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1750  tc_plus1 = tc + 1;
1751  is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1752  (v16i8) is_less_than_beta2);
1753  tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1754 
1755  CLIP_SH(q0_sub_p0, -tc, tc);
1756 
1757  ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1758  src2_r += q0_sub_p0;
1759  src3_r -= q0_sub_p0;
1760 
1761  CLIP_SH2_0_255(src2_r, src3_r);
1762 
1763  PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1764 
1765  p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1766  q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1767 
1768  ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1769 
1770  ILVRL_H2_SW(q2, p2, dst0, dst1);
1771 
1772  data = in;
1773 
1774  out0 = __msa_copy_u_w(dst0, 0);
1775  out1 = __msa_copy_u_w(dst0, 1);
1776  out2 = __msa_copy_u_w(dst0, 2);
1777  out3 = __msa_copy_u_w(dst0, 3);
1778 
1779  if (tc0[0] < 0) {
1780  data += (2 * stride);
1781  } else {
1782  SW(out0, (data - 2));
1783  data += stride;
1784  SW(out1, (data - 2));
1785  data += stride;
1786  }
1787 
1788  if (tc0[1] < 0) {
1789  data += (2 * stride);
1790  } else {
1791  SW(out2, (data - 2));
1792  data += stride;
1793  SW(out3, (data - 2));
1794  data += stride;
1795  }
1796 
1797  out0 = __msa_copy_u_w(dst1, 0);
1798  out1 = __msa_copy_u_w(dst1, 1);
1799  out2 = __msa_copy_u_w(dst1, 2);
1800  out3 = __msa_copy_u_w(dst1, 3);
1801 
1802  if (tc0[2] < 0) {
1803  data += (2 * stride);
1804  } else {
1805  SW(out0, (data - 2));
1806  data += stride;
1807  SW(out1, (data - 2));
1808  data += stride;
1809  }
1810 
1811  if (tc0[3] >= 0) {
1812  SW(out2, (data - 2));
1813  data += stride;
1814  SW(out3, (data - 2));
1815  }
1816 }
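/*
 * The bS < 4 path implemented above uses the standard H.264 p0/q0 update.
 * A minimal scalar sketch of the same arithmetic (hypothetical helper names,
 * not part of this file), mirroring AVC_LPF_P0Q0 plus the tc clip and the
 * final clip to [0, 255]:
 */
static inline int h264_clip3_sketch(int v, int lo, int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

static inline void h264_lpf_p0q0_sketch(uint8_t *p0, uint8_t *q0,
                                        int p1, int q1, int tc)
{
    /* delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc) */
    int delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;

    delta = h264_clip3_sketch(delta, -tc, tc);
    *p0   = (uint8_t) h264_clip3_sketch(*p0 + delta, 0, 255);
    *q0   = (uint8_t) h264_clip3_sketch(*q0 - delta, 0, 255);
}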
1817 
1818 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1819  uint8_t bs0, uint8_t bs1,
1820  uint8_t bs2, uint8_t bs3,
1821  uint8_t tc0, uint8_t tc1,
1822  uint8_t tc2, uint8_t tc3,
1823  uint8_t alpha_in,
1824  uint8_t beta_in,
1825  ptrdiff_t img_width)
1826 {
1827  v16u8 alpha, beta;
1828  v8i16 tmp_vec;
1829  v8i16 bs = { 0 };
1830  v8i16 tc = { 0 };
1831  v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1832  v16u8 is_less_than;
1833  v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1834  v8i16 p0_r, q0_r;
1835  v16u8 p1_org, p0_org, q0_org, q1_org;
1836  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1837  v16i8 negate_tc, sign_negate_tc;
1838  v8i16 tc_r, negate_tc_r;
1839  v16i8 zero = { 0 };
1840 
1841  tmp_vec = (v8i16) __msa_fill_b(bs0);
1842  bs = __msa_insve_h(bs, 0, tmp_vec);
1843  tmp_vec = (v8i16) __msa_fill_b(bs1);
1844  bs = __msa_insve_h(bs, 1, tmp_vec);
1845  tmp_vec = (v8i16) __msa_fill_b(bs2);
1846  bs = __msa_insve_h(bs, 2, tmp_vec);
1847  tmp_vec = (v8i16) __msa_fill_b(bs3);
1848  bs = __msa_insve_h(bs, 3, tmp_vec);
1849 
1850  if (!__msa_test_bz_v((v16u8) bs)) {
1851  tmp_vec = (v8i16) __msa_fill_b(tc0);
1852  tc = __msa_insve_h(tc, 0, tmp_vec);
1853  tmp_vec = (v8i16) __msa_fill_b(tc1);
1854  tc = __msa_insve_h(tc, 1, tmp_vec);
1855  tmp_vec = (v8i16) __msa_fill_b(tc2);
1856  tc = __msa_insve_h(tc, 2, tmp_vec);
1857  tmp_vec = (v8i16) __msa_fill_b(tc3);
1858  tc = __msa_insve_h(tc, 3, tmp_vec);
1859 
1860  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1861 
1862  alpha = (v16u8) __msa_fill_b(alpha_in);
1863  beta = (v16u8) __msa_fill_b(beta_in);
1864 
1865  LD_UB4(data - (img_width << 1), img_width,
1866  p1_org, p0_org, q0_org, q1_org);
1867 
1868  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1869  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1870  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1871 
1872  is_less_than_alpha = (p0_asub_q0 < alpha);
1873  is_less_than_beta = (p1_asub_p0 < beta);
1874  is_less_than = is_less_than_beta & is_less_than_alpha;
1875  is_less_than_beta = (q1_asub_q0 < beta);
1876  is_less_than = is_less_than_beta & is_less_than;
1877  is_less_than = is_less_than & is_bs_greater_than0;
1878 
1879  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1880 
1881  if (!__msa_test_bz_v(is_less_than)) {
1882  negate_tc = zero - (v16i8) tc;
1883  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1884 
1885  ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
1886 
1887  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1888  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1889 
1890  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1891  tc_r, p0_r, q0_r);
1892 
1893  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1894 
1895  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1896  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1897 
1898  ST_UB(q0_org, data);
1899  ST_UB(p0_org, (data - img_width));
1900  }
1901  }
1902 }
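/*
 * Per sample, the gating condition assembled from is_less_than_alpha,
 * is_less_than_beta and the boundary strength above reduces to the scalar
 * test sketched here (hypothetical helper name, not part of this file):
 */
static inline int h264_chroma_filter_sample_sketch(int p1, int p0, int q0,
                                                   int q1, int alpha,
                                                   int beta, int bs)
{
    int abs_p0_q0 = p0 > q0 ? p0 - q0 : q0 - p0;
    int abs_p1_p0 = p1 > p0 ? p1 - p0 : p0 - p1;
    int abs_q1_q0 = q1 > q0 ? q1 - q0 : q0 - q1;

    return bs > 0 && abs_p0_q0 < alpha && abs_p1_p0 < beta && abs_q1_q0 < beta;
}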
1903 
1904 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
1905  uint8_t bs0, uint8_t bs1,
1906  uint8_t bs2, uint8_t bs3,
1907  uint8_t tc0, uint8_t tc1,
1908  uint8_t tc2, uint8_t tc3,
1909  uint8_t alpha_in,
1910  uint8_t beta_in,
1911  ptrdiff_t img_width)
1912 {
1913  uint8_t *src;
1914  v16u8 alpha, beta;
1915  v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1916  v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1917  v16u8 p0, q0;
1918  v8i16 p0_r = { 0 };
1919  v8i16 q0_r = { 0 };
1920  v16u8 p1_org, p0_org, q0_org, q1_org;
1921  v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1922  v16u8 is_bs_greater_than0;
1923  v8i16 tc_r, negate_tc_r;
1924  v16i8 negate_tc, sign_negate_tc;
1925  v16i8 zero = { 0 };
1926  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1927  v8i16 tmp1, tmp_vec, bs = { 0 };
1928  v8i16 tc = { 0 };
1929 
1930  tmp_vec = (v8i16) __msa_fill_b(bs0);
1931  bs = __msa_insve_h(bs, 0, tmp_vec);
1932  tmp_vec = (v8i16) __msa_fill_b(bs1);
1933  bs = __msa_insve_h(bs, 1, tmp_vec);
1934  tmp_vec = (v8i16) __msa_fill_b(bs2);
1935  bs = __msa_insve_h(bs, 2, tmp_vec);
1936  tmp_vec = (v8i16) __msa_fill_b(bs3);
1937  bs = __msa_insve_h(bs, 3, tmp_vec);
1938 
1939  if (!__msa_test_bz_v((v16u8) bs)) {
1940  tmp_vec = (v8i16) __msa_fill_b(tc0);
1941  tc = __msa_insve_h(tc, 0, tmp_vec);
1942  tmp_vec = (v8i16) __msa_fill_b(tc1);
1943  tc = __msa_insve_h(tc, 1, tmp_vec);
1944  tmp_vec = (v8i16) __msa_fill_b(tc2);
1945  tc = __msa_insve_h(tc, 2, tmp_vec);
1946  tmp_vec = (v8i16) __msa_fill_b(tc3);
1947  tc = __msa_insve_h(tc, 3, tmp_vec);
1948 
1949  is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1950 
1951  LD_UB8((data - 2), img_width,
1952  row0, row1, row2, row3, row4, row5, row6, row7);
1953 
1954  TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
1955  row4, row5, row6, row7,
1956  p1_org, p0_org, q0_org, q1_org);
1957 
1958  p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1959  p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1960  q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1961 
1962  alpha = (v16u8) __msa_fill_b(alpha_in);
1963  beta = (v16u8) __msa_fill_b(beta_in);
1964 
1965  is_less_than_alpha = (p0_asub_q0 < alpha);
1966  is_less_than_beta = (p1_asub_p0 < beta);
1967  is_less_than = is_less_than_beta & is_less_than_alpha;
1968  is_less_than_beta = (q1_asub_q0 < beta);
1969  is_less_than = is_less_than_beta & is_less_than;
1970  is_less_than = is_bs_greater_than0 & is_less_than;
1971 
1972  is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1973 
1974  if (!__msa_test_bz_v(is_less_than)) {
1975  ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1976  p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1977 
1978  negate_tc = zero - (v16i8) tc;
1979  sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1980 
1981  ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
1982 
1983  AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1984  tc_r, p0_r, q0_r);
1985 
1986  PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1987 
1988  p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1989  q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1990  tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
1991  src = data - 1;
1992  ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
1993  src += 4 * img_width;
1994  ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
1995  }
1996  }
1997 }
1998 
1999 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2000  int32_t alpha_in, int32_t beta_in,
2001  int8_t *tc0)
2002 {
2003  int32_t col, tc_val;
2004  v16u8 alpha, beta, res;
2005 
2006  alpha = (v16u8) __msa_fill_b(alpha_in);
2007  beta = (v16u8) __msa_fill_b(beta_in);
2008 
2009  for (col = 0; col < 4; col++) {
2010  tc_val = (tc0[col] - 1) + 1;
2011 
2012  if (tc_val <= 0) {
2013  src += (4 * stride);
2014  continue;
2015  }
2016 
2017  AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2018  ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2019  src += (4 * stride);
2020  }
2021 }
2022 
2023 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2024  ptrdiff_t stride,
2025  int32_t alpha_in,
2026  int32_t beta_in,
2027  int8_t *tc0)
2028 {
2029  int32_t col, tc_val;
2030  int16_t out0, out1;
2031  v16u8 alpha, beta, res;
2032 
2033  alpha = (v16u8) __msa_fill_b(alpha_in);
2034  beta = (v16u8) __msa_fill_b(beta_in);
2035 
2036  for (col = 0; col < 4; col++) {
2037  tc_val = (tc0[col] - 1) + 1;
2038 
2039  if (tc_val <= 0) {
2040  src += 4 * stride;
2041  continue;
2042  }
2043 
2044  AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2045 
2046  out0 = __msa_copy_s_h((v8i16) res, 0);
2047  out1 = __msa_copy_s_h((v8i16) res, 1);
2048 
2049  SH(out0, (src - 1));
2050  src += stride;
2051  SH(out1, (src - 1));
2052  src += stride;
2053  }
2054 }
2055 
2056 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2057  int alpha, int beta, int8_t *tc)
2058 {
2059 // uint8_t bs0 = 1;
2060 // uint8_t bs1 = 1;
2061 // uint8_t bs2 = 1;
2062 // uint8_t bs3 = 1;
2063 //
2064 // if (tc[0] < 0)
2065 // bs0 = 0;
2066 // if (tc[1] < 0)
2067 // bs1 = 0;
2068 // if (tc[2] < 0)
2069 // bs2 = 0;
2070 // if (tc[3] < 0)
2071 // bs3 = 0;
2072 //
2073 // avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2074 // tc[0], tc[1], tc[2], tc[3],
2075 // alpha, beta, img_width);
2076  avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, (uint8_t *)tc);
2077 }
2078 
2079 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2080  int alpha, int beta, int8_t *tc)
2081 {
2082 
2083  uint8_t bs0 = 1;
2084  uint8_t bs1 = 1;
2085  uint8_t bs2 = 1;
2086  uint8_t bs3 = 1;
2087 
2088  if (tc[0] < 0)
2089  bs0 = 0;
2090  if (tc[1] < 0)
2091  bs1 = 0;
2092  if (tc[2] < 0)
2093  bs2 = 0;
2094  if (tc[3] < 0)
2095  bs3 = 0;
2096 
2097  avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2098  tc[0], tc[1], tc[2], tc[3],
2099  alpha, beta, img_width);
2100 }
2101 
2102 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2103  int alpha, int beta, int8_t *tc)
2104 {
2105  uint8_t bs0 = 1;
2106  uint8_t bs1 = 1;
2107  uint8_t bs2 = 1;
2108  uint8_t bs3 = 1;
2109 
2110  if (tc[0] < 0)
2111  bs0 = 0;
2112  if (tc[1] < 0)
2113  bs1 = 0;
2114  if (tc[2] < 0)
2115  bs2 = 0;
2116  if (tc[3] < 0)
2117  bs3 = 0;
2118 
2119  avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2120  tc[0], tc[1], tc[2], tc[3],
2121  alpha, beta, img_width);
2122 }
2123 
2124 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2125  int alpha, int beta, int8_t *tc)
2126 {
2127  uint8_t bs0 = 1;
2128  uint8_t bs1 = 1;
2129  uint8_t bs2 = 1;
2130  uint8_t bs3 = 1;
2131 
2132  if (tc[0] < 0)
2133  bs0 = 0;
2134  if (tc[1] < 0)
2135  bs1 = 0;
2136  if (tc[2] < 0)
2137  bs2 = 0;
2138  if (tc[3] < 0)
2139  bs3 = 0;
2140 
2141  avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2142  tc[0], tc[1], tc[2], tc[3],
2143  alpha, beta, img_width);
2144 }
2145 
2146 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2147  int alpha, int beta)
2148 {
2149  avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2150  (uint8_t) beta,
2151  img_width);
2152 }
2153 
2154 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2155  int alpha, int beta)
2156 {
2157  avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2158  (uint8_t) beta,
2159  img_width);
2160 }
2161 
2162 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2163  int alpha, int beta)
2164 {
2165  avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2166  (uint8_t) beta,
2167  img_width);
2168 }
2169 
2170 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2171  int alpha, int beta)
2172 {
2173  avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2174  (uint8_t) beta,
2175  img_width);
2176 }
2177 
2178 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2179  ptrdiff_t ystride,
2180  int32_t alpha, int32_t beta,
2181  int8_t *tc0)
2182 {
2183  avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2184 }
2185 
2186 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2187  ptrdiff_t ystride,
2188  int32_t alpha,
2189  int32_t beta,
2190  int8_t *tc0)
2191 {
2192  avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2193 }
2194 
2195 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2196  ptrdiff_t ystride,
2197  int32_t alpha,
2198  int32_t beta,
2199  int8_t *tc0)
2200 {
2201  avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2202 }
2203 
2204 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2205  ptrdiff_t ystride,
2206  int32_t alpha,
2207  int32_t beta)
2208 {
2209  avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2210 }
2211 
2212 void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2213  int height, int log2_denom,
2214  int weight_src, int offset_in)
2215 {
2216  uint32_t offset_val;
2217  v16i8 zero = { 0 };
2218  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2219  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2220  v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2221  v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2222  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2223  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2224  v8i16 wgt, denom, offset;
2225 
2226  offset_val = (unsigned) offset_in << log2_denom;
2227 
2228  wgt = __msa_fill_h(weight_src);
2229  offset = __msa_fill_h(offset_val);
2230  denom = __msa_fill_h(log2_denom);
2231 
2232  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2233  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2234  src2_r, src3_r);
2235  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2236  src2_l, src3_l);
2237  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2238  src6_r, src7_r);
2239  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2240  src6_l, src7_l);
2241  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2242  tmp3);
2243  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2244  tmp7);
2245  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2246  tmp11);
2247  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2248  tmp14, tmp15);
2249  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2250  tmp1, tmp2, tmp3);
2251  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2252  tmp5, tmp6, tmp7);
2253  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2254  tmp9, tmp10, tmp11);
2255  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2256  tmp12, tmp13, tmp14, tmp15);
2257  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2258  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2259  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2260  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2261  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2262  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2263  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2264  dst2, dst3);
2265  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2266  dst5, dst6, dst7);
2267  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2268  src += 8 * stride;
2269 
2270  if (16 == height) {
2271  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2272  ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2273  src1_r, src2_r, src3_r);
2274  ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2275  src1_l, src2_l, src3_l);
2276  ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2277  src5_r, src6_r, src7_r);
2278  ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2279  src5_l, src6_l, src7_l);
2280  MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2281  tmp2, tmp3);
2282  MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2283  tmp6, tmp7);
2284  MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2285  tmp10, tmp11);
2286  MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2287  tmp14, tmp15);
2288  ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2289  tmp0, tmp1, tmp2, tmp3);
2290  ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2291  tmp4, tmp5, tmp6, tmp7);
2292  ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2293  tmp8, tmp9, tmp10, tmp11);
2294  ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2295  tmp12, tmp13, tmp14, tmp15);
2296  MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2297  MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2298  SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2299  SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2300  SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2301  SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2302  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2303  dst2, dst3);
2304  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2305  dst5, dst6, dst7);
2306  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2307  }
2308 }
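/*
 * Per pixel, the vector pipeline above (multiply, add pre-scaled offset,
 * clamp at zero, rounded shift, saturate to 8 bits) is equivalent to the
 * scalar sketch below (hypothetical helper name, not part of this file;
 * 16-bit saturation of the intermediate sums is ignored here):
 */
static inline uint8_t h264_weight_pixel_sketch(uint8_t px, int weight,
                                               int offset, int log2_denom)
{
    int v = px * weight + offset * (1 << log2_denom);

    if (v < 0)
        v = 0;
    if (log2_denom)
        v = (v + (1 << (log2_denom - 1))) >> log2_denom; /* rounded shift */
    return v > 255 ? 255 : (uint8_t) v;
}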
2309 
2310 void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2311  int height, int log2_denom,
2312  int weight_src, int offset)
2313 {
2314  if (4 == height) {
2315  avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2316  } else if (8 == height) {
2317  avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2318  } else {
2319  avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2320  }
2321 }
2322 
2323 void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2324  int height, int log2_denom,
2325  int weight_src, int offset)
2326 {
2327  if (2 == height) {
2328  avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2329  } else if (4 == height) {
2330  avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2331  } else {
2332  avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2333  }
2334 }
2335 
2336 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2337  ptrdiff_t stride, int height,
2338  int log2_denom, int weight_dst,
2339  int weight_src, int offset_in)
2340 {
2341  v16i8 src_wgt, dst_wgt, wgt;
2342  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2343  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2344  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2345  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2346  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2347  v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2348  v8i16 denom, offset;
2349 
2350  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2351  offset_in += (128 * (weight_src + weight_dst));
2352 
2353  src_wgt = __msa_fill_b(weight_src);
2354  dst_wgt = __msa_fill_b(weight_dst);
2355  offset = __msa_fill_h(offset_in);
2356  denom = __msa_fill_h(log2_denom + 1);
2357 
2358  wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2359 
2360  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2361  src += 8 * stride;
2362  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2363  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2364  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2365  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2366  vec6);
2367  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2368  vec7);
2369  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2370  vec12, vec14);
2371  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2372  vec13, vec15);
2373  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2374  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2375  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2376  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2377  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2378  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2379  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2380  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2381  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2382  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2383  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2384  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2385  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2386  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2387  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2388  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2389  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2390  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2391  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2392  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2393  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2394  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2395  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2396  dst2, dst3);
2397  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2398  dst5, dst6, dst7);
2399  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2400  dst += 8 * stride;
2401 
2402  if (16 == height) {
2403  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2404  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2405  XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2406  XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2407  ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2408  vec4, vec6);
2409  ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2410  vec5, vec7);
2411  ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2412  vec12, vec14);
2413  ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2414  vec13, vec15);
2415  tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2416  tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2417  tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2418  tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2419  tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2420  tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2421  tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2422  tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2423  tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2424  tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2425  tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2426  tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2427  tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2428  tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2429  tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2430  tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2431  SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2432  SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2433  SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2434  SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2435  CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2436  CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2437  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2438  dst2, dst3);
2439  PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2440  dst5, dst6, dst7);
2441  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2442  }
2443 }
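/*
 * Per pixel pair, the biweight pipeline above is equivalent to the scalar
 * sketch below (hypothetical helper name, not part of this file). The +-128
 * re-centering mirrors the XORI_B8_128_UB trick that lets the signed
 * dot-product intrinsic operate on unsigned pixels.
 */
static inline uint8_t h264_biweight_pixel_sketch(uint8_t src_px, uint8_t dst_px,
                                                 int weight_src, int weight_dst,
                                                 int offset, int log2_denom)
{
    int o = (((offset + 1) | 1) << log2_denom)
            + 128 * (weight_src + weight_dst);
    int v = (o + weight_src * (src_px - 128)
               + weight_dst * (dst_px - 128)) >> (log2_denom + 1);

    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}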
2444 
2445 void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2446  ptrdiff_t stride, int height,
2447  int log2_denom, int weight_dst,
2448  int weight_src, int offset)
2449 {
2450  if (4 == height) {
2451  avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2452  offset);
2453  } else if (8 == height) {
2454  avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2455  offset);
2456  } else {
2457  avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2458  offset);
2459  }
2460 }
2461 
2462 void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2463  ptrdiff_t stride, int height,
2464  int log2_denom, int weight_dst,
2465  int weight_src, int offset)
2466 {
2467  if (2 == height) {
2468  avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2469  offset);
2470  } else if (4 == height) {
2471  avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2472  offset);
2473  } else {
2474  avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2475  offset);
2476  }
2477 }