/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcpred_mips.h"

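/* HEVC intra prediction angle tables (cf. the intraPredAngle table in the
 * HEVC spec): intra_pred_angle_up covers the vertical-ish modes 18..34,
 * intra_pred_angle_low the horizontal-ish modes 2..17. */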
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};

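/* Computes two 16-pixel rows of planar prediction at once: src0_r/src0_l
 * hold the top reference row unpacked to halfwords, tmp0/tmp1 the top-right
 * and bottom-left corner pixels, vec0/vec1 the left pixels of the two rows,
 * mul_val_h0..h3 the per-column weight vectors and mul_val_b0/mul_val_b1
 * the scalar row weights. Results are rounded by 'round' bits and packed
 * to bytes in res0/res1. */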
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                             \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                     \
                                                                              \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                   \
                                                                              \
    res0_m += mul_val_h1 * tmp0;                                              \
    res1_m += mul_val_h3 * tmp0;                                              \
    res2_m += mul_val_h1 * tmp0;                                              \
    res3_m += mul_val_h3 * tmp0;                                              \
                                                                              \
    res0_m += mul_val_b0 * src0_r;                                            \
    res1_m += mul_val_b0 * src0_l;                                            \
    res2_m += (mul_val_b0 - 1) * src0_r;                                      \
    res3_m += (mul_val_b0 - 1) * src0_l;                                      \
                                                                              \
    res0_m += mul_val_b1 * tmp1;                                              \
    res1_m += mul_val_b1 * tmp1;                                              \
    res2_m += (mul_val_b1 + 1) * tmp1;                                        \
    res3_m += (mul_val_b1 + 1) * tmp1;                                        \
                                                                              \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                       \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                  \
}

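/* Vertical (mode 26) prediction: every row is a copy of the top reference
 * row. For luma (flag/c_idx == 0) the first column is additionally smoothed
 * with (src_left[y] - src_left[-1]) >> 1 added to src_top[0], as the HEVC
 * spec requires for blocks smaller than 32x32. */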
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint32_t col;
    uint32_t src_data;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

    if (0 == flag) {
        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
        }
    }
}

static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        CLIP_SH_0_255(vec2);

        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}

static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}

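/* Horizontal (mode 10) prediction: every column is a copy of the left
 * reference column, i.e. each row is filled with one left pixel. For luma
 * (flag/c_idx == 0) the first row is smoothed with
 * (src_top[x] - src_top[-1]) >> 1 added to src_left[0]. */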
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint32_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

    if (0 == flag) {
        val0 = LW(src_top);
        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
        SW(val0, dst);
    }
}

static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint64_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

    if (0 == flag) {
        val0 = LD(src_top);
        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
        SD(val0, dst);
    }
}

static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}

static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);
        dst += stride;
        ST_SB2(src1, src1, dst, 16);
        dst += stride;
        ST_SB2(src2, src2, dst, 16);
        dst += stride;
        ST_SB2(src3, src3, dst, 16);
        dst += stride;
    }
}

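/* DC prediction: the block is filled with the rounded average of the top
 * and left reference samples. For luma (flag/c_idx == 0) the boundary is
 * smoothed: dst[0][0] = (left[0] + 2 * dc + top[0] + 2) >> 2, and the rest
 * of the first row/column gets (ref + 3 * dc + 2) >> 2. */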
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t addition = 0;
    uint32_t val0, val1, val2;
    v16i8 src = { 0 };
    v16u8 store;
    v16i8 zero = { 0 };
    v8u16 sum, vec0, vec1;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);
        SW(val0, tmp_dst);

        val0 = src_left[1];
        val1 = src_left[2];
        val2 = src_left[3];

        addition *= 3;

        ADD2(val0, addition, val1, addition, val0, val1);
        val2 += addition;

        val0 += 2;
        val1 += 2;
        val2 += 2;
        val0 >>= 2;
        val1 >>= 2;
        val2 >>= 2;

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
    }
}

static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}

static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}

static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}

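/* Planar (mode 0) prediction, following the HEVC formula for an NxN block:
 *   pred[x][y] = ((N - 1 - x) * left[y] + (x + 1) * top[N] +
 *                 (N - 1 - y) * top[x] + (y + 1) * left[N] + N)
 *                >> (log2(N) + 1)
 * The per-size variants below vectorize this with precomputed weight
 * vectors; the 32x32 case is assembled from four 16x16 quadrants. */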
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}

static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    dst += (16 * stride);
    src_left += 16;

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}

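/* Angular prediction, modes 18..34 ("upper": the main reference is the top
 * row). Per the HEVC spec, each row y selects idx = (y * angle) >> 5 and a
 * 5-bit fraction fact = (y * angle) & 31, then interpolates
 *   pred[x][y] = ((32 - fact) * ref[x + idx + 1]
 *                 + fact * ref[x + idx + 2] + 16) >> 5.
 * For negative angles the left reference is projected onto the extension
 * of the top row using the inverse-angle table. */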
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    if (angle < 0 && last < -1) {
        inv_angle_val = inv_angle[mode - 18];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}

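/* Angular prediction, modes 2..17 ("lower": the main reference is the left
 * column). The same (32 - fact) / fact interpolation is applied along the
 * left reference, with the top row projected via the inverse angle when
 * needed; results are computed column-wise and transposed on store. */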
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}

static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 1;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        top0 = LD_SB(ref);
        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst_org += (8 * stride);
        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);

        dst += 2;
    }
}

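/* 32x32 vertical prediction needs no edge filtering (HEVC applies it only
 * to blocks smaller than 32x32), so a plain copy of the top row suffices. */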
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

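/* Exported entry points. The _0/_1/_2/_3 suffixes select the transform
 * block size (4x4, 8x8, 16x16, 32x32), matching the pred_planar[] /
 * pred_angular[] function-pointer arrays in HEVCPredContext; the MIPS init
 * code is expected to wire them up roughly as
 *     c->pred_planar[0] = ff_hevc_intra_pred_planar_0_msa;
 * and so on for the other sizes (a sketch; see hevcpred_init_mips.c). */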
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}

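/* DC dispatch: log2 of the block size selects the kernel; c_idx doubles as
 * the flag that enables DC boundary smoothing for luma only. 32x32 blocks
 * are never smoothed, so that variant takes no flag. */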
1811 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1812  const uint8_t *src_left,
1813  ptrdiff_t stride, int log2, int c_idx)
1814 {
1815  switch (log2) {
1816  case 2:
1817  hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1818  break;
1819 
1820  case 3:
1821  hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1822  break;
1823 
1824  case 4:
1825  hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1826  break;
1827 
1828  case 5:
1829  hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1830  break;
1831  }
1832 }
1833 
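/* Angular dispatch for 4x4 blocks (modes 2..34): mode 10 is pure
 * horizontal and mode 26 pure vertical, each with a dedicated fast path;
 * the remaining modes >= 18 project mainly from the top reference row
 * ("upper") and modes < 18 mainly from the left column ("lower"),
 * stepping by the intra_pred_angle_up/low tables defined at the top of
 * this file.  The 8/16/32-wide variants below follow the same scheme. */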
1834 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1835  const uint8_t *src_top,
1836  const uint8_t *src_left,
1837  ptrdiff_t stride, int c_idx, int mode)
1838 {
1839  if (mode == 10) {
1840  hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841  } else if (mode == 26) {
1842  hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1843  } else if (mode >= 18) {
1844  hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1845  dst, stride, mode);
1846  } else {
1847  hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1848  dst, stride, mode);
1849  }
1850 }
1851 
1852 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1853  const uint8_t *src_top,
1854  const uint8_t *src_left,
1855  ptrdiff_t stride, int c_idx, int mode)
1856 {
1857  if (mode == 10) {
1858  hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859  } else if (mode == 26) {
1860  hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1861  } else if (mode >= 18) {
1862  hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1863  dst, stride, mode);
1864  } else {
1865  hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1866  dst, stride, mode);
1867  }
1868 }
1869 
1870 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1871  const uint8_t *src_top,
1872  const uint8_t *src_left,
1873  ptrdiff_t stride, int c_idx, int mode)
1874 {
1875  if (mode == 10) {
1876  hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877  } else if (mode == 26) {
1878  hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1879  } else if (mode >= 18) {
1880  hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1881  dst, stride, mode);
1882  } else {
1883  hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1884  dst, stride, mode);
1885  }
1886 }
1887 
1888 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1889  const uint8_t *src_top,
1890  const uint8_t *src_left,
1891  ptrdiff_t stride, int c_idx, int mode)
1892 {
1893  if (mode == 10) {
1894  hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1895  } else if (mode == 26) {
1896  intra_predict_vert_32x32_msa(src_top, dst, stride);
1897  } else if (mode >= 18) {
1898  hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1899  dst, stride, mode);
1900  } else {
1901  hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1902  dst, stride, mode);
1903  }
1904 }
1905 
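/* ff_intra_pred_8_16x16_msa() and ff_intra_pred_8_32x32_msa() below are
 * 8-bit, fixed-size expansions of the generic intra_pred() logic from
 * hevcpred_template.c: gather the neighbouring reference samples,
 * substitute unavailable ones, optionally smooth them, then invoke the
 * size-specific kernel through s->hpc. */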
1906 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1907 {
1908  v16u8 vec0;
1909  HEVCLocalContext *lc = s->HEVClc;
1910  int i;
1911  int hshift = s->ps.sps->hshift[c_idx];
1912  int vshift = s->ps.sps->vshift[c_idx];
1913  int size_in_luma_h = 16 << hshift;
1914  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1915  int size_in_luma_v = 16 << vshift;
1916  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1917  int x = x0 >> hshift;
1918  int y = y0 >> vshift;
1919  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1920  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1921 
1922  int cur_tb_addr =
1923  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1924 
1925  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1926  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1927 
1928  int min_pu_width = s->ps.sps->min_pu_width;
1929 
1930  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1931  lc->tu.intra_pred_mode;
1932  uint32_t a;
1933  uint8_t left_array[2 * 32 + 1];
1934  uint8_t filtered_left_array[2 * 32 + 1];
1935  uint8_t top_array[2 * 32 + 1];
1936  uint8_t filtered_top_array[2 * 32 + 1];
1937 
1938  uint8_t *left = left_array + 1;
1939  uint8_t *top = top_array + 1;
1940  uint8_t *filtered_left = filtered_left_array + 1;
1941  uint8_t *filtered_top = filtered_top_array + 1;
1942  int cand_bottom_left = lc->na.cand_bottom_left
1943  && cur_tb_addr >
1944  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1945  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1946  int cand_left = lc->na.cand_left;
1947  int cand_up_left = lc->na.cand_up_left;
1948  int cand_up = lc->na.cand_up;
1949  int cand_up_right = lc->na.cand_up_right
1950  && cur_tb_addr >
1951  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1952  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1953 
1954  int bottom_left_size =
1955  (((y0 + 2 * size_in_luma_v) >
1956  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1957  2 * size_in_luma_v)) -
1958  (y0 + size_in_luma_v)) >> vshift;
1959  int top_right_size =
1960  (((x0 + 2 * size_in_luma_h) >
1961  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1962  (x0 + size_in_luma_h)) >> hshift;
1963 
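 /* With constrained intra prediction, neighbours are usable only if they
  * belong to intra-coded PUs, so each cand_* flag is re-derived by
  * scanning the motion-vector field (tab_mvf) for PF_INTRA over the
  * covered minimum-PU positions; both reference arrays are pre-filled
  * with 128 (the 8-bit mid level) as the fallback. */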
1964  if (s->ps.pps->constrained_intra_pred_flag == 1) {
1965  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1966  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1967  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1968  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1969  if (!size_in_luma_pu_h)
1970  size_in_luma_pu_h++;
1971  if (cand_bottom_left == 1 && on_pu_edge_x) {
1972  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1973  int y_bottom_pu =
1974  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1975  int max =
1976  ((size_in_luma_pu_v) >
1977  (s->ps.sps->min_pu_height -
1978  y_bottom_pu) ? (s->ps.sps->min_pu_height -
1979  y_bottom_pu) : (size_in_luma_pu_v));
1980  cand_bottom_left = 0;
1981  for (i = 0; i < max; i += 2)
1982  cand_bottom_left |=
1983  ((s->ref->tab_mvf[(x_left_pu) +
1984  (y_bottom_pu +
1985  i) * min_pu_width]).pred_flag ==
1986  PF_INTRA);
1987  }
1988  if (cand_left == 1 && on_pu_edge_x) {
1989  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1990  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1991  int max =
1992  ((size_in_luma_pu_v) >
1993  (s->ps.sps->min_pu_height -
1994  y_left_pu) ? (s->ps.sps->min_pu_height -
1995  y_left_pu) : (size_in_luma_pu_v));
1996  cand_left = 0;
1997  for (i = 0; i < max; i += 2)
1998  cand_left |=
1999  ((s->ref->tab_mvf[(x_left_pu) +
2000  (y_left_pu +
2001  i) * min_pu_width]).pred_flag ==
2002  PF_INTRA);
2003  }
2004  if (cand_up_left == 1) {
2005  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2006  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2007  cand_up_left =
2008  (s->ref->tab_mvf[(x_left_pu) +
2009  (y_top_pu) * min_pu_width]).pred_flag ==
2010  PF_INTRA;
2011  }
2012  if (cand_up == 1 && on_pu_edge_y) {
2013  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2014  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2015  int max =
2016  ((size_in_luma_pu_h) >
2017  (s->ps.sps->min_pu_width -
2018  x_top_pu) ? (s->ps.sps->min_pu_width -
2019  x_top_pu) : (size_in_luma_pu_h));
2020  cand_up = 0;
2021  for (i = 0; i < max; i += 2)
2022  cand_up |=
2023  ((s->ref->tab_mvf[(x_top_pu + i) +
2024  (y_top_pu) *
2025  min_pu_width]).pred_flag == PF_INTRA);
2026  }
2027  if (cand_up_right == 1 && on_pu_edge_y) {
2028  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2029  int x_right_pu =
2030  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2031  int max =
2032  ((size_in_luma_pu_h) >
2033  (s->ps.sps->min_pu_width -
2034  x_right_pu) ? (s->ps.sps->min_pu_width -
2035  x_right_pu) : (size_in_luma_pu_h));
2036  cand_up_right = 0;
2037  for (i = 0; i < max; i += 2)
2038  cand_up_right |=
2039  ((s->ref->tab_mvf[(x_right_pu + i) +
2040  (y_top_pu) *
2041  min_pu_width]).pred_flag == PF_INTRA);
2042  }
2043 
2044  vec0 = (v16u8) __msa_ldi_b(128);
2045 
2046  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2047 
2048  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2049 
2050  top[-1] = 128;
2051  }
2052  if (cand_up_left) {
2053  left[-1] = src[(-1) + stride * (-1)];
2054  top[-1] = left[-1];
2055  }
2056  if (cand_up) {
2057  vec0 = LD_UB(src - stride);
2058  ST_UB(vec0, top);
2059  }
2060  if (cand_up_right) {
2061  vec0 = LD_UB(src - stride + 16);
2062  ST_UB(vec0, (top + 16));
2063 
2064  do {
2065  uint32_t pix =
2066  ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2067  0x01010101U);
2068  for (i = 0; i < (16 - top_right_size); i += 4)
2069  ((((union unaligned_32 *) (top + 16 + top_right_size +
2070  i))->l) = (pix));
2071  } while (0);
2072  }
2073  if (cand_left)
2074  for (i = 0; i < 16; i++)
2075  left[i] = src[(-1) + stride * (i)];
2076  if (cand_bottom_left) {
2077  for (i = 16; i < 16 + bottom_left_size; i++)
2078  left[i] = src[(-1) + stride * (i)];
2079  do {
2080  uint32_t pix =
2081  ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2082  0x01010101U);
2083  for (i = 0; i < (16 - bottom_left_size); i += 4)
2084  ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2085  i))->l) = (pix));
2086  } while (0);
2087  }
2088 
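 /* Reference sample substitution for constrained intra prediction:
  * samples that came from inter-coded PUs are overwritten by propagating
  * the nearest intra-coded sample along the borders, scanning the left
  * column and the top row in 4-byte (union unaligned_32) steps. */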
2089  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2090  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2091  || cand_up_right) {
2092  int size_max_x =
2093  x0 + ((2 * 16) << hshift) <
2094  s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2095  int size_max_y =
2096  y0 + ((2 * 16) << vshift) <
2097  s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2098  int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2099  if (!cand_up_right) {
2100  size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2101  16 : (s->ps.sps->width - x0) >> hshift;
2102  }
2103  if (!cand_bottom_left) {
2104  size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2105  16 : (s->ps.sps->height - y0) >> vshift;
2106  }
2107  if (cand_bottom_left || cand_left || cand_up_left) {
2108  while (j > -1
2109  &&
2110  !((s->ref->tab_mvf[(((x0 +
2111  ((-1) << hshift)) >> s->ps.sps->
2112  log2_min_pu_size)) + (((y0 +
2113  ((j) <<
2114  vshift))
2115  >> s->ps.sps->
2116  log2_min_pu_size))
2117  * min_pu_width]).pred_flag ==
2118  PF_INTRA))
2119  j--;
2120  if (!
2121  ((s->ref->tab_mvf[(((x0 +
2122  ((-1) << hshift)) >> s->ps.sps->
2123  log2_min_pu_size)) + (((y0 + ((j)
2124  <<
2125  vshift))
2126  >> s->ps.sps->
2127  log2_min_pu_size))
2128  * min_pu_width]).pred_flag == PF_INTRA)) {
2129  j = 0;
2130  while (j < size_max_x
2131  &&
2132  !((s->ref->tab_mvf[(((x0 +
2133  ((j) << hshift)) >> s->ps.sps->
2134  log2_min_pu_size)) + (((y0 +
2135  ((-1) <<
2136  vshift))
2137  >> s->
2138  ps.sps->
2139  log2_min_pu_size))
2140  * min_pu_width]).pred_flag ==
2141  PF_INTRA))
2142  j++;
2143  for (i = j; i > (j) - (j + 1); i--)
2144  if (!
2145  ((s->ref->tab_mvf[(((x0 +
2146  ((i -
2147  1) << hshift)) >> s->ps.sps->
2148  log2_min_pu_size)) + (((y0 +
2149  ((-1) <<
2150  vshift))
2151  >> s->
2152  ps.sps->
2153  log2_min_pu_size))
2154  * min_pu_width]).pred_flag ==
2155  PF_INTRA))
2156  top[i - 1] = top[i];
2157  left[-1] = top[-1];
2158  }
2159  } else {
2160  j = 0;
2161  while (j < size_max_x
2162  &&
2163  !((s->ref->tab_mvf[(((x0 +
2164  ((j) << hshift)) >> s->ps.sps->
2165  log2_min_pu_size)) + (((y0 + ((-1)
2166  <<
2167  vshift))
2168  >> s->ps.sps->
2169  log2_min_pu_size))
2170  * min_pu_width]).pred_flag ==
2171  PF_INTRA))
2172  j++;
2173  if (j > 0)
2174  if (x0 > 0) {
2175  for (i = j; i > (j) - (j + 1); i--)
2176  if (!
2177  ((s->ref->tab_mvf[(((x0 +
2178  ((i -
2179  1) << hshift)) >>
2180  s->ps.sps->log2_min_pu_size))
2181  + (((y0 + ((-1)
2182  << vshift))
2183  >>
2184  s->ps.sps->log2_min_pu_size))
2185  *
2186  min_pu_width]).pred_flag ==
2187  PF_INTRA))
2188  top[i - 1] = top[i];
2189  } else {
2190  for (i = j; i > (j) - (j); i--)
2191  if (!
2192  ((s->ref->tab_mvf[(((x0 +
2193  ((i -
2194  1) << hshift)) >>
2195  s->ps.sps->log2_min_pu_size))
2196  + (((y0 + ((-1)
2197  << vshift))
2198  >>
2199  s->ps.sps->log2_min_pu_size))
2200  *
2201  min_pu_width]).pred_flag ==
2202  PF_INTRA))
2203  top[i - 1] = top[i];
2204  top[-1] = top[0];
2205  }
2206  left[-1] = top[-1];
2207  }
2208  left[-1] = top[-1];
2209  if (cand_bottom_left || cand_left) {
2210  a = ((left[-1]) * 0x01010101U);
2211  for (i = 0; i < (0) + (size_max_y); i += 4)
2212  if (!
2213  ((s->ref->tab_mvf[(((x0 +
2214  ((-1) << hshift)) >> s->ps.sps->
2215  log2_min_pu_size)) + (((y0 +
2216  ((i) <<
2217  vshift))
2218  >> s->ps.sps->
2219  log2_min_pu_size))
2220  * min_pu_width]).pred_flag ==
2221  PF_INTRA))
2222  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2223  else
2224  a = ((left[i + 3]) * 0x01010101U);
2225  }
2226  if (!cand_left) {
2227  vec0 = (v16u8) __msa_fill_b(left[-1]);
2228 
2229  ST_UB(vec0, left);
2230  }
2231  if (!cand_bottom_left) {
2232 
2233  vec0 = (v16u8) __msa_fill_b(left[15]);
2234 
2235  ST_UB(vec0, (left + 16));
2236  }
2237  if (x0 != 0 && y0 != 0) {
2238  a = ((left[size_max_y - 1]) * 0x01010101U);
2239  for (i = (size_max_y - 1);
2240  i > (size_max_y - 1) - (size_max_y); i -= 4)
2241  if (!
2242  ((s->ref->tab_mvf[(((x0 +
2243  ((-1) << hshift)) >> s->ps.sps->
2244  log2_min_pu_size)) + (((y0 +
2245  ((i -
2246  3) <<
2247  vshift))
2248  >> s->ps.sps->
2249  log2_min_pu_size))
2250  * min_pu_width]).pred_flag ==
2251  PF_INTRA))
2252  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2253  else
2254  a = ((left[i - 3]) * 0x01010101U);
2255  if (!
2256  ((s->ref->tab_mvf[(((x0 +
2257  ((-1) << hshift)) >> s->ps.sps->
2258  log2_min_pu_size)) + (((y0 + ((-1)
2259  <<
2260  vshift))
2261  >> s->ps.sps->
2262  log2_min_pu_size))
2263  * min_pu_width]).pred_flag == PF_INTRA))
2264  left[-1] = left[0];
2265  } else if (x0 == 0) {
2266  do {
2267  uint32_t pix = ((0) * 0x01010101U);
2268  for (i = 0; i < (size_max_y); i += 4)
2269  ((((union unaligned_32 *) (left + i))->l) = (pix));
2270  } while (0);
2271  } else {
2272  a = ((left[size_max_y - 1]) * 0x01010101U);
2273  for (i = (size_max_y - 1);
2274  i > (size_max_y - 1) - (size_max_y); i -= 4)
2275  if (!
2276  ((s->ref->tab_mvf[(((x0 +
2277  ((-1) << hshift)) >> s->ps.sps->
2278  log2_min_pu_size)) + (((y0 +
2279  ((i -
2280  3) <<
2281  vshift))
2282  >> s->ps.sps->
2283  log2_min_pu_size))
2284  * min_pu_width]).pred_flag ==
2285  PF_INTRA))
2286  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2287  else
2288  a = ((left[i - 3]) * 0x01010101U);
2289  }
2290  top[-1] = left[-1];
2291  if (y0 != 0) {
2292  a = ((left[-1]) * 0x01010101U);
2293  for (i = 0; i < (0) + (size_max_x); i += 4)
2294  if (!
2295  ((s->ref->tab_mvf[(((x0 +
2296  ((i) << hshift)) >> s->ps.sps->
2297  log2_min_pu_size)) + (((y0 + ((-1)
2298  <<
2299  vshift))
2300  >> s->ps.sps->
2301  log2_min_pu_size))
2302  * min_pu_width]).pred_flag ==
2303  PF_INTRA))
2304  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2305  else
2306  a = ((top[i + 3]) * 0x01010101U);
2307  }
2308  }
2309  }
2310 
2311  if (!cand_bottom_left) {
2312  if (cand_left) {
2313  vec0 = (v16u8) __msa_fill_b(left[15]);
2314 
2315  ST_UB(vec0, (left + 16));
2316 
2317  } else if (cand_up_left) {
2318  vec0 = (v16u8) __msa_fill_b(left[-1]);
2319 
2320  ST_UB2(vec0, vec0, left, 16);
2321 
2322  cand_left = 1;
2323  } else if (cand_up) {
2324  left[-1] = top[0];
2325 
2326  vec0 = (v16u8) __msa_fill_b(left[-1]);
2327 
2328  ST_UB2(vec0, vec0, left, 16);
2329 
2330  cand_up_left = 1;
2331  cand_left = 1;
2332  } else if (cand_up_right) {
2333  vec0 = (v16u8) __msa_fill_b(top[16]);
2334 
2335  ST_UB(vec0, top);
2336 
2337  left[-1] = top[16];
2338 
2339  ST_UB2(vec0, vec0, left, 16);
2340 
2341  cand_up = 1;
2342  cand_up_left = 1;
2343  cand_left = 1;
2344  } else {
2345  left[-1] = 128;
2346  vec0 = (v16u8) __msa_ldi_b(128);
2347 
2348  ST_UB2(vec0, vec0, top, 16);
2349  ST_UB2(vec0, vec0, left, 16);
2350  }
2351  }
2352 
2353  if (!cand_left) {
2354  vec0 = (v16u8) __msa_fill_b(left[16]);
2355  ST_UB(vec0, left);
2356  }
2357  if (!cand_up_left) {
2358  left[-1] = left[0];
2359  }
2360  if (!cand_up) {
2361  vec0 = (v16u8) __msa_fill_b(left[-1]);
2362  ST_UB(vec0, top);
2363  }
2364  if (!cand_up_right) {
2365  vec0 = (v16u8) __msa_fill_b(top[15]);
2366  ST_UB(vec0, (top + 16));
2367  }
2368 
2369  top[-1] = left[-1];
2370 
2371 
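 /* Mode-dependent reference smoothing (luma, or chroma in 4:4:4): when
  * the mode is far enough from both pure horizontal (10) and pure
  * vertical (26) -- the threshold table {7, 1, 0} is indexed by
  * log2(size) - 3, so 1 here -- the references get the low-pass filter
  *
  *     filtered[i] = (p[i - 1] + 2 * p[i] + p[i + 1] + 2) >> 2;
  */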
2372  if (!s->ps.sps->intra_smoothing_disabled_flag
2373  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2374  if (mode != INTRA_DC && 16 != 4) {
2375  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376  int min_dist_vert_hor =
2377  (((((int) (mode - 26U)) >=
2378  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2379  ((((int) (mode - 10U)) >=
2380  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381  ? ((((int) (mode - 10U)) >=
2382  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2383  : ((((int) (mode - 26U)) >=
2384  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2385  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386  filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387  filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388  for (i = 2 * 16 - 2; i >= 0; i--)
2389  filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390  left[i - 1] + 2) >> 2;
2391  filtered_top[-1] =
2392  filtered_left[-1] =
2393  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394  for (i = 2 * 16 - 2; i >= 0; i--)
2395  filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396  top[i - 1] + 2) >> 2;
2397  left = filtered_left;
2398  top = filtered_top;
2399  }
2400  }
2401  }
2402 
2403  switch (mode) {
2404  case INTRA_PLANAR:
2405  s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2406  (uint8_t *) left, stride);
2407  break;
2408  case INTRA_DC:
2409  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2410  (uint8_t *) left, stride, 4, c_idx);
2411  break;
2412  default:
2413  s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2414  (uint8_t *) left, stride, c_idx, mode);
2415  break;
2416  }
2417 }
2418 
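/* Same reference construction as the 16x16 variant, specialized for
 * 32x32 blocks; this is also the only size eligible for HEVC's strong
 * (bilinear) intra smoothing, vectorized below with the mul_val0 and
 * mul_val1 weight vectors. */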
2419 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2420 {
2421  v16u8 vec0, vec1;
2422  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423  v8i16 res0, res1, res2, res3;
2424  v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2426  HEVCLocalContext *lc = s->HEVClc;
2427  int i;
2428  int hshift = s->ps.sps->hshift[c_idx];
2429  int vshift = s->ps.sps->vshift[c_idx];
2430  int size_in_luma_h = 32 << hshift;
2431  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2432  int size_in_luma_v = 32 << vshift;
2433  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2434  int x = x0 >> hshift;
2435  int y = y0 >> vshift;
2436  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2437  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2438 
2439  int cur_tb_addr =
2440  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2441 
2442  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2443  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2444 
2445  int min_pu_width = s->ps.sps->min_pu_width;
2446 
2447  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2448  lc->tu.intra_pred_mode;
2449  uint32_t a;
2450  uint8_t left_array[2 * 32 + 1];
2451  uint8_t filtered_left_array[2 * 32 + 1];
2452  uint8_t top_array[2 * 32 + 1];
2453  uint8_t filtered_top_array[2 * 32 + 1];
2454 
2455  uint8_t *left = left_array + 1;
2456  uint8_t *top = top_array + 1;
2457  uint8_t *filtered_left = filtered_left_array + 1;
2458  uint8_t *filtered_top = filtered_top_array + 1;
2459  int cand_bottom_left = lc->na.cand_bottom_left
2460  && cur_tb_addr >
2461  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2462  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2463  int cand_left = lc->na.cand_left;
2464  int cand_up_left = lc->na.cand_up_left;
2465  int cand_up = lc->na.cand_up;
2466  int cand_up_right = lc->na.cand_up_right
2467  && cur_tb_addr >
2468  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2469  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2470 
2471  int bottom_left_size =
2472  (((y0 + 2 * size_in_luma_v) >
2473  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2474  2 * size_in_luma_v)) -
2475  (y0 + size_in_luma_v)) >> vshift;
2476  int top_right_size =
2477  (((x0 + 2 * size_in_luma_h) >
2478  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2479  (x0 + size_in_luma_h)) >> hshift;
2480 
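 /* Availability scan identical to the 16x16 path, with size 32: each
  * cand_* flag survives only if the corresponding neighbouring minimum
  * PUs were intra coded. */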
2481  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2482  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2483  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2484  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2485  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2486  if (!size_in_luma_pu_h)
2487  size_in_luma_pu_h++;
2488  if (cand_bottom_left == 1 && on_pu_edge_x) {
2489  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2490  int y_bottom_pu =
2491  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2492  int max =
2493  ((size_in_luma_pu_v) >
2494  (s->ps.sps->min_pu_height -
2495  y_bottom_pu) ? (s->ps.sps->min_pu_height -
2496  y_bottom_pu) : (size_in_luma_pu_v));
2497  cand_bottom_left = 0;
2498  for (i = 0; i < max; i += 2)
2499  cand_bottom_left |=
2500  ((s->ref->tab_mvf[(x_left_pu) +
2501  (y_bottom_pu +
2502  i) * min_pu_width]).pred_flag ==
2503  PF_INTRA);
2504  }
2505  if (cand_left == 1 && on_pu_edge_x) {
2506  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2507  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2508  int max =
2509  ((size_in_luma_pu_v) >
2510  (s->ps.sps->min_pu_height -
2511  y_left_pu) ? (s->ps.sps->min_pu_height -
2512  y_left_pu) : (size_in_luma_pu_v));
2513  cand_left = 0;
2514  for (i = 0; i < max; i += 2)
2515  cand_left |=
2516  ((s->ref->tab_mvf[(x_left_pu) +
2517  (y_left_pu +
2518  i) * min_pu_width]).pred_flag ==
2519  PF_INTRA);
2520  }
2521  if (cand_up_left == 1) {
2522  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2523  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2524  cand_up_left =
2525  (s->ref->tab_mvf[(x_left_pu) +
2526  (y_top_pu) * min_pu_width]).pred_flag ==
2527  PF_INTRA;
2528  }
2529  if (cand_up == 1 && on_pu_edge_y) {
2530  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2531  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2532  int max =
2533  ((size_in_luma_pu_h) >
2534  (s->ps.sps->min_pu_width -
2535  x_top_pu) ? (s->ps.sps->min_pu_width -
2536  x_top_pu) : (size_in_luma_pu_h));
2537  cand_up = 0;
2538  for (i = 0; i < max; i += 2)
2539  cand_up |=
2540  ((s->ref->tab_mvf[(x_top_pu + i) +
2541  (y_top_pu) *
2542  min_pu_width]).pred_flag == PF_INTRA);
2543  }
2544  if (cand_up_right == 1 && on_pu_edge_y) {
2545  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2546  int x_right_pu =
2547  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2548  int max =
2549  ((size_in_luma_pu_h) >
2550  (s->ps.sps->min_pu_width -
2551  x_right_pu) ? (s->ps.sps->min_pu_width -
2552  x_right_pu) : (size_in_luma_pu_h));
2553  cand_up_right = 0;
2554  for (i = 0; i < max; i += 2)
2555  cand_up_right |=
2556  ((s->ref->tab_mvf[(x_right_pu + i) +
2557  (y_top_pu) *
2558  min_pu_width]).pred_flag == PF_INTRA);
2559  }
2560  vec0 = (v16u8) __msa_ldi_b(128);
2561 
2562  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2563  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2564 
2565  top[-1] = 128;
2566  }
2567  if (cand_up_left) {
2568  left[-1] = src[(-1) + stride * (-1)];
2569  top[-1] = left[-1];
2570  }
2571  if (cand_up) {
2572  LD_UB2(src - stride, 16, vec0, vec1);
2573  ST_UB2(vec0, vec1, top, 16);
2574  }
2575 
2576  if (cand_up_right) {
2577  LD_UB2(src - stride + 32, 16, vec0, vec1);
2578  ST_UB2(vec0, vec1, (top + 32), 16);
2579  do {
2580  uint32_t pix =
2581  ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2582  0x01010101U);
2583  for (i = 0; i < (32 - top_right_size); i += 4)
2584  ((((union unaligned_32 *) (top + 32 + top_right_size +
2585  i))->l) = (pix));
2586  } while (0);
2587  }
2588  if (cand_left)
2589  for (i = 0; i < 32; i++)
2590  left[i] = src[(-1) + stride * (i)];
2591  if (cand_bottom_left) {
2592  for (i = 32; i < 32 + bottom_left_size; i++)
2593  left[i] = src[(-1) + stride * (i)];
2594  do {
2595  uint32_t pix =
2596  ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2597  0x01010101U);
2598  for (i = 0; i < (32 - bottom_left_size); i += 4)
2599  ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2600  i))->l) = (pix));
2601  } while (0);
2602  }
2603 
2604  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2605  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2606  || cand_up_right) {
2607  int size_max_x =
2608  x0 + ((2 * 32) << hshift) <
2609  s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2610  int size_max_y =
2611  y0 + ((2 * 32) << vshift) <
2612  s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2613  int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2614  if (!cand_up_right) {
2615  size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2616  32 : (s->ps.sps->width - x0) >> hshift;
2617  }
2618  if (!cand_bottom_left) {
2619  size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2620  32 : (s->ps.sps->height - y0) >> vshift;
2621  }
2622  if (cand_bottom_left || cand_left || cand_up_left) {
2623  while (j > -1
2624  &&
2625  !((s->ref->tab_mvf[(((x0 +
2626  ((-1) << hshift)) >> s->ps.sps->
2627  log2_min_pu_size)) + (((y0 +
2628  ((j) <<
2629  vshift))
2630  >> s->ps.sps->
2631  log2_min_pu_size))
2632  * min_pu_width]).pred_flag ==
2633  PF_INTRA))
2634  j--;
2635  if (!
2636  ((s->ref->tab_mvf[(((x0 +
2637  ((-1) << hshift)) >> s->ps.sps->
2638  log2_min_pu_size)) + (((y0 + ((j)
2639  <<
2640  vshift))
2641  >> s->ps.sps->
2642  log2_min_pu_size))
2643  * min_pu_width]).pred_flag == PF_INTRA)) {
2644  j = 0;
2645  while (j < size_max_x
2646  &&
2647  !((s->ref->tab_mvf[(((x0 +
2648  ((j) << hshift)) >> s->ps.sps->
2649  log2_min_pu_size)) + (((y0 +
2650  ((-1) <<
2651  vshift))
2652  >> s->
2653  ps.sps->
2654  log2_min_pu_size))
2655  * min_pu_width]).pred_flag ==
2656  PF_INTRA))
2657  j++;
2658  for (i = j; i > (j) - (j + 1); i--)
2659  if (!
2660  ((s->ref->tab_mvf[(((x0 +
2661  ((i -
2662  1) << hshift)) >> s->ps.sps->
2663  log2_min_pu_size)) + (((y0 +
2664  ((-1) <<
2665  vshift))
2666  >> s->
2667  ps.sps->
2668  log2_min_pu_size))
2669  * min_pu_width]).pred_flag ==
2670  PF_INTRA))
2671  top[i - 1] = top[i];
2672  left[-1] = top[-1];
2673  }
2674  } else {
2675  j = 0;
2676  while (j < size_max_x
2677  &&
2678  !((s->ref->tab_mvf[(((x0 +
2679  ((j) << hshift)) >> s->ps.sps->
2680  log2_min_pu_size)) + (((y0 + ((-1)
2681  <<
2682  vshift))
2683  >> s->ps.sps->
2684  log2_min_pu_size))
2685  * min_pu_width]).pred_flag ==
2686  PF_INTRA))
2687  j++;
2688  if (j > 0)
2689  if (x0 > 0) {
2690  for (i = j; i > (j) - (j + 1); i--)
2691  if (!
2692  ((s->ref->tab_mvf[(((x0 +
2693  ((i -
2694  1) << hshift)) >>
2695  s->ps.sps->log2_min_pu_size))
2696  + (((y0 + ((-1)
2697  << vshift))
2698  >>
2699  s->ps.sps->log2_min_pu_size))
2700  *
2701  min_pu_width]).pred_flag ==
2702  PF_INTRA))
2703  top[i - 1] = top[i];
2704  } else {
2705  for (i = j; i > (j) - (j); i--)
2706  if (!
2707  ((s->ref->tab_mvf[(((x0 +
2708  ((i -
2709  1) << hshift)) >>
2710  s->ps.sps->log2_min_pu_size))
2711  + (((y0 + ((-1)
2712  << vshift))
2713  >>
2714  s->ps.sps->log2_min_pu_size))
2715  *
2716  min_pu_width]).pred_flag ==
2717  PF_INTRA))
2718  top[i - 1] = top[i];
2719  top[-1] = top[0];
2720  }
2721  left[-1] = top[-1];
2722  }
2723  left[-1] = top[-1];
2724  if (cand_bottom_left || cand_left) {
2725  a = ((left[-1]) * 0x01010101U);
2726  for (i = 0; i < (0) + (size_max_y); i += 4)
2727  if (!
2728  ((s->ref->tab_mvf[(((x0 +
2729  ((-1) << hshift)) >> s->ps.sps->
2730  log2_min_pu_size)) + (((y0 +
2731  ((i) <<
2732  vshift))
2733  >> s->ps.sps->
2734  log2_min_pu_size))
2735  * min_pu_width]).pred_flag ==
2736  PF_INTRA))
2737  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2738  else
2739  a = ((left[i + 3]) * 0x01010101U);
2740  }
2741  if (!cand_left) {
2742  vec0 = (v16u8) __msa_fill_b(left[-1]);
2743 
2744  ST_UB2(vec0, vec0, left, 16);
2745  }
2746  if (!cand_bottom_left) {
2747  vec0 = (v16u8) __msa_fill_b(left[31]);
2748 
2749  ST_UB2(vec0, vec0, (left + 32), 16);
2750  }
2751  if (x0 != 0 && y0 != 0) {
2752  a = ((left[size_max_y - 1]) * 0x01010101U);
2753  for (i = (size_max_y - 1);
2754  i > (size_max_y - 1) - (size_max_y); i -= 4)
2755  if (!
2756  ((s->ref->tab_mvf[(((x0 +
2757  ((-1) << hshift)) >> s->ps.sps->
2758  log2_min_pu_size)) + (((y0 +
2759  ((i -
2760  3) <<
2761  vshift))
2762  >> s->ps.sps->
2763  log2_min_pu_size))
2764  * min_pu_width]).pred_flag ==
2765  PF_INTRA))
2766  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2767  else
2768  a = ((left[i - 3]) * 0x01010101U);
2769  if (!
2770  ((s->ref->tab_mvf[(((x0 +
2771  ((-1) << hshift)) >> s->ps.sps->
2772  log2_min_pu_size)) + (((y0 + ((-1)
2773  <<
2774  vshift))
2775  >> s->ps.sps->
2776  log2_min_pu_size))
2777  * min_pu_width]).pred_flag == PF_INTRA))
2778  left[-1] = left[0];
2779  } else if (x0 == 0) {
2780  do {
2781  uint32_t pix = ((0) * 0x01010101U);
2782  for (i = 0; i < (size_max_y); i += 4)
2783  ((((union unaligned_32 *) (left + i))->l) = (pix));
2784  } while (0);
2785  } else {
2786  a = ((left[size_max_y - 1]) * 0x01010101U);
2787  for (i = (size_max_y - 1);
2788  i > (size_max_y - 1) - (size_max_y); i -= 4)
2789  if (!
2790  ((s->ref->tab_mvf[(((x0 +
2791  ((-1) << hshift)) >> s->ps.sps->
2792  log2_min_pu_size)) + (((y0 +
2793  ((i -
2794  3) <<
2795  vshift))
2796  >> s->ps.sps->
2797  log2_min_pu_size))
2798  * min_pu_width]).pred_flag ==
2799  PF_INTRA))
2800  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2801  else
2802  a = ((left[i - 3]) * 0x01010101U);
2803  }
2804  top[-1] = left[-1];
2805  if (y0 != 0) {
2806  a = ((left[-1]) * 0x01010101U);
2807  for (i = 0; i < (0) + (size_max_x); i += 4)
2808  if (!
2809  ((s->ref->tab_mvf[(((x0 +
2810  ((i) << hshift)) >> s->ps.sps->
2811  log2_min_pu_size)) + (((y0 + ((-1)
2812  <<
2813  vshift))
2814  >> s->ps.sps->
2815  log2_min_pu_size))
2816  * min_pu_width]).pred_flag ==
2817  PF_INTRA))
2818  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2819  else
2820  a = ((top[i + 3]) * 0x01010101U);
2821  }
2822  }
2823  }
2824 
2825  if (!cand_bottom_left) {
2826  if (cand_left) {
2827  vec0 = (v16u8) __msa_fill_b(left[31]);
2828 
2829  ST_UB2(vec0, vec0, (left + 32), 16);
2830  } else if (cand_up_left) {
2831  vec0 = (v16u8) __msa_fill_b(left[-1]);
2832 
2833  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2834 
2835  cand_left = 1;
2836  } else if (cand_up) {
2837  left[-1] = top[0];
2838 
2839  vec0 = (v16u8) __msa_fill_b(left[-1]);
2840 
2841  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2842 
2843  cand_up_left = 1;
2844  cand_left = 1;
2845  } else if (cand_up_right) {
2846  vec0 = (v16u8) __msa_fill_b(top[32]);
2847 
2848  ST_UB2(vec0, vec0, top, 16);
2849 
2850  left[-1] = top[32];
2851 
2852  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2853 
2854  cand_up = 1;
2855  cand_up_left = 1;
2856  cand_left = 1;
2857  } else {
2858  left[-1] = 128;
2859 
2860  vec0 = (v16u8) __msa_ldi_b(128);
2861 
2862  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2863  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2864  }
2865  }
2866 
2867  if (!cand_left) {
2868  vec0 = (v16u8) __msa_fill_b(left[32]);
2869 
2870  ST_UB2(vec0, vec0, left, 16);
2871  }
2872  if (!cand_up_left) {
2873  left[-1] = left[0];
2874  }
2875  if (!cand_up) {
2876  vec0 = (v16u8) __msa_fill_b(left[-1]);
2877 
2878  ST_UB2(vec0, vec0, top, 16);
2879  }
2880  if (!cand_up_right) {
2881  vec0 = (v16u8) __msa_fill_b(top[31]);
2882 
2883  ST_UB2(vec0, vec0, (top + 32), 16);
2884  }
2885 
2886  top[-1] = left[-1];
2887 
2888 
2889  if (!s->ps.sps->intra_smoothing_disabled_flag
2890  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2891  if (mode != INTRA_DC && 32 != 4) {
2892  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2893  int min_dist_vert_hor =
2894  (((((int) (mode - 26U)) >=
2895  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2896  ((((int) (mode - 10U)) >=
2897  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898  ? ((((int) (mode - 10U)) >=
2899  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2900  : ((((int) (mode - 26U)) >=
2901  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2902  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2903  int threshold = 1 << (8 - 5);
2904  if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2905  && c_idx == 0
2906  && ((top[-1] + top[63] - 2 * top[31]) >=
2907  0 ? (top[-1] + top[63] -
2908  2 * top[31]) : (-(top[-1] + top[63] -
2909  2 * top[31]))) < threshold
2910  && ((left[-1] + left[63] - 2 * left[31]) >=
2911  0 ? (left[-1] + left[63] -
2912  2 * left[31]) : (-(left[-1] + left[63] -
2913  2 * left[31]))) < threshold) {
2914 
2915 
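 /* Strong intra smoothing: both edges are nearly linear
  * (|p[-1] + p[63] - 2 * p[31]| < 1 << (8 - 5)), so the filtered
  * references become a bilinear ramp between the corner samples:
  *
  *     filtered_top[i] =
  *         ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
  *
  * The scalar loop below fills the top edge; the MSA code then redoes
  * both edges 16 samples at a time with mul_val0 = {63..56} and
  * mul_val1 = {1..8}. */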
2916  filtered_top[-1] = top[-1];
2917  filtered_top[63] = top[63];
2918 
2919 
2920  for (i = 0; i < 63; i++) {
2921  filtered_top[i] =
2922  ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2923  }
2924 
2925  tmp0 = __msa_fill_h(top[-1]);
2926  tmp1 = __msa_fill_h(top[63]);
2927 
2928  tmp2 = mul_val0 - 8;
2929  tmp3 = mul_val0 - 16;
2930  tmp4 = mul_val0 - 24;
2931  tmp5 = mul_val1 + 8;
2932  tmp6 = mul_val1 + 16;
2933  tmp7 = mul_val1 + 24;
2934 
2935  res0 = mul_val0 * tmp0;
2936  res1 = tmp2 * tmp0;
2937  res2 = tmp3 * tmp0;
2938  res3 = tmp4 * tmp0;
2939  res0 += mul_val1 * tmp1;
2940  res1 += tmp5 * tmp1;
2941  res2 += tmp6 * tmp1;
2942  res3 += tmp7 * tmp1;
2943 
2944  res0 = __msa_srari_h(res0, 6);
2945  res1 = __msa_srari_h(res1, 6);
2946  res2 = __msa_srari_h(res2, 6);
2947  res3 = __msa_srari_h(res3, 6);
2948 
2949  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2950  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2951 
2952  ST_UB2(vec0, vec1, filtered_top, 16);
2953 
2954  res0 = mul_val0 - 32;
2955  tmp2 = mul_val0 - 40;
2956  tmp3 = mul_val0 - 48;
2957  tmp4 = mul_val0 - 56;
2958  res3 = mul_val1 + 32;
2959  tmp5 = mul_val1 + 40;
2960  tmp6 = mul_val1 + 48;
2961  tmp7 = mul_val1 + 56;
2962 
2963  res0 = res0 * tmp0;
2964  res1 = tmp2 * tmp0;
2965  res2 = tmp3 * tmp0;
2966  res0 += res3 * tmp1;
2967  res3 = tmp4 * tmp0;
2968  res1 += tmp5 * tmp1;
2969  res2 += tmp6 * tmp1;
2970  res3 += tmp7 * tmp1;
2971 
2972  res0 = __msa_srari_h(res0, 6);
2973  res1 = __msa_srari_h(res1, 6);
2974  res2 = __msa_srari_h(res2, 6);
2975  res3 = __msa_srari_h(res3, 6);
2976 
2977  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2978  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2979 
2980  ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2981 
2982  filtered_top[63] = top[63];
2983 
2984  tmp0 = __msa_fill_h(left[-1]);
2985  tmp1 = __msa_fill_h(left[63]);
2986 
2987  tmp2 = mul_val0 - 8;
2988  tmp3 = mul_val0 - 16;
2989  tmp4 = mul_val0 - 24;
2990  tmp5 = mul_val1 + 8;
2991  tmp6 = mul_val1 + 16;
2992  tmp7 = mul_val1 + 24;
2993 
2994  res0 = mul_val0 * tmp0;
2995  res1 = tmp2 * tmp0;
2996  res2 = tmp3 * tmp0;
2997  res3 = tmp4 * tmp0;
2998  res0 += mul_val1 * tmp1;
2999  res1 += tmp5 * tmp1;
3000  res2 += tmp6 * tmp1;
3001  res3 += tmp7 * tmp1;
3002 
3003  res0 = __msa_srari_h(res0, 6);
3004  res1 = __msa_srari_h(res1, 6);
3005  res2 = __msa_srari_h(res2, 6);
3006  res3 = __msa_srari_h(res3, 6);
3007 
3008  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3009  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3010 
3011  ST_UB2(vec0, vec1, left, 16);
3012 
3013  res0 = mul_val0 - 32;
3014  tmp2 = mul_val0 - 40;
3015  tmp3 = mul_val0 - 48;
3016  tmp4 = mul_val0 - 56;
3017  res3 = mul_val1 + 32;
3018  tmp5 = mul_val1 + 40;
3019  tmp6 = mul_val1 + 48;
3020  tmp7 = mul_val1 + 56;
3021 
3022  res0 = res0 * tmp0;
3023  res1 = tmp2 * tmp0;
3024  res2 = tmp3 * tmp0;
3025  res0 += res3 * tmp1;
3026  res3 = tmp4 * tmp0;
3027  res1 += tmp5 * tmp1;
3028  res2 += tmp6 * tmp1;
3029  res3 += tmp7 * tmp1;
3030 
3031  res0 = __msa_srari_h(res0, 6);
3032  res1 = __msa_srari_h(res1, 6);
3033  res2 = __msa_srari_h(res2, 6);
3034  res3 = __msa_srari_h(res3, 6);
3035 
3036  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3037  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3038 
3039  ST_UB2(vec0, vec1, (left + 32), 16);
3040 
3041  left[63] = tmp1[0];
3042 
3043  top = filtered_top;
3044  } else {
3045  filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3046  filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3047  for (i = 2 * 32 - 2; i >= 0; i--)
3048  filtered_left[i] = (left[i + 1] + 2 * left[i] +
3049  left[i - 1] + 2) >> 2;
3050  filtered_top[-1] =
3051  filtered_left[-1] =
3052  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3053  for (i = 2 * 32 - 2; i >= 0; i--)
3054  filtered_top[i] = (top[i + 1] + 2 * top[i] +
3055  top[i - 1] + 2) >> 2;
3056  left = filtered_left;
3057  top = filtered_top;
3058  }
3059  }
3060  }
3061  }
3062 
3063  switch (mode) {
3064  case INTRA_PLANAR:
3065  s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3066  (uint8_t *) left, stride);
3067  break;
3068  case INTRA_DC:
3069  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3070  (uint8_t *) left, stride, 5, c_idx);
3071  break;
3072  default:
3073  s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3074  (uint8_t *) left, stride, c_idx, mode);
3075  break;
3076  }
3077 }