/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcpred_mips.h"

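/*
 * Prediction angles for the HEVC angular intra modes, in 1/32-sample
 * units: intra_pred_angle_up[] covers the mostly-vertical modes 18..34,
 * intra_pred_angle_low[] the mostly-horizontal modes 2..17.
 */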
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};

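/*
 * Produces two rows of a 16-wide planar-predicted block. HEVC planar
 * prediction is the rounded average of a horizontal and a vertical
 * linear interpolation:
 *   pred[x][y] = ((N-1-x) * left[y] + (x+1) * top[N] +
 *                 (N-1-y) * top[x] + (y+1) * left[N] + N) >> (log2(N)+1)
 * src0_r/src0_l hold top[x], vec0/vec1 the splatted left[y] of the two
 * rows, tmp0 = top[N], tmp1 = left[N]; the mul_val_h* vectors carry the
 * per-column weights and mul_val_b0/b1 the per-row weights.
 */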
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                             \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                     \
                                                                              \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                   \
                                                                              \
    res0_m += mul_val_h1 * tmp0;                                              \
    res1_m += mul_val_h3 * tmp0;                                              \
    res2_m += mul_val_h1 * tmp0;                                              \
    res3_m += mul_val_h3 * tmp0;                                              \
                                                                              \
    res0_m += mul_val_b0 * src0_r;                                            \
    res1_m += mul_val_b0 * src0_l;                                            \
    res2_m += (mul_val_b0 - 1) * src0_r;                                      \
    res3_m += (mul_val_b0 - 1) * src0_l;                                      \
                                                                              \
    res0_m += mul_val_b1 * tmp1;                                              \
    res1_m += mul_val_b1 * tmp1;                                              \
    res2_m += (mul_val_b1 + 1) * tmp1;                                        \
    res3_m += (mul_val_b1 + 1) * tmp1;                                        \
                                                                              \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                       \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                  \
}

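/*
 * Vertical prediction (mode 26): every row is a copy of the reference
 * row above the block. For luma blocks (flag == 0) the left edge is
 * additionally smoothed:
 *   dst[0][y] = clip(top[0] + ((left[y] - left[-1]) >> 1))
 */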
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint32_t col;
    uint32_t src_data;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

    if (0 == flag) {
        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        vec2 = CLIP_SH_0_255(vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
        }
    }
}

static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        vec2 = CLIP_SH_0_255(vec2);

        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}

static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}

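/*
 * Horizontal prediction (mode 10): every column is a copy of the
 * reference column left of the block. For luma blocks (flag == 0) the
 * top edge is smoothed:
 *   dst[x][0] = clip(left[0] + ((top[x] - top[-1]) >> 1))
 */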
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint32_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

    if (0 == flag) {
        val0 = LW(src_top);
        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        src0_r = CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
        SW(val0, dst);
    }
}

static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint64_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

    if (0 == flag) {
        val0 = LD(src_top);
        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        src0_r = CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
        SD(val0, dst);
    }
}

static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}

static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);
        dst += stride;
        ST_SB2(src1, src1, dst, 16);
        dst += stride;
        ST_SB2(src2, src2, dst, 16);
        dst += stride;
        ST_SB2(src3, src3, dst, 16);
        dst += stride;
    }
}

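/*
 * DC prediction: the block is filled with the rounded average of the N
 * top and N left reference samples, dc = (sum + N) >> (log2(N) + 1).
 * For luma blocks up to 16x16 (flag == 0) the edges are smoothed:
 * dst[0][0] = (left[0] + 2 * dc + top[0] + 2) >> 2, and the rest of the
 * first row/column gets (ref + 3 * dc + 2) >> 2.
 */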
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t addition = 0;
    uint32_t val0, val1, val2;
    v16i8 src = { 0 };
    v16u8 store;
    v16i8 zero = { 0 };
    v8u16 sum, vec0, vec1;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);
        SW(val0, tmp_dst);

        val0 = src_left[1];
        val1 = src_left[2];
        val2 = src_left[3];

        addition *= 3;

        ADD2(val0, addition, val1, addition, val0, val1);
        val2 += addition;

        val0 += 2;
        val1 += 2;
        val2 += 2;
        val0 >>= 2;
        val1 >>= 2;
        val2 >>= 2;

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
    }
}

static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}

static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}

static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}

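/*
 * Planar prediction for the smaller block sizes, computed directly from
 * the formula above: the res* accumulators gather
 * (N-1-x) * left[y] + (x+1) * top[N] + (N-1-y) * top[x] + (y+1) * left[N]
 * per row before the rounding shift by log2(N) + 1.
 */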
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}

static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}

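/*
 * Quadrant helpers for 32x32 planar prediction: each call emits one
 * 16x16 quadrant. `offset` (0 or 16) shifts the horizontal weights for
 * the left/right half, while the upper/lower variants carry the row
 * weights of the top and bottom halves.
 */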
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    dst += (16 * stride);
    src_left += 16;

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}

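/*
 * Angular prediction, mostly-vertical modes (18..34). For each row y
 * the reference index is idx = ((y + 1) * angle) >> 5 with fractional
 * part fact = ((y + 1) * angle) & 31, and each sample is interpolated
 * as ((32 - fact) * ref[x + idx + 1] + fact * ref[x + idx + 2] + 16)
 * >> 5. For negative angles the reference array is first extended to
 * the left from the left neighbours via the fixed-point inverse angle.
 */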
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    if (angle < 0 && last < -1) {
        inv_angle_val = inv_angle[mode - 18];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}

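/*
 * Angular prediction, mostly-horizontal modes (2..17): the same 1/32-pel
 * interpolation is applied along the left reference samples, producing
 * the block column by column; the results are transposed back into dst.
 */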
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}

static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 1;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        top0 = LD_SB(ref);
        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst_org += (8 * stride);
        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);

        dst += 2;
    }
}

static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

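/*
 * Exported wrappers, called by FFmpeg's HEVC prediction DSP with
 * already-built reference arrays: planar and DC dispatch on block size,
 * the angular entry points on prediction mode (10 = horizontal,
 * 26 = vertical, modes >= 18 vertical-ish, modes < 18 horizontal-ish).
 */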
1778  const uint8_t *src_top,
1779  const uint8_t *src_left,
1780  ptrdiff_t stride)
1781 {
1782  hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
1783 }
1784 
1786  const uint8_t *src_top,
1787  const uint8_t *src_left,
1788  ptrdiff_t stride)
1789 {
1790  hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
1791 }
1792 
1794  const uint8_t *src_top,
1795  const uint8_t *src_left,
1796  ptrdiff_t stride)
1797 {
1798  hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
1799 }
1800 
1802  const uint8_t *src_top,
1803  const uint8_t *src_left,
1804  ptrdiff_t stride)
1805 {
1806  hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
1807 }
1808 
1809 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1810  const uint8_t *src_left,
1811  ptrdiff_t stride, int log2, int c_idx)
1812 {
1813  switch (log2) {
1814  case 2:
1815  hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1816  break;
1817 
1818  case 3:
1819  hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1820  break;
1821 
1822  case 4:
1823  hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1824  break;
1825 
1826  case 5:
1827  hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1828  break;
1829  }
1830 }
1831 
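/* Angular dispatch, one wrapper per block size: mode 10 (pure horizontal)
 * and mode 26 (pure vertical) go to dedicated kernels; modes >= 18 project
 * from the top reference ("upper"), modes 2..17 from the left ("lower"). */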
1832 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1833  const uint8_t *src_top,
1834  const uint8_t *src_left,
1835  ptrdiff_t stride, int c_idx, int mode)
1836 {
1837  if (mode == 10) {
1838  hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1839  } else if (mode == 26) {
1840  hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841  } else if (mode >= 18) {
1842  hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1843  dst, stride, mode);
1844  } else {
1845  hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1846  dst, stride, mode);
1847  }
1848 }
1849 
1850 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1851  const uint8_t *src_top,
1852  const uint8_t *src_left,
1853  ptrdiff_t stride, int c_idx, int mode)
1854 {
1855  if (mode == 10) {
1856  hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1857  } else if (mode == 26) {
1858  hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859  } else if (mode >= 18) {
1860  hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1861  dst, stride, mode);
1862  } else {
1863  hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1864  dst, stride, mode);
1865  }
1866 }
1867 
1868 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1869  const uint8_t *src_top,
1870  const uint8_t *src_left,
1871  ptrdiff_t stride, int c_idx, int mode)
1872 {
1873  if (mode == 10) {
1874  hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1875  } else if (mode == 26) {
1876  hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877  } else if (mode >= 18) {
1878  hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1879  dst, stride, mode);
1880  } else {
1881  hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1882  dst, stride, mode);
1883  }
1884 }
1885 
1886 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1887  const uint8_t *src_top,
1888  const uint8_t *src_left,
1889  ptrdiff_t stride, int c_idx, int mode)
1890 {
1891  if (mode == 10) {
1892  hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1893  } else if (mode == 26) {
1894  intra_predict_vert_32x32_msa(src_top, dst, stride);
1895  } else if (mode >= 18) {
1896  hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1897  dst, stride, mode);
1898  } else {
1899  hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1900  dst, stride, mode);
1901  }
1902 }
1903 
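/* Full intra prediction of one 16x16 transform block. This follows the
 * generic intra_pred() template of the C decoder step by step (neighbour
 * availability, constrained-intra substitution, reference padding and
 * smoothing, then mode dispatch), with the copy/fill loops hand-expanded
 * and vectorized for MSA. */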
1904 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1905 {
1906  v16u8 vec0;
1907  HEVCLocalContext *lc = s->HEVClc;
1908  int i;
1909  int hshift = s->ps.sps->hshift[c_idx];
1910  int vshift = s->ps.sps->vshift[c_idx];
1911  int size_in_luma_h = 16 << hshift;
1912  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1913  int size_in_luma_v = 16 << vshift;
1914  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1915  int x = x0 >> hshift;
1916  int y = y0 >> vshift;
1917  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1918  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1919 
1920  int cur_tb_addr =
1921  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1922 
1923  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1924  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1925 
1926  int min_pu_width = s->ps.sps->min_pu_width;
1927 
1928  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1929  lc->tu.intra_pred_mode;
1930  uint32_t a;
1931  uint8_t left_array[2 * 32 + 1];
1932  uint8_t filtered_left_array[2 * 32 + 1];
1933  uint8_t top_array[2 * 32 + 1];
1934  uint8_t filtered_top_array[2 * 32 + 1];
1935 
1936  uint8_t *left = left_array + 1;
1937  uint8_t *top = top_array + 1;
1938  uint8_t *filtered_left = filtered_left_array + 1;
1939  uint8_t *filtered_top = filtered_top_array + 1;
1940  int cand_bottom_left = lc->na.cand_bottom_left
1941  && cur_tb_addr >
1942  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1943  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1944  int cand_left = lc->na.cand_left;
1945  int cand_up_left = lc->na.cand_up_left;
1946  int cand_up = lc->na.cand_up;
1947  int cand_up_right = lc->na.cand_up_right
1948  && cur_tb_addr >
1949  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1950  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1951 
1952  int bottom_left_size =
1953  (((y0 + 2 * size_in_luma_v) >
1954  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1955  2 * size_in_luma_v)) -
1956  (y0 + size_in_luma_v)) >> vshift;
1957  int top_right_size =
1958  (((x0 + 2 * size_in_luma_h) >
1959  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1960  (x0 + size_in_luma_h)) >> hshift;
1961 
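    /* Constrained intra prediction: inter-coded neighbours may not be used
     * as references, so each cand_* flag is re-derived from the pred_flag
     * of the PUs along the corresponding border, and the reference arrays
     * are pre-filled with 128. */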
1962  if (s->ps.pps->constrained_intra_pred_flag == 1) {
1963  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1964  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1965  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1966  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1967  if (!size_in_luma_pu_h)
1968  size_in_luma_pu_h++;
1969  if (cand_bottom_left == 1 && on_pu_edge_x) {
1970  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1971  int y_bottom_pu =
1972  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1973  int max =
1974  ((size_in_luma_pu_v) >
1975  (s->ps.sps->min_pu_height -
1976  y_bottom_pu) ? (s->ps.sps->min_pu_height -
1977  y_bottom_pu) : (size_in_luma_pu_v));
1978  cand_bottom_left = 0;
1979  for (i = 0; i < max; i += 2)
1980  cand_bottom_left |=
1981  ((s->ref->tab_mvf[(x_left_pu) +
1982  (y_bottom_pu +
1983  i) * min_pu_width]).pred_flag ==
1984  PF_INTRA);
1985  }
1986  if (cand_left == 1 && on_pu_edge_x) {
1987  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1988  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1989  int max =
1990  ((size_in_luma_pu_v) >
1991  (s->ps.sps->min_pu_height -
1992  y_left_pu) ? (s->ps.sps->min_pu_height -
1993  y_left_pu) : (size_in_luma_pu_v));
1994  cand_left = 0;
1995  for (i = 0; i < max; i += 2)
1996  cand_left |=
1997  ((s->ref->tab_mvf[(x_left_pu) +
1998  (y_left_pu +
1999  i) * min_pu_width]).pred_flag ==
2000  PF_INTRA);
2001  }
2002  if (cand_up_left == 1) {
2003  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2004  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2005  cand_up_left =
2006  (s->ref->tab_mvf[(x_left_pu) +
2007  (y_top_pu) * min_pu_width]).pred_flag ==
2008  PF_INTRA;
2009  }
2010  if (cand_up == 1 && on_pu_edge_y) {
2011  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2012  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2013  int max =
2014  ((size_in_luma_pu_h) >
2015  (s->ps.sps->min_pu_width -
2016  x_top_pu) ? (s->ps.sps->min_pu_width -
2017  x_top_pu) : (size_in_luma_pu_h));
2018  cand_up = 0;
2019  for (i = 0; i < max; i += 2)
2020  cand_up |=
2021  ((s->ref->tab_mvf[(x_top_pu + i) +
2022  (y_top_pu) *
2023  min_pu_width]).pred_flag == PF_INTRA);
2024  }
2025  if (cand_up_right == 1 && on_pu_edge_y) {
2026  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2027  int x_right_pu =
2028  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2029  int max =
2030  ((size_in_luma_pu_h) >
2031  (s->ps.sps->min_pu_width -
2032  x_right_pu) ? (s->ps.sps->min_pu_width -
2033  x_right_pu) : (size_in_luma_pu_h));
2034  cand_up_right = 0;
2035  for (i = 0; i < max; i += 2)
2036  cand_up_right |=
2037  ((s->ref->tab_mvf[(x_right_pu + i) +
2038  (y_top_pu) *
2039  min_pu_width]).pred_flag == PF_INTRA);
2040  }
2041 
2042  vec0 = (v16u8) __msa_ldi_b(128);
2043 
2044  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2045 
2046  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2047 
2048  top[-1] = 128;
2049  }
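    /* Gather the reference samples that are actually available: top-left
     * corner, top row plus top-right, left column plus bottom-left. The
     * parts of top-right/bottom-left that lie outside the picture are
     * padded with the last valid pixel, four bytes at a time. */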
2050  if (cand_up_left) {
2051  left[-1] = src[(-1) + stride * (-1)];
2052  top[-1] = left[-1];
2053  }
2054  if (cand_up) {
2055  vec0 = LD_UB(src - stride);
2056  ST_UB(vec0, top);
2057  }
2058  if (cand_up_right) {
2059  vec0 = LD_UB(src - stride + 16);
2060  ST_UB(vec0, (top + 16));
2061 
2062  do {
2063  uint32_t pix =
2064  ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2065  0x01010101U);
2066  for (i = 0; i < (16 - top_right_size); i += 4)
2067  ((((union unaligned_32 *) (top + 16 + top_right_size +
2068  i))->l) = (pix));
2069  } while (0);
2070  }
2071  if (cand_left)
2072  for (i = 0; i < 16; i++)
2073  left[i] = src[(-1) + stride * (i)];
2074  if (cand_bottom_left) {
2075  for (i = 16; i < 16 + bottom_left_size; i++)
2076  left[i] = src[(-1) + stride * (i)];
2077  do {
2078  uint32_t pix =
2079  ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2080  0x01010101U);
2081  for (i = 0; i < (16 - bottom_left_size); i += 4)
2082  ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2083  i))->l) = (pix));
2084  } while (0);
2085  }
2086 
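    /* Second constrained-intra pass: reference pixels that came from
     * non-intra PUs are replaced by propagating the nearest intra-coded
     * pixel along the left and top borders (the unaligned_32 stores splat
     * four bytes at a time). */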
2087  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2088  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2089  || cand_up_right) {
2090  int size_max_x =
2091  x0 + ((2 * 16) << hshift) <
2092  s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2093  int size_max_y =
2094  y0 + ((2 * 16) << vshift) <
2095  s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2096  int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2097  if (!cand_up_right) {
2098  size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2099  16 : (s->ps.sps->width - x0) >> hshift;
2100  }
2101  if (!cand_bottom_left) {
2102  size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2103  16 : (s->ps.sps->height - y0) >> vshift;
2104  }
2105  if (cand_bottom_left || cand_left || cand_up_left) {
2106  while (j > -1
2107  &&
2108  !((s->ref->tab_mvf[(((x0 +
2109  ((-1) << hshift)) >> s->ps.sps->
2110  log2_min_pu_size)) + (((y0 +
2111  ((j) <<
2112  vshift))
2113  >> s->ps.sps->
2114  log2_min_pu_size))
2115  * min_pu_width]).pred_flag ==
2116  PF_INTRA))
2117  j--;
2118  if (!
2119  ((s->ref->tab_mvf[(((x0 +
2120  ((-1) << hshift)) >> s->ps.sps->
2121  log2_min_pu_size)) + (((y0 + ((j)
2122  <<
2123  vshift))
2124  >> s->ps.sps->
2125  log2_min_pu_size))
2126  * min_pu_width]).pred_flag == PF_INTRA)) {
2127  j = 0;
2128  while (j < size_max_x
2129  &&
2130  !((s->ref->tab_mvf[(((x0 +
2131  ((j) << hshift)) >> s->ps.sps->
2132  log2_min_pu_size)) + (((y0 +
2133  ((-1) <<
2134  vshift))
2135  >> s->
2136  ps.sps->
2137  log2_min_pu_size))
2138  * min_pu_width]).pred_flag ==
2139  PF_INTRA))
2140  j++;
2141  for (i = j; i > (j) - (j + 1); i--)
2142  if (!
2143  ((s->ref->tab_mvf[(((x0 +
2144  ((i -
2145  1) << hshift)) >> s->ps.sps->
2146  log2_min_pu_size)) + (((y0 +
2147  ((-1) <<
2148  vshift))
2149  >> s->
2150  ps.sps->
2151  log2_min_pu_size))
2152  * min_pu_width]).pred_flag ==
2153  PF_INTRA))
2154  top[i - 1] = top[i];
2155  left[-1] = top[-1];
2156  }
2157  } else {
2158  j = 0;
2159  while (j < size_max_x
2160  &&
2161  !((s->ref->tab_mvf[(((x0 +
2162  ((j) << hshift)) >> s->ps.sps->
2163  log2_min_pu_size)) + (((y0 + ((-1)
2164  <<
2165  vshift))
2166  >> s->ps.sps->
2167  log2_min_pu_size))
2168  * min_pu_width]).pred_flag ==
2169  PF_INTRA))
2170  j++;
2171  if (j > 0)
2172  if (x0 > 0) {
2173  for (i = j; i > (j) - (j + 1); i--)
2174  if (!
2175  ((s->ref->tab_mvf[(((x0 +
2176  ((i -
2177  1) << hshift)) >>
2178  s->ps.sps->log2_min_pu_size))
2179  + (((y0 + ((-1)
2180  << vshift))
2181  >>
2182  s->ps.sps->log2_min_pu_size))
2183  *
2184  min_pu_width]).pred_flag ==
2185  PF_INTRA))
2186  top[i - 1] = top[i];
2187  } else {
2188  for (i = j; i > (j) - (j); i--)
2189  if (!
2190  ((s->ref->tab_mvf[(((x0 +
2191  ((i -
2192  1) << hshift)) >>
2193  s->ps.sps->log2_min_pu_size))
2194  + (((y0 + ((-1)
2195  << vshift))
2196  >>
2197  s->ps.sps->log2_min_pu_size))
2198  *
2199  min_pu_width]).pred_flag ==
2200  PF_INTRA))
2201  top[i - 1] = top[i];
2202  top[-1] = top[0];
2203  }
2204  left[-1] = top[-1];
2205  }
2206  left[-1] = top[-1];
2207  if (cand_bottom_left || cand_left) {
2208  a = ((left[-1]) * 0x01010101U);
2209  for (i = 0; i < (0) + (size_max_y); i += 4)
2210  if (!
2211  ((s->ref->tab_mvf[(((x0 +
2212  ((-1) << hshift)) >> s->ps.sps->
2213  log2_min_pu_size)) + (((y0 +
2214  ((i) <<
2215  vshift))
2216  >> s->ps.sps->
2217  log2_min_pu_size))
2218  * min_pu_width]).pred_flag ==
2219  PF_INTRA))
2220  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2221  else
2222  a = ((left[i + 3]) * 0x01010101U);
2223  }
2224  if (!cand_left) {
2225  vec0 = (v16u8) __msa_fill_b(left[-1]);
2226 
2227  ST_UB(vec0, left);
2228  }
2229  if (!cand_bottom_left) {
2230 
2231  vec0 = (v16u8) __msa_fill_b(left[15]);
2232 
2233  ST_UB(vec0, (left + 16));
2234  }
2235  if (x0 != 0 && y0 != 0) {
2236  a = ((left[size_max_y - 1]) * 0x01010101U);
2237  for (i = (size_max_y - 1);
2238  i > (size_max_y - 1) - (size_max_y); i -= 4)
2239  if (!
2240  ((s->ref->tab_mvf[(((x0 +
2241  ((-1) << hshift)) >> s->ps.sps->
2242  log2_min_pu_size)) + (((y0 +
2243  ((i -
2244  3) <<
2245  vshift))
2246  >> s->ps.sps->
2247  log2_min_pu_size))
2248  * min_pu_width]).pred_flag ==
2249  PF_INTRA))
2250  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2251  else
2252  a = ((left[i - 3]) * 0x01010101U);
2253  if (!
2254  ((s->ref->tab_mvf[(((x0 +
2255  ((-1) << hshift)) >> s->ps.sps->
2256  log2_min_pu_size)) + (((y0 + ((-1)
2257  <<
2258  vshift))
2259  >> s->ps.sps->
2260  log2_min_pu_size))
2261  * min_pu_width]).pred_flag == PF_INTRA))
2262  left[-1] = left[0];
2263  } else if (x0 == 0) {
2264  do {
2265  uint32_t pix = ((0) * 0x01010101U);
2266  for (i = 0; i < (size_max_y); i += 4)
2267  ((((union unaligned_32 *) (left + i))->l) = (pix));
2268  } while (0);
2269  } else {
2270  a = ((left[size_max_y - 1]) * 0x01010101U);
2271  for (i = (size_max_y - 1);
2272  i > (size_max_y - 1) - (size_max_y); i -= 4)
2273  if (!
2274  ((s->ref->tab_mvf[(((x0 +
2275  ((-1) << hshift)) >> s->ps.sps->
2276  log2_min_pu_size)) + (((y0 +
2277  ((i -
2278  3) <<
2279  vshift))
2280  >> s->ps.sps->
2281  log2_min_pu_size))
2282  * min_pu_width]).pred_flag ==
2283  PF_INTRA))
2284  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2285  else
2286  a = ((left[i - 3]) * 0x01010101U);
2287  }
2288  top[-1] = left[-1];
2289  if (y0 != 0) {
2290  a = ((left[-1]) * 0x01010101U);
2291  for (i = 0; i < (0) + (size_max_x); i += 4)
2292  if (!
2293  ((s->ref->tab_mvf[(((x0 +
2294  ((i) << hshift)) >> s->ps.sps->
2295  log2_min_pu_size)) + (((y0 + ((-1)
2296  <<
2297  vshift))
2298  >> s->ps.sps->
2299  log2_min_pu_size))
2300  * min_pu_width]).pred_flag ==
2301  PF_INTRA))
2302  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2303  else
2304  a = ((top[i + 3]) * 0x01010101U);
2305  }
2306  }
2307  }
2308 
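    /* Fallbacks for absent neighbours: the bottom-left is extended from
     * left, top-left, top or top-right, whichever is available first, then
     * the remaining segments are splatted from their nearest neighbour;
     * with no neighbours at all everything is filled with 128 (mid-grey
     * for 8-bit). */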
2309  if (!cand_bottom_left) {
2310  if (cand_left) {
2311  vec0 = (v16u8) __msa_fill_b(left[15]);
2312 
2313  ST_UB(vec0, (left + 16));
2314 
2315  } else if (cand_up_left) {
2316  vec0 = (v16u8) __msa_fill_b(left[-1]);
2317 
2318  ST_UB2(vec0, vec0, left, 16);
2319 
2320  cand_left = 1;
2321  } else if (cand_up) {
2322  left[-1] = top[0];
2323 
2324  vec0 = (v16u8) __msa_fill_b(left[-1]);
2325 
2326  ST_UB2(vec0, vec0, left, 16);
2327 
2328  cand_up_left = 1;
2329  cand_left = 1;
2330  } else if (cand_up_right) {
2331  vec0 = (v16u8) __msa_fill_b(top[16]);
2332 
2333  ST_UB(vec0, top);
2334 
2335  left[-1] = top[16];
2336 
2337  ST_UB2(vec0, vec0, left, 16);
2338 
2339  cand_up = 1;
2340  cand_up_left = 1;
2341  cand_left = 1;
2342  } else {
2343  left[-1] = 128;
2344  vec0 = (v16u8) __msa_ldi_b(128);
2345 
2346  ST_UB2(vec0, vec0, top, 16);
2347  ST_UB2(vec0, vec0, left, 16);
2348  }
2349  }
2350 
2351  if (!cand_left) {
2352  vec0 = (v16u8) __msa_fill_b(left[16]);
2353  ST_UB(vec0, left);
2354  }
2355  if (!cand_up_left) {
2356  left[-1] = left[0];
2357  }
2358  if (!cand_up) {
2359  vec0 = (v16u8) __msa_fill_b(left[-1]);
2360  ST_UB(vec0, top);
2361  }
2362  if (!cand_up_right) {
2363  vec0 = (v16u8) __msa_fill_b(top[15]);
2364  ST_UB(vec0, (top + 16));
2365  }
2366 
2367  top[-1] = left[-1];
2368 
2369 
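    /* Optional [1 2 1]/4 smoothing of the reference samples (luma, or
     * chroma in 4:4:4). min_dist_vert_hor below is just
     * FFMIN(abs(mode - 26), abs(mode - 10)) with the ternaries expanded;
     * smoothing applies when it exceeds intra_hor_ver_dist_thresh[1], the
     * entry for 16x16 blocks. */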
2370  if (!s->ps.sps->intra_smoothing_disabled_flag
2371  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2372  if (mode != INTRA_DC && 16 != 4) {
2373  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2374  int min_dist_vert_hor =
2375  (((((int) (mode - 26U)) >=
2376  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2377  ((((int) (mode - 10U)) >=
2378  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2379  ? ((((int) (mode - 10U)) >=
2380  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381  : ((((int) (mode - 26U)) >=
2382  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2383  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2384  filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2385  filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2386  for (i = 2 * 16 - 2; i >= 0; i--)
2387  filtered_left[i] = (left[i + 1] + 2 * left[i] +
2388  left[i - 1] + 2) >> 2;
2389  filtered_top[-1] =
2390  filtered_left[-1] =
2391  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2392  for (i = 2 * 16 - 2; i >= 0; i--)
2393  filtered_top[i] = (top[i + 1] + 2 * top[i] +
2394  top[i - 1] + 2) >> 2;
2395  left = filtered_left;
2396  top = filtered_top;
2397  }
2398  }
2399  }
2400 
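    /* Dispatch: pred_planar[2]/pred_angular[2] are the 16x16 entries and
     * pred_dc is called with log2 size 4. */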
2401  switch (mode) {
2402  case INTRA_PLANAR:
2403  s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2404  (uint8_t *) left, stride);
2405  break;
2406  case INTRA_DC:
2407  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2408  (uint8_t *) left, stride, 4, c_idx);
2409  break;
2410  default:
2411  s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2412  (uint8_t *) left, stride, c_idx, mode);
2413  break;
2414  }
2415 }
2416 
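/* 32x32 variant of the function above. The flow is identical; what is new
 * is the strong intra smoothing path in the filtering stage, which is
 * vectorized with the mul_val0/mul_val1 weight vectors declared below. */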
2417 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2418 {
2419  v16u8 vec0, vec1;
2420  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2421  v8i16 res0, res1, res2, res3;
2422  v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2423  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2424  HEVCLocalContext *lc = s->HEVClc;
2425  int i;
2426  int hshift = s->ps.sps->hshift[c_idx];
2427  int vshift = s->ps.sps->vshift[c_idx];
2428  int size_in_luma_h = 32 << hshift;
2429  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2430  int size_in_luma_v = 32 << vshift;
2431  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2432  int x = x0 >> hshift;
2433  int y = y0 >> vshift;
2434  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2435  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2436 
2437  int cur_tb_addr =
2438  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2439 
2440  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2441  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2442 
2443  int min_pu_width = s->ps.sps->min_pu_width;
2444 
2445  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2446  lc->tu.intra_pred_mode;
2447  uint32_t a;
2448  uint8_t left_array[2 * 32 + 1];
2449  uint8_t filtered_left_array[2 * 32 + 1];
2450  uint8_t top_array[2 * 32 + 1];
2451  uint8_t filtered_top_array[2 * 32 + 1];
2452 
2453  uint8_t *left = left_array + 1;
2454  uint8_t *top = top_array + 1;
2455  uint8_t *filtered_left = filtered_left_array + 1;
2456  uint8_t *filtered_top = filtered_top_array + 1;
2457  int cand_bottom_left = lc->na.cand_bottom_left
2458  && cur_tb_addr >
2459  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2460  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2461  int cand_left = lc->na.cand_left;
2462  int cand_up_left = lc->na.cand_up_left;
2463  int cand_up = lc->na.cand_up;
2464  int cand_up_right = lc->na.cand_up_right
2465  && cur_tb_addr >
2466  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2467  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2468 
2469  int bottom_left_size =
2470  (((y0 + 2 * size_in_luma_v) >
2471  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2472  2 * size_in_luma_v)) -
2473  (y0 + size_in_luma_v)) >> vshift;
2474  int top_right_size =
2475  (((x0 + 2 * size_in_luma_h) >
2476  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2477  (x0 + size_in_luma_h)) >> hshift;
2478 
2479  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2480  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2481  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2482  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2483  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2484  if (!size_in_luma_pu_h)
2485  size_in_luma_pu_h++;
2486  if (cand_bottom_left == 1 && on_pu_edge_x) {
2487  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2488  int y_bottom_pu =
2489  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2490  int max =
2491  ((size_in_luma_pu_v) >
2492  (s->ps.sps->min_pu_height -
2493  y_bottom_pu) ? (s->ps.sps->min_pu_height -
2494  y_bottom_pu) : (size_in_luma_pu_v));
2495  cand_bottom_left = 0;
2496  for (i = 0; i < max; i += 2)
2497  cand_bottom_left |=
2498  ((s->ref->tab_mvf[(x_left_pu) +
2499  (y_bottom_pu +
2500  i) * min_pu_width]).pred_flag ==
2501  PF_INTRA);
2502  }
2503  if (cand_left == 1 && on_pu_edge_x) {
2504  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2505  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2506  int max =
2507  ((size_in_luma_pu_v) >
2508  (s->ps.sps->min_pu_height -
2509  y_left_pu) ? (s->ps.sps->min_pu_height -
2510  y_left_pu) : (size_in_luma_pu_v));
2511  cand_left = 0;
2512  for (i = 0; i < max; i += 2)
2513  cand_left |=
2514  ((s->ref->tab_mvf[(x_left_pu) +
2515  (y_left_pu +
2516  i) * min_pu_width]).pred_flag ==
2517  PF_INTRA);
2518  }
2519  if (cand_up_left == 1) {
2520  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2521  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2522  cand_up_left =
2523  (s->ref->tab_mvf[(x_left_pu) +
2524  (y_top_pu) * min_pu_width]).pred_flag ==
2525  PF_INTRA;
2526  }
2527  if (cand_up == 1 && on_pu_edge_y) {
2528  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2529  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2530  int max =
2531  ((size_in_luma_pu_h) >
2532  (s->ps.sps->min_pu_width -
2533  x_top_pu) ? (s->ps.sps->min_pu_width -
2534  x_top_pu) : (size_in_luma_pu_h));
2535  cand_up = 0;
2536  for (i = 0; i < max; i += 2)
2537  cand_up |=
2538  ((s->ref->tab_mvf[(x_top_pu + i) +
2539  (y_top_pu) *
2540  min_pu_width]).pred_flag == PF_INTRA);
2541  }
2542  if (cand_up_right == 1 && on_pu_edge_y) {
2543  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2544  int x_right_pu =
2545  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2546  int max =
2547  ((size_in_luma_pu_h) >
2548  (s->ps.sps->min_pu_width -
2549  x_right_pu) ? (s->ps.sps->min_pu_width -
2550  x_right_pu) : (size_in_luma_pu_h));
2551  cand_up_right = 0;
2552  for (i = 0; i < max; i += 2)
2553  cand_up_right |=
2554  ((s->ref->tab_mvf[(x_right_pu + i) +
2555  (y_top_pu) *
2556  min_pu_width]).pred_flag == PF_INTRA);
2557  }
2558  vec0 = (v16u8) __msa_ldi_b(128);
2559 
2560  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2561  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2562 
2563  top[-1] = 128;
2564  }
2565  if (cand_up_left) {
2566  left[-1] = src[(-1) + stride * (-1)];
2567  top[-1] = left[-1];
2568  }
2569  if (cand_up) {
2570  LD_UB2(src - stride, 16, vec0, vec1);
2571  ST_UB2(vec0, vec1, top, 16);
2572  }
2573 
2574  if (cand_up_right) {
2575  LD_UB2(src - stride + 32, 16, vec0, vec1);
2576  ST_UB2(vec0, vec1, (top + 32), 16);
2577  do {
2578  uint32_t pix =
2579  ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2580  0x01010101U);
2581  for (i = 0; i < (32 - top_right_size); i += 4)
2582  ((((union unaligned_32 *) (top + 32 + top_right_size +
2583  i))->l) = (pix));
2584  } while (0);
2585  }
2586  if (cand_left)
2587  for (i = 0; i < 32; i++)
2588  left[i] = src[(-1) + stride * (i)];
2589  if (cand_bottom_left) {
2590  for (i = 32; i < 32 + bottom_left_size; i++)
2591  left[i] = src[(-1) + stride * (i)];
2592  do {
2593  uint32_t pix =
2594  ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2595  0x01010101U);
2596  for (i = 0; i < (32 - bottom_left_size); i += 4)
2597  ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2598  i))->l) = (pix));
2599  } while (0);
2600  }
2601 
2602  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2603  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2604  || cand_up_right) {
2605  int size_max_x =
2606  x0 + ((2 * 32) << hshift) <
2607  s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2608  int size_max_y =
2609  y0 + ((2 * 32) << vshift) <
2610  s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2611  int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2612  if (!cand_up_right) {
2613  size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2614  32 : (s->ps.sps->width - x0) >> hshift;
2615  }
2616  if (!cand_bottom_left) {
2617  size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2618  32 : (s->ps.sps->height - y0) >> vshift;
2619  }
2620  if (cand_bottom_left || cand_left || cand_up_left) {
2621  while (j > -1
2622  &&
2623  !((s->ref->tab_mvf[(((x0 +
2624  ((-1) << hshift)) >> s->ps.sps->
2625  log2_min_pu_size)) + (((y0 +
2626  ((j) <<
2627  vshift))
2628  >> s->ps.sps->
2629  log2_min_pu_size))
2630  * min_pu_width]).pred_flag ==
2631  PF_INTRA))
2632  j--;
2633  if (!
2634  ((s->ref->tab_mvf[(((x0 +
2635  ((-1) << hshift)) >> s->ps.sps->
2636  log2_min_pu_size)) + (((y0 + ((j)
2637  <<
2638  vshift))
2639  >> s->ps.sps->
2640  log2_min_pu_size))
2641  * min_pu_width]).pred_flag == PF_INTRA)) {
2642  j = 0;
2643  while (j < size_max_x
2644  &&
2645  !((s->ref->tab_mvf[(((x0 +
2646  ((j) << hshift)) >> s->ps.sps->
2647  log2_min_pu_size)) + (((y0 +
2648  ((-1) <<
2649  vshift))
2650  >> s->
2651  ps.sps->
2652  log2_min_pu_size))
2653  * min_pu_width]).pred_flag ==
2654  PF_INTRA))
2655  j++;
2656  for (i = j; i > (j) - (j + 1); i--)
2657  if (!
2658  ((s->ref->tab_mvf[(((x0 +
2659  ((i -
2660  1) << hshift)) >> s->ps.sps->
2661  log2_min_pu_size)) + (((y0 +
2662  ((-1) <<
2663  vshift))
2664  >> s->
2665  ps.sps->
2666  log2_min_pu_size))
2667  * min_pu_width]).pred_flag ==
2668  PF_INTRA))
2669  top[i - 1] = top[i];
2670  left[-1] = top[-1];
2671  }
2672  } else {
2673  j = 0;
2674  while (j < size_max_x
2675  &&
2676  !((s->ref->tab_mvf[(((x0 +
2677  ((j) << hshift)) >> s->ps.sps->
2678  log2_min_pu_size)) + (((y0 + ((-1)
2679  <<
2680  vshift))
2681  >> s->ps.sps->
2682  log2_min_pu_size))
2683  * min_pu_width]).pred_flag ==
2684  PF_INTRA))
2685  j++;
2686  if (j > 0)
2687  if (x0 > 0) {
2688  for (i = j; i > (j) - (j + 1); i--)
2689  if (!
2690  ((s->ref->tab_mvf[(((x0 +
2691  ((i -
2692  1) << hshift)) >>
2693  s->ps.sps->log2_min_pu_size))
2694  + (((y0 + ((-1)
2695  << vshift))
2696  >>
2697  s->ps.sps->log2_min_pu_size))
2698  *
2699  min_pu_width]).pred_flag ==
2700  PF_INTRA))
2701  top[i - 1] = top[i];
2702  } else {
2703  for (i = j; i > (j) - (j); i--)
2704  if (!
2705  ((s->ref->tab_mvf[(((x0 +
2706  ((i -
2707  1) << hshift)) >>
2708  s->ps.sps->log2_min_pu_size))
2709  + (((y0 + ((-1)
2710  << vshift))
2711  >>
2712  s->ps.sps->log2_min_pu_size))
2713  *
2714  min_pu_width]).pred_flag ==
2715  PF_INTRA))
2716  top[i - 1] = top[i];
2717  top[-1] = top[0];
2718  }
2719  left[-1] = top[-1];
2720  }
2721  left[-1] = top[-1];
2722  if (cand_bottom_left || cand_left) {
2723  a = ((left[-1]) * 0x01010101U);
2724  for (i = 0; i < (0) + (size_max_y); i += 4)
2725  if (!
2726  ((s->ref->tab_mvf[(((x0 +
2727  ((-1) << hshift)) >> s->ps.sps->
2728  log2_min_pu_size)) + (((y0 +
2729  ((i) <<
2730  vshift))
2731  >> s->ps.sps->
2732  log2_min_pu_size))
2733  * min_pu_width]).pred_flag ==
2734  PF_INTRA))
2735  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2736  else
2737  a = ((left[i + 3]) * 0x01010101U);
2738  }
2739  if (!cand_left) {
2740  vec0 = (v16u8) __msa_fill_b(left[-1]);
2741 
2742  ST_UB2(vec0, vec0, left, 16);
2743  }
2744  if (!cand_bottom_left) {
2745  vec0 = (v16u8) __msa_fill_b(left[31]);
2746 
2747  ST_UB2(vec0, vec0, (left + 32), 16);
2748  }
2749  if (x0 != 0 && y0 != 0) {
2750  a = ((left[size_max_y - 1]) * 0x01010101U);
2751  for (i = (size_max_y - 1);
2752  i > (size_max_y - 1) - (size_max_y); i -= 4)
2753  if (!
2754  ((s->ref->tab_mvf[(((x0 +
2755  ((-1) << hshift)) >> s->ps.sps->
2756  log2_min_pu_size)) + (((y0 +
2757  ((i -
2758  3) <<
2759  vshift))
2760  >> s->ps.sps->
2761  log2_min_pu_size))
2762  * min_pu_width]).pred_flag ==
2763  PF_INTRA))
2764  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2765  else
2766  a = ((left[i - 3]) * 0x01010101U);
2767  if (!
2768  ((s->ref->tab_mvf[(((x0 +
2769  ((-1) << hshift)) >> s->ps.sps->
2770  log2_min_pu_size)) + (((y0 + ((-1)
2771  <<
2772  vshift))
2773  >> s->ps.sps->
2774  log2_min_pu_size))
2775  * min_pu_width]).pred_flag == PF_INTRA))
2776  left[-1] = left[0];
2777  } else if (x0 == 0) {
2778  do {
2779  uint32_t pix = ((0) * 0x01010101U);
2780  for (i = 0; i < (size_max_y); i += 4)
2781  ((((union unaligned_32 *) (left + i))->l) = (pix));
2782  } while (0);
2783  } else {
2784  a = ((left[size_max_y - 1]) * 0x01010101U);
2785  for (i = (size_max_y - 1);
2786  i > (size_max_y - 1) - (size_max_y); i -= 4)
2787  if (!
2788  ((s->ref->tab_mvf[(((x0 +
2789  ((-1) << hshift)) >> s->ps.sps->
2790  log2_min_pu_size)) + (((y0 +
2791  ((i -
2792  3) <<
2793  vshift))
2794  >> s->ps.sps->
2795  log2_min_pu_size))
2796  * min_pu_width]).pred_flag ==
2797  PF_INTRA))
2798  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2799  else
2800  a = ((left[i - 3]) * 0x01010101U);
2801  }
2802  top[-1] = left[-1];
2803  if (y0 != 0) {
2804  a = ((left[-1]) * 0x01010101U);
2805  for (i = 0; i < (0) + (size_max_x); i += 4)
2806  if (!
2807  ((s->ref->tab_mvf[(((x0 +
2808  ((i) << hshift)) >> s->ps.sps->
2809  log2_min_pu_size)) + (((y0 + ((-1)
2810  <<
2811  vshift))
2812  >> s->ps.sps->
2813  log2_min_pu_size))
2814  * min_pu_width]).pred_flag ==
2815  PF_INTRA))
2816  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2817  else
2818  a = ((top[i + 3]) * 0x01010101U);
2819  }
2820  }
2821  }
2822 
2823  if (!cand_bottom_left) {
2824  if (cand_left) {
2825  vec0 = (v16u8) __msa_fill_b(left[31]);
2826 
2827  ST_UB2(vec0, vec0, (left + 32), 16);
2828  } else if (cand_up_left) {
2829  vec0 = (v16u8) __msa_fill_b(left[-1]);
2830 
2831  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2832 
2833  cand_left = 1;
2834  } else if (cand_up) {
2835  left[-1] = top[0];
2836 
2837  vec0 = (v16u8) __msa_fill_b(left[-1]);
2838 
2839  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2840 
2841  cand_up_left = 1;
2842  cand_left = 1;
2843  } else if (cand_up_right) {
2844  vec0 = (v16u8) __msa_fill_b(top[32]);
2845 
2846  ST_UB2(vec0, vec0, top, 16);
2847 
2848  left[-1] = top[32];
2849 
2850  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2851 
2852  cand_up = 1;
2853  cand_up_left = 1;
2854  cand_left = 1;
2855  } else {
2856  left[-1] = 128;
2857 
2858  vec0 = (v16u8) __msa_ldi_b(128);
2859 
2860  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2861  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2862  }
2863  }
2864 
2865  if (!cand_left) {
2866  vec0 = (v16u8) __msa_fill_b(left[32]);
2867 
2868  ST_UB2(vec0, vec0, left, 16);
2869  }
2870  if (!cand_up_left) {
2871  left[-1] = left[0];
2872  }
2873  if (!cand_up) {
2874  vec0 = (v16u8) __msa_fill_b(left[-1]);
2875 
2876  ST_UB2(vec0, vec0, top, 16);
2877  }
2878  if (!cand_up_right) {
2879  vec0 = (v16u8) __msa_fill_b(top[31]);
2880 
2881  ST_UB2(vec0, vec0, (top + 32), 16);
2882  }
2883 
2884  top[-1] = left[-1];
2885 
2886 
2887  if (!s->ps.sps->intra_smoothing_disabled_flag
2888  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2889  if (mode != INTRA_DC && 32 != 4) {
2890  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2891  int min_dist_vert_hor =
2892  (((((int) (mode - 26U)) >=
2893  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2894  ((((int) (mode - 10U)) >=
2895  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2896  ? ((((int) (mode - 10U)) >=
2897  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898  : ((((int) (mode - 26U)) >=
2899  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2900  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2901  int threshold = 1 << (8 - 5);
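            /* Strong intra smoothing: for 32x32 luma, if both reference
             * edges are nearly linear (|p[-1] + p[63] - 2 * p[31]| below
             * 1 << (bitdepth - 5), i.e. 8 at 8-bit), the [1 2 1] filter is
             * replaced by a bilinear ramp between the corner samples. */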
2902  if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2903  && c_idx == 0
2904  && ((top[-1] + top[63] - 2 * top[31]) >=
2905  0 ? (top[-1] + top[63] -
2906  2 * top[31]) : (-(top[-1] + top[63] -
2907  2 * top[31]))) < threshold
2908  && ((left[-1] + left[63] - 2 * left[31]) >=
2909  0 ? (left[-1] + left[63] -
2910  2 * left[31]) : (-(left[-1] + left[63] -
2911  2 * left[31]))) < threshold) {
2912 
2913 
2914  filtered_top[-1] = top[-1];
2915  filtered_top[63] = top[63];
2916 
2917 
2918  for (i = 0; i < 63; i++) {
2919  filtered_top[i] =
2920  ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2921  }
2922 
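                /* Vectorized ramp ((63 - i) * p[-1] + (i + 1) * p[63] + 32)
                 * >> 6: mul_val0 = {63..56} and mul_val1 = {1..8} seed the
                 * weights for the first eight positions, offsets of
                 * 8/16/24/... extend them, and __msa_srari_h(..., 6) does
                 * the rounded shift. The scalar loop above already filled
                 * filtered_top; these stores recompute the same 64 bytes. */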
2923  tmp0 = __msa_fill_h(top[-1]);
2924  tmp1 = __msa_fill_h(top[63]);
2925 
2926  tmp2 = mul_val0 - 8;
2927  tmp3 = mul_val0 - 16;
2928  tmp4 = mul_val0 - 24;
2929  tmp5 = mul_val1 + 8;
2930  tmp6 = mul_val1 + 16;
2931  tmp7 = mul_val1 + 24;
2932 
2933  res0 = mul_val0 * tmp0;
2934  res1 = tmp2 * tmp0;
2935  res2 = tmp3 * tmp0;
2936  res3 = tmp4 * tmp0;
2937  res0 += mul_val1 * tmp1;
2938  res1 += tmp5 * tmp1;
2939  res2 += tmp6 * tmp1;
2940  res3 += tmp7 * tmp1;
2941 
2942  res0 = __msa_srari_h(res0, 6);
2943  res1 = __msa_srari_h(res1, 6);
2944  res2 = __msa_srari_h(res2, 6);
2945  res3 = __msa_srari_h(res3, 6);
2946 
2947  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2948  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2949 
2950  ST_UB2(vec0, vec1, filtered_top, 16);
2951 
2952  res0 = mul_val0 - 32;
2953  tmp2 = mul_val0 - 40;
2954  tmp3 = mul_val0 - 48;
2955  tmp4 = mul_val0 - 56;
2956  res3 = mul_val1 + 32;
2957  tmp5 = mul_val1 + 40;
2958  tmp6 = mul_val1 + 48;
2959  tmp7 = mul_val1 + 56;
2960 
2961  res0 = res0 * tmp0;
2962  res1 = tmp2 * tmp0;
2963  res2 = tmp3 * tmp0;
2964  res0 += res3 * tmp1;
2965  res3 = tmp4 * tmp0;
2966  res1 += tmp5 * tmp1;
2967  res2 += tmp6 * tmp1;
2968  res3 += tmp7 * tmp1;
2969 
2970  res0 = __msa_srari_h(res0, 6);
2971  res1 = __msa_srari_h(res1, 6);
2972  res2 = __msa_srari_h(res2, 6);
2973  res3 = __msa_srari_h(res3, 6);
2974 
2975  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2976  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2977 
2978  ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2979 
2980  filtered_top[63] = top[63];
2981 
2982  tmp0 = __msa_fill_h(left[-1]);
2983  tmp1 = __msa_fill_h(left[63]);
2984 
2985  tmp2 = mul_val0 - 8;
2986  tmp3 = mul_val0 - 16;
2987  tmp4 = mul_val0 - 24;
2988  tmp5 = mul_val1 + 8;
2989  tmp6 = mul_val1 + 16;
2990  tmp7 = mul_val1 + 24;
2991 
2992  res0 = mul_val0 * tmp0;
2993  res1 = tmp2 * tmp0;
2994  res2 = tmp3 * tmp0;
2995  res3 = tmp4 * tmp0;
2996  res0 += mul_val1 * tmp1;
2997  res1 += tmp5 * tmp1;
2998  res2 += tmp6 * tmp1;
2999  res3 += tmp7 * tmp1;
3000 
3001  res0 = __msa_srari_h(res0, 6);
3002  res1 = __msa_srari_h(res1, 6);
3003  res2 = __msa_srari_h(res2, 6);
3004  res3 = __msa_srari_h(res3, 6);
3005 
3006  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3007  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3008 
3009  ST_UB2(vec0, vec1, left, 16);
3010 
3011  res0 = mul_val0 - 32;
3012  tmp2 = mul_val0 - 40;
3013  tmp3 = mul_val0 - 48;
3014  tmp4 = mul_val0 - 56;
3015  res3 = mul_val1 + 32;
3016  tmp5 = mul_val1 + 40;
3017  tmp6 = mul_val1 + 48;
3018  tmp7 = mul_val1 + 56;
3019 
3020  res0 = res0 * tmp0;
3021  res1 = tmp2 * tmp0;
3022  res2 = tmp3 * tmp0;
3023  res0 += res3 * tmp1;
3024  res3 = tmp4 * tmp0;
3025  res1 += tmp5 * tmp1;
3026  res2 += tmp6 * tmp1;
3027  res3 += tmp7 * tmp1;
3028 
3029  res0 = __msa_srari_h(res0, 6);
3030  res1 = __msa_srari_h(res1, 6);
3031  res2 = __msa_srari_h(res2, 6);
3032  res3 = __msa_srari_h(res3, 6);
3033 
3034  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3035  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3036 
3037  ST_UB2(vec0, vec1, (left + 32), 16);
3038 
3039  left[63] = tmp1[0];
3040 
3041  top = filtered_top;
3042  } else {
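                /* Otherwise fall back to the regular [1 2 1]/4 reference
                 * smoothing, exactly as in the 16x16 path. */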
3043  filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3044  filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3045  for (i = 2 * 32 - 2; i >= 0; i--)
3046  filtered_left[i] = (left[i + 1] + 2 * left[i] +
3047  left[i - 1] + 2) >> 2;
3048  filtered_top[-1] =
3049  filtered_left[-1] =
3050  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3051  for (i = 2 * 32 - 2; i >= 0; i--)
3052  filtered_top[i] = (top[i + 1] + 2 * top[i] +
3053  top[i - 1] + 2) >> 2;
3054  left = filtered_left;
3055  top = filtered_top;
3056  }
3057  }
3058  }
3059  }
3060 
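    /* Dispatch: pred_planar[3]/pred_angular[3] are the 32x32 entries and
     * pred_dc is called with log2 size 5. */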
3061  switch (mode) {
3062  case INTRA_PLANAR:
3063  s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3064  (uint8_t *) left, stride);
3065  break;
3066  case INTRA_DC:
3067  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3068  (uint8_t *) left, stride, 5, c_idx);
3069  break;
3070  default:
3071  s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3072  (uint8_t *) left, stride, c_idx, mode);
3073  break;
3074  }
3075 }