FFmpeg
h264idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
#include "libavcodec/bit_depth_template.c"
24 
/* One pass of the H.264 4x4 inverse transform on four rows of vector lanes:
 * even part (in0 +/- in2) and odd part ((in1 >> 1) - in3, in1 + (in3 >> 1))
 * are combined with a butterfly into the four outputs. */
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    tmp0_m = in0 + in2; \
    tmp1_m = in0 - in2; \
    tmp2_m = in1 >> 1; \
    tmp2_m = tmp2_m - in3; \
    tmp3_m = in3 >> 1; \
    tmp3_m = in1 + tmp3_m; \
    \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3); \
}
38 
/*
 * Inverse 4x4 Hadamard transform of the 16 luma DC coefficients, followed
 * by dequantization with de_q_val and a rounding shift by 8; the results
 * are scattered into dst at DC_DEST_STRIDE (16) element spacing.
 *
 * NOTE(review): the scatter indices (0, 2, 8, 10, ...) presumably map each
 * DC term to the first coefficient of its 4x4 sub-block in the macroblock
 * coefficient layout — confirm against the caller in h264dec.
 */
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
                                     int32_t de_q_val)
{
#define DC_DEST_STRIDE 16
    int16_t out0, out1, out2, out3, out4, out5, out6, out7;
    v8i16 src1, src3;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
    const v4i32 de_q_vec = __msa_fill_w(de_q_val);
    const v8i16 src0 = LD_SH(src);
    const v8i16 src2 = LD_SH(src + 8);

    /* split the two loaded vectors into four 4-coefficient rows */
    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
    /* horizontal pass: transpose, then two butterfly stages */
    TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
    BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
    /* vertical pass on the transposed intermediate */
    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
    BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
    BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
    /* widen to 32 bit before dequantization to avoid overflow */
    UNPCK_R_SH_SW(vres0, vres0_r);
    UNPCK_R_SH_SW(vres1, vres1_r);
    UNPCK_R_SH_SW(vres2, vres2_r);
    UNPCK_R_SH_SW(vres3, vres3_r);

    vres0_r *= de_q_vec;
    vres1_r *= de_q_vec;
    vres2_r *= de_q_vec;
    vres3_r *= de_q_vec;

    /* rounding right shift by 8, then narrow back to 16 bit */
    SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
    PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);

    /* scatter the 16 DC values to their sub-block positions in dst */
    out0 = __msa_copy_s_h(vec0, 0);
    out1 = __msa_copy_s_h(vec0, 1);
    out2 = __msa_copy_s_h(vec0, 2);
    out3 = __msa_copy_s_h(vec0, 3);
    out4 = __msa_copy_s_h(vec0, 4);
    out5 = __msa_copy_s_h(vec0, 5);
    out6 = __msa_copy_s_h(vec0, 6);
    out7 = __msa_copy_s_h(vec0, 7);
    SH(out0, (dst + 0 * DC_DEST_STRIDE));
    SH(out1, (dst + 2 * DC_DEST_STRIDE));
    SH(out2, (dst + 8 * DC_DEST_STRIDE));
    SH(out3, (dst + 10 * DC_DEST_STRIDE));
    SH(out4, (dst + 1 * DC_DEST_STRIDE));
    SH(out5, (dst + 3 * DC_DEST_STRIDE));
    SH(out6, (dst + 9 * DC_DEST_STRIDE));
    SH(out7, (dst + 11 * DC_DEST_STRIDE));

    out0 = __msa_copy_s_h(vec1, 0);
    out1 = __msa_copy_s_h(vec1, 1);
    out2 = __msa_copy_s_h(vec1, 2);
    out3 = __msa_copy_s_h(vec1, 3);
    out4 = __msa_copy_s_h(vec1, 4);
    out5 = __msa_copy_s_h(vec1, 5);
    out6 = __msa_copy_s_h(vec1, 6);
    out7 = __msa_copy_s_h(vec1, 7);
    SH(out0, (dst + 4 * DC_DEST_STRIDE));
    SH(out1, (dst + 6 * DC_DEST_STRIDE));
    SH(out2, (dst + 12 * DC_DEST_STRIDE));
    SH(out3, (dst + 14 * DC_DEST_STRIDE));
    SH(out4, (dst + 5 * DC_DEST_STRIDE));
    SH(out5, (dst + 7 * DC_DEST_STRIDE));
    SH(out6, (dst + 13 * DC_DEST_STRIDE));
    SH(out7, (dst + 15 * DC_DEST_STRIDE));

#undef DC_DEST_STRIDE
}
110 
/*
 * 8x8 H.264 inverse transform of one block and addition of the residual to
 * an 8x8 destination region, clipped to [0, 255].  The coefficient buffer
 * is zeroed after loading.  Row pass is done at 16-bit precision; the
 * column pass widens to 32 bit, then a rounding shift by 6 is applied
 * (the +32 bias is folded into src[0] up front).
 */
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 zeros = { 0 };

    src[0] += 32;   /* fold the final-shift rounding bias into the DC term */

    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);

    /* row pass, even part */
    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);

    /* row pass, odd part */
    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                res0, res1, res2, res3, res4, res5, res6, res7);
    /* transpose, then widen each row into right/left 32-bit halves */
    TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
                       res0, res1, res2, res3, res4, res5, res6, res7);
    UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
    UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
    UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
    UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
    UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
    UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
    UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
    UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
    BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);

    /* column pass, even part (32-bit) */
    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
    BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);

    /* column pass, odd part (32-bit) */
    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    /* final butterflies, shift by 6, narrow back to 16 bit */
    BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
    BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
    BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
    BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
    SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
    SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
    SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
    SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
    PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                res0, res1, res2, res3);
    PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                res4, res5, res6, res7);
    /* add residual to destination pixels and clip */
    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
               tmp4, tmp5, tmp6, tmp7);
    ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
         res0, res1, res2, res3);
    ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
         res4, res5, res6, res7);
    CLIP_SH4_0_255(res0, res1, res2, res3);
    CLIP_SH4_0_255(res4, res5, res6, res7);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                dst0, dst1, dst2, dst3);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x4_UB(dst2, dst3, dst, dst_stride);
}
244 
/*
 * DC-only shortcut for the 8x8 inverse transform: when only src[0] is
 * non-zero the residual is the same value everywhere, so compute
 * (src[0] + 32) >> 6 once, add it to all 64 destination pixels and clip
 * to [0, 255].  src[0] is cleared afterwards.
 */
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
                                    int32_t dst_stride)
{
    int32_t dc_val;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dc;
    v16i8 zeros = { 0 };

    dc_val = (src[0] + 32) >> 6;
    dc = __msa_fill_h(dc_val);  /* broadcast the DC residual to all lanes */

    src[0] = 0;

    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    /* widen destination bytes to 16 bit so the add cannot wrap */
    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
               dst0_r, dst1_r, dst2_r, dst3_r);
    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
               dst4_r, dst5_r, dst6_r, dst7_r);
    ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
         dst0_r, dst1_r, dst2_r, dst3_r);
    ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
         dst4_r, dst5_r, dst6_r, dst7_r);
    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
    PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
                dst0, dst1, dst2, dst3);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x4_UB(dst2, dst3, dst, dst_stride);
}
276 
/**
 * Inverse 4x4 H.264 transform of one block and addition of the residual
 * to a 4x4 destination region, clipped to [0, 255].  The coefficient
 * buffer is zeroed after loading.
 *
 * @param dst         pointer to the top-left destination pixel
 * @param src         16 transform coefficients (cleared on return)
 * @param dst_stride  destination line stride in bytes
 */
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
    uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
    v16i8 dst0_m = { 0 };
    v16i8 dst1_m = { 0 };
    v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
    v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
    const v8i16 src0 = LD_SH(src);
    const v8i16 src2 = LD_SH(src + 8);
    const v8i16 zero = { 0 };
    const uint8_t *dst1 = dst + dst_stride;
    const uint8_t *dst2 = dst + 2 * dst_stride;
    const uint8_t *dst3 = dst + 3 * dst_stride;

    /* split the two loaded vectors into four 4-coefficient rows */
    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
    ST_SH2(zero, zero, src, 8);  /* clear the coefficient buffer */
    /* horizontal pass, transpose, vertical pass */
    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
    /* destination loads are interleaved with the shift to hide latency */
    src0_m = LW(dst);
    src1_m = LW(dst1);
    SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);  /* rounding shift by 6 */
    src2_m = LW(dst2);
    src3_m = LW(dst3);
    ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
    INSERT_W2_SB(src0_m, src1_m, dst0_m);
    INSERT_W2_SB(src2_m, src3_m, dst1_m);
    /* widen destination bytes, add residual, clip, repack */
    ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
    CLIP_SH2_0_255(res0_m, res1_m);
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
    SW(out0_m, dst);
    SW(out1_m, dst1);
    SW(out2_m, dst2);
    SW(out3_m, dst3);
}
317 
319  int32_t dst_stride)
320 {
321  avc_idct8_addblk_msa(dst, src, dst_stride);
322 }
323 
325  int32_t dst_stride)
326 {
327  v16u8 pred = { 0 };
328  v16i8 out;
329  v8i16 pred_r, pred_l;
330  const uint32_t src0 = LW(dst);
331  const uint32_t src1 = LW(dst + dst_stride);
332  const uint32_t src2 = LW(dst + 2 * dst_stride);
333  const uint32_t src3 = LW(dst + 3 * dst_stride);
334  const int16_t dc = (src[0] + 32) >> 6;
335  const v8i16 input_dc = __msa_fill_h(dc);
336 
337  src[0] = 0;
338  INSERT_W4_UB(src0, src1, src2, src3, pred);
339  UNPCK_UB_SH(pred, pred_r, pred_l);
340  ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
341  CLIP_SH2_0_255(pred_r, pred_l);
342  out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
343  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
344 }
345 
347  int32_t dst_stride)
348 {
349  avc_idct8_dc_addblk_msa(dst, src, dst_stride);
350 }
351 
353  const int32_t *blk_offset,
354  int16_t *block, int32_t dst_stride,
355  const uint8_t nzc[15 * 8])
356 {
357  int32_t i;
358 
359  for (i = 0; i < 16; i++) {
360  int32_t nnz = nzc[scan8[i]];
361 
362  if (nnz) {
363  if (nnz == 1 && ((dctcoef *) block)[i * 16])
364  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
365  block + i * 16 * sizeof(pixel),
366  dst_stride);
367  else
368  ff_h264_idct_add_msa(dst + blk_offset[i],
369  block + i * 16 * sizeof(pixel),
370  dst_stride);
371  }
372  }
373 }
374 
375 void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
376  int16_t *block, int32_t dst_stride,
377  const uint8_t nzc[15 * 8])
378 {
379  int32_t cnt;
380 
381  for (cnt = 0; cnt < 16; cnt += 4) {
382  int32_t nnz = nzc[scan8[cnt]];
383 
384  if (nnz) {
385  if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
386  ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt],
387  block + cnt * 16 * sizeof(pixel),
388  dst_stride);
389  else
390  ff_h264_idct8_addblk_msa(dst + blk_offset[cnt],
391  block + cnt * 16 * sizeof(pixel),
392  dst_stride);
393  }
394  }
395 }
396 
398  const int32_t *blk_offset,
399  int16_t *block, int32_t dst_stride,
400  const uint8_t nzc[15 * 8])
401 {
402  int32_t i, j;
403 
404  for (j = 1; j < 3; j++) {
405  for (i = (j * 16); i < (j * 16 + 4); i++) {
406  if (nzc[scan8[i]])
407  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
408  block + i * 16 * sizeof(pixel),
409  dst_stride);
410  else if (((dctcoef *) block)[i * 16])
411  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
412  block + i * 16 * sizeof(pixel),
413  dst_stride);
414  }
415  }
416 }
417 
419  const int32_t *blk_offset,
420  int16_t *block, int32_t dst_stride,
421  const uint8_t nzc[15 * 8])
422 {
423  int32_t i, j;
424 
425  for (j = 1; j < 3; j++) {
426  for (i = (j * 16); i < (j * 16 + 4); i++) {
427  if (nzc[scan8[i]])
428  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
429  block + i * 16 * sizeof(pixel),
430  dst_stride);
431  else if (((dctcoef *) block)[i * 16])
432  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
433  block + i * 16 * sizeof(pixel),
434  dst_stride);
435  }
436  }
437 
438  for (j = 1; j < 3; j++) {
439  for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
440  if (nzc[scan8[i + 4]])
441  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4],
442  block + i * 16 * sizeof(pixel),
443  dst_stride);
444  else if (((dctcoef *) block)[i * 16])
445  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4],
446  block + i * 16 * sizeof(pixel),
447  dst_stride);
448  }
449  }
450 }
451 
453  const int32_t *blk_offset,
454  int16_t *block,
455  int32_t dst_stride,
456  const uint8_t nzc[15 * 8])
457 {
458  int32_t i;
459 
460  for (i = 0; i < 16; i++) {
461  if (nzc[scan8[i]])
462  ff_h264_idct_add_msa(dst + blk_offset[i],
463  block + i * 16 * sizeof(pixel), dst_stride);
464  else if (((dctcoef *) block)[i * 16])
465  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
466  block + i * 16 * sizeof(pixel),
467  dst_stride);
468  }
469 }
470 
471 void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
472  int32_t de_qval)
473 {
474  avc_deq_idct_luma_dc_msa(dst, src, de_qval);
475 }
void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:397
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:245
#define SRARI_W4_SW(...)
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_q_val)
Definition: h264idct_msa.c:39
#define LW(psrc)
#define src
Definition: vp8dsp.c:254
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: h264idct_msa.c:25
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define SRA_4V(in0, in1, in2, in3, shift)
The exact code depends on how similar the blocks are and how related they are to the block
uint8_t
#define ST_SH2(...)
#define UNPCK_UB_SH(in, out0, out1)
#define SRARI_H4_SH(...)
#define LD_SH(...)
#define LD_SB8(...)
#define CLIP_SH2_0_255(in0, in1)
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)
#define PCKEV_B2_SB(...)
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:346
#define UNPCK_R_SH_SW(in, out)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:277
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:318
#define zero
Definition: regdef.h:64
void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:418
#define ILVR_B2_SH(...)
void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:452
#define TRANSPOSE8x8_SH_SH(...)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define LD_SH8(...)
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_qval)
Definition: h264idct_msa.c:471
void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:375
int32_t
#define DC_DEST_STRIDE
#define PCKEV_H2_SH(...)
#define PCKEV_B4_SB(...)
#define ILVL_D2_SH(...)
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
#define INSERT_W4_UB(...)
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:324
static const float pred[4]
Definition: siprdata.h:259
#define src1
Definition: h264pred.c:139
#define UNPCK_SH_SW(in, out0, out1)
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:111
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff)*mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
static const uint8_t scan8[16 *3+3]
Definition: h264dec.h:644
#define dctcoef
#define ILVR_B4_SH(...)
#define src0
Definition: h264pred.c:138
#define ADD2(in0, in1, in2, in3, out0, out1)
#define SH(val, pdst)
#define INSERT_W2_SB(...)
uint8_t pixel
Definition: tiny_ssim.c:42
#define SW(val, pdst)
#define ST_SH8(...)
#define ST8x4_UB(in0, in1, pdst, stride)
void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:352
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)
FILE * out
Definition: movenc.c:54
#define PCKEV_H4_SH(...)
#define ILVR_D2_SH(...)