FFmpeg
h264idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
#include "libavcodec/bit_depth_template.c"

/* One 1-D pass of the H.264 4-point inverse transform on four v8i16 rows:
 *   tmp0 = in0 + in2
 *   tmp1 = in0 - in2
 *   tmp2 = (in1 >> 1) - in3
 *   tmp3 = in1 + (in3 >> 1)
 * followed by a butterfly producing out0..out3.  The >> 1 terms implement
 * the transform's half-magnitude coefficients without multiplies. */
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)          \
{                                                                         \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                          \
    /* even part */                                                       \
    tmp0_m = in0 + in2;                                                   \
    tmp1_m = in0 - in2;                                                   \
    /* odd part */                                                        \
    tmp2_m = in1 >> 1;                                                    \
    tmp2_m = tmp2_m - in3;                                                \
    tmp3_m = in3 >> 1;                                                    \
    tmp3_m = in1 + tmp3_m;                                                \
                                                                          \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
}
38 
/* Inverse-transform and dequantize the 16 luma DC coefficients of a
 * macroblock.
 *
 * dst      - macroblock coefficient buffer; each 4x4 sub-block occupies
 *            DC_DEST_STRIDE (16) int16_t, and the resulting DC of
 *            sub-block n is written at dst[n * DC_DEST_STRIDE] in the
 *            scatter order of the SH() stores below.
 * src      - the 16 input DC values (read as two v8i16 loads).
 * de_q_val - dequantization multiplier applied after both transform
 *            passes, followed by a rounding shift of 8.
 */
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
                                     int32_t de_q_val)
{
#define DC_DEST_STRIDE 16
    int16_t out0, out1, out2, out3, out4, out5, out6, out7;
    v8i16 src1, src3;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
    const v4i32 de_q_vec = __msa_fill_w(de_q_val);
    const v8i16 src0 = LD_SH(src);
    const v8i16 src2 = LD_SH(src + 8);

    /* split the two 8-element loads into four 4-element rows */
    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
    TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    /* horizontal pass: pure +/- butterflies (the DC transform has no
     * half-magnitude coefficients, unlike AVC_ITRANS_H) */
    BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
    BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
    /* vertical pass */
    BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
    BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
    /* widen to 32 bits before the dequantization multiply */
    UNPCK_R_SH_SW(vres0, vres0_r);
    UNPCK_R_SH_SW(vres1, vres1_r);
    UNPCK_R_SH_SW(vres2, vres2_r);
    UNPCK_R_SH_SW(vres3, vres3_r);

    vres0_r *= de_q_vec;
    vres1_r *= de_q_vec;
    vres2_r *= de_q_vec;
    vres3_r *= de_q_vec;

    /* rounding shift by 8, then narrow back to 16 bits */
    SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
    PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);

    /* scatter the results to the DC slots of the corresponding 4x4
     * sub-blocks inside the macroblock coefficient buffer */
    out0 = __msa_copy_s_h(vec0, 0);
    out1 = __msa_copy_s_h(vec0, 1);
    out2 = __msa_copy_s_h(vec0, 2);
    out3 = __msa_copy_s_h(vec0, 3);
    out4 = __msa_copy_s_h(vec0, 4);
    out5 = __msa_copy_s_h(vec0, 5);
    out6 = __msa_copy_s_h(vec0, 6);
    out7 = __msa_copy_s_h(vec0, 7);
    SH(out0, (dst + 0 * DC_DEST_STRIDE));
    SH(out1, (dst + 2 * DC_DEST_STRIDE));
    SH(out2, (dst + 8 * DC_DEST_STRIDE));
    SH(out3, (dst + 10 * DC_DEST_STRIDE));
    SH(out4, (dst + 1 * DC_DEST_STRIDE));
    SH(out5, (dst + 3 * DC_DEST_STRIDE));
    SH(out6, (dst + 9 * DC_DEST_STRIDE));
    SH(out7, (dst + 11 * DC_DEST_STRIDE));

    out0 = __msa_copy_s_h(vec1, 0);
    out1 = __msa_copy_s_h(vec1, 1);
    out2 = __msa_copy_s_h(vec1, 2);
    out3 = __msa_copy_s_h(vec1, 3);
    out4 = __msa_copy_s_h(vec1, 4);
    out5 = __msa_copy_s_h(vec1, 5);
    out6 = __msa_copy_s_h(vec1, 6);
    out7 = __msa_copy_s_h(vec1, 7);
    SH(out0, (dst + 4 * DC_DEST_STRIDE));
    SH(out1, (dst + 6 * DC_DEST_STRIDE));
    SH(out2, (dst + 12 * DC_DEST_STRIDE));
    SH(out3, (dst + 14 * DC_DEST_STRIDE));
    SH(out4, (dst + 5 * DC_DEST_STRIDE));
    SH(out5, (dst + 7 * DC_DEST_STRIDE));
    SH(out6, (dst + 13 * DC_DEST_STRIDE));
    SH(out7, (dst + 15 * DC_DEST_STRIDE));

#undef DC_DEST_STRIDE
}
110 
/* Full 8x8 H.264 inverse transform, residual add and clip.
 *
 * Performs the row pass on v8i16 data, transposes, widens to 32 bits
 * and performs the column pass on right/left halves (_r/_l), then
 * descales by >> 6, adds the residual to the 8 destination rows and
 * clips to [0, 255].  The coefficient block is cleared as a side
 * effect (the coefficients are consumed).
 */
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 zeros = { 0 };

    /* fold the rounding bias for the final >> 6 into the DC term */
    src[0] += 32;

    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    /* coefficients are consumed: zero the block for the next use */
    ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);

    /* even part of the row pass */
    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);

    /* odd part of the row pass */
    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                res0, res1, res2, res3, res4, res5, res6, res7);
    /* transpose so the column pass can operate on rows again */
    TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
                       res0, res1, res2, res3, res4, res5, res6, res7);
    /* widen to 32 bits (right/left halves) to keep full precision */
    UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
    UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
    UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
    UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
    UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
    UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
    UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
    UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
    /* even part of the column pass */
    BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
    BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);

    /* odd part of the column pass (mirrors the row pass above) */
    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    /* final even/odd recombination */
    BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
    BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
    BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
    BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
    /* descale (the +32 bias was added to the DC up front) */
    SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
    SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
    SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
    SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
    /* narrow back to 16 bits */
    PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                res0, res1, res2, res3);
    PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                res4, res5, res6, res7);
    /* add residual to the 8 destination rows, clip, and store */
    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
               tmp4, tmp5, tmp6, tmp7);
    ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
         res0, res1, res2, res3);
    ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
         res4, res5, res6, res7);
    CLIP_SH4_0_255(res0, res1, res2, res3);
    CLIP_SH4_0_255(res4, res5, res6, res7);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                dst0, dst1, dst2, dst3);
    /* no trailing ';': ST_D8 expands to a braced statement block */
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
242 
/* DC-only 8x8 inverse transform: when the only non-zero coefficient is
 * the DC, the whole IDCT collapses to adding the rounded DC value
 * ((src[0] + 32) >> 6) to every pixel of the 8x8 destination block,
 * with clipping to [0, 255].  src[0] is cleared (coefficient consumed). */
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
                                    int32_t dst_stride)
{
    int32_t dc_val;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dc;
    v16i8 zeros = { 0 };

    /* rounded descale of the DC coefficient */
    dc_val = (src[0] + 32) >> 6;
    dc = __msa_fill_h(dc_val);

    src[0] = 0;

    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    /* zero-extend each destination row to 16 bits */
    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
               dst0_r, dst1_r, dst2_r, dst3_r);
    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
               dst4_r, dst5_r, dst6_r, dst7_r);
    ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
         dst0_r, dst1_r, dst2_r, dst3_r);
    ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
         dst4_r, dst5_r, dst6_r, dst7_r);
    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
    PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
                dst0, dst1, dst2, dst3);
    /* no trailing ';': ST_D8 expands to a braced statement block */
    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
272 
/* 4x4 inverse transform and residual add.
 *
 * Runs AVC_ITRANS_H on rows, transposes, runs it on columns, applies a
 * rounding shift of 6, adds the residual to four 4-byte destination
 * rows and clips to [0, 255].  The 16 coefficients in src are zeroed
 * as a side effect (ST_SH2 of zero vectors). */
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
    uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
    v16i8 dst0_m = { 0 };
    v16i8 dst1_m = { 0 };
    v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
    v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
    const v8i16 src0 = LD_SH(src);
    const v8i16 src2 = LD_SH(src + 8);
    const v8i16 zero = { 0 };
    /* NOTE(review): these rows are written through below via SW(); the
     * const applies to the pointee type even though the macro stores
     * through a cast — matches upstream usage. */
    const uint8_t *dst1 = dst + dst_stride;
    const uint8_t *dst2 = dst + 2 * dst_stride;
    const uint8_t *dst3 = dst + 3 * dst_stride;

    /* split the two 8-element loads into four 4-element rows */
    ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
    /* coefficients consumed: clear the block */
    ST_SH2(zero, zero, src, 8);
    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
    /* destination loads interleaved with arithmetic to hide latency */
    src0_m = LW(dst);
    src1_m = LW(dst1);
    SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
    src2_m = LW(dst2);
    src3_m = LW(dst3);
    ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
    INSERT_W2_SB(src0_m, src1_m, dst0_m);
    INSERT_W2_SB(src2_m, src3_m, dst1_m);
    /* zero-extend destination bytes, add residual, clip */
    ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
    CLIP_SH2_0_255(res0_m, res1_m);
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
    /* store one 32-bit word (4 pixels) per destination row */
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
    SW(out0_m, dst);
    SW(out1_m, dst1);
    SW(out2_m, dst2);
    SW(out3_m, dst3);
}
313 
315  int32_t dst_stride)
316 {
317  avc_idct8_addblk_msa(dst, src, dst_stride);
318 }
319 
321  int32_t dst_stride)
322 {
323  v16u8 pred = { 0 };
324  v16i8 out;
325  v8i16 pred_r, pred_l;
326  const uint32_t src0 = LW(dst);
327  const uint32_t src1 = LW(dst + dst_stride);
328  const uint32_t src2 = LW(dst + 2 * dst_stride);
329  const uint32_t src3 = LW(dst + 3 * dst_stride);
330  const int16_t dc = (src[0] + 32) >> 6;
331  const v8i16 input_dc = __msa_fill_h(dc);
332 
333  src[0] = 0;
334  INSERT_W4_UB(src0, src1, src2, src3, pred);
335  UNPCK_UB_SH(pred, pred_r, pred_l);
336  ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
337  CLIP_SH2_0_255(pred_r, pred_l);
338  out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
339  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
340 }
341 
343  int32_t dst_stride)
344 {
345  avc_idct8_dc_addblk_msa(dst, src, dst_stride);
346 }
347 
349  const int32_t *blk_offset,
350  int16_t *block, int32_t dst_stride,
351  const uint8_t nzc[15 * 8])
352 {
353  int32_t i;
354 
355  for (i = 0; i < 16; i++) {
356  int32_t nnz = nzc[scan8[i]];
357 
358  if (nnz) {
359  if (nnz == 1 && ((dctcoef *) block)[i * 16])
360  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
361  block + i * 16 * sizeof(pixel),
362  dst_stride);
363  else
364  ff_h264_idct_add_msa(dst + blk_offset[i],
365  block + i * 16 * sizeof(pixel),
366  dst_stride);
367  }
368  }
369 }
370 
371 void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
372  int16_t *block, int32_t dst_stride,
373  const uint8_t nzc[15 * 8])
374 {
375  int32_t cnt;
376 
377  for (cnt = 0; cnt < 16; cnt += 4) {
378  int32_t nnz = nzc[scan8[cnt]];
379 
380  if (nnz) {
381  if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
382  ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt],
383  block + cnt * 16 * sizeof(pixel),
384  dst_stride);
385  else
386  ff_h264_idct8_addblk_msa(dst + blk_offset[cnt],
387  block + cnt * 16 * sizeof(pixel),
388  dst_stride);
389  }
390  }
391 }
392 
394  const int32_t *blk_offset,
395  int16_t *block, int32_t dst_stride,
396  const uint8_t nzc[15 * 8])
397 {
398  int32_t i, j;
399 
400  for (j = 1; j < 3; j++) {
401  for (i = (j * 16); i < (j * 16 + 4); i++) {
402  if (nzc[scan8[i]])
403  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
404  block + i * 16 * sizeof(pixel),
405  dst_stride);
406  else if (((dctcoef *) block)[i * 16])
407  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
408  block + i * 16 * sizeof(pixel),
409  dst_stride);
410  }
411  }
412 }
413 
415  const int32_t *blk_offset,
416  int16_t *block, int32_t dst_stride,
417  const uint8_t nzc[15 * 8])
418 {
419  int32_t i, j;
420 
421  for (j = 1; j < 3; j++) {
422  for (i = (j * 16); i < (j * 16 + 4); i++) {
423  if (nzc[scan8[i]])
424  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
425  block + i * 16 * sizeof(pixel),
426  dst_stride);
427  else if (((dctcoef *) block)[i * 16])
428  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
429  block + i * 16 * sizeof(pixel),
430  dst_stride);
431  }
432  }
433 
434  for (j = 1; j < 3; j++) {
435  for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
436  if (nzc[scan8[i + 4]])
437  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4],
438  block + i * 16 * sizeof(pixel),
439  dst_stride);
440  else if (((dctcoef *) block)[i * 16])
441  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4],
442  block + i * 16 * sizeof(pixel),
443  dst_stride);
444  }
445  }
446 }
447 
449  const int32_t *blk_offset,
450  int16_t *block,
451  int32_t dst_stride,
452  const uint8_t nzc[15 * 8])
453 {
454  int32_t i;
455 
456  for (i = 0; i < 16; i++) {
457  if (nzc[scan8[i]])
458  ff_h264_idct_add_msa(dst + blk_offset[i],
459  block + i * 16 * sizeof(pixel), dst_stride);
460  else if (((dctcoef *) block)[i * 16])
461  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
462  block + i * 16 * sizeof(pixel),
463  dst_stride);
464  }
465 }
466 
/* Public entry point: dequantize and inverse-transform the luma DC
 * coefficients (thin wrapper around the static MSA implementation). */
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
                                  int32_t de_qval)
{
    avc_deq_idct_luma_dc_msa(dst, src, de_qval);
}
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:458
ff_h264_idct_add16_msa
void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:348
out
FILE * out
Definition: movenc.c:54
PCKEV_B4_SB
#define PCKEV_B4_SB(...)
Definition: generic_macros_msa.h:1787
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2116
avc_deq_idct_luma_dc_msa
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_q_val)
Definition: h264idct_msa.c:39
AVC_ITRANS_H
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: h264idct_msa.c:25
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:39
ff_h264_idct8_dc_addblk_msa
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:342
ADD4
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2172
ff_h264_idct_add_msa
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:273
SH
#define SH(val, pdst)
Definition: generic_macros_msa.h:156
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1194
ADD2
#define ADD2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2167
generic_macros_msa.h
dctcoef
#define dctcoef
Definition: bit_depth_template.c:84
TRANSPOSE8x8_SH_SH
#define TRANSPOSE8x8_SH_SH(...)
Definition: generic_macros_msa.h:2578
src
#define src
Definition: vp8dsp.c:254
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:169
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1988
ff_h264_idct_add16_intra_msa
void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:448
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1817
ILVL_D2_SH
#define ILVL_D2_SH(...)
Definition: generic_macros_msa.h:1528
int32_t
int32_t
Definition: audio_convert.c:194
ST_D8
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:511
ff_h264_idct8_addblk_msa
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:314
ff_h264_deq_idct_luma_dc_msa
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_qval)
Definition: h264idct_msa.c:467
h264dsp_mips.h
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
pixel
uint8_t pixel
Definition: tiny_ssim.c:42
UNPCK_SH_SW
#define UNPCK_SH_SW(in, out0, out1)
Definition: generic_macros_msa.h:2295
ff_h264_idct_add8_422_msa
void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:414
avc_idct8_addblk_msa
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:111
TRANSPOSE4x4_SH_SH
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2539
bit_depth_template.c
CLIP_SH2_0_255
#define CLIP_SH2_0_255(in0, in1)
Definition: generic_macros_msa.h:966
LW
#define LW(psrc)
Definition: generic_macros_msa.h:108
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
ST_SH8
#define ST_SH8(...)
Definition: generic_macros_msa.h:392
src0
#define src0
Definition: h264pred.c:138
src1
#define src1
Definition: h264pred.c:139
CLIP_SH4_0_255
#define CLIP_SH4_0_255(in0, in1, in2, in3)
Definition: generic_macros_msa.h:971
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:339
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
ff_h264_idct8_add4_msa
void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:371
DC_DEST_STRIDE
#define DC_DEST_STRIDE
BUTTERFLY_8
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7)
Definition: generic_macros_msa.h:2335
uint8_t
uint8_t
Definition: audio_convert.c:194
UNPCK_UB_SH
#define UNPCK_UB_SH(in, out0, out1)
Definition: generic_macros_msa.h:2270
INSERT_W4_UB
#define INSERT_W4_UB(...)
Definition: generic_macros_msa.h:1203
ff_h264_idct4x4_addblk_dc_msa
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:320
pred
static const float pred[4]
Definition: siprdata.h:259
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1494
scan8
static const uint8_t scan8[16 *3+3]
Definition: h264dec.h:644
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:337
SRARI_W4_SW
#define SRARI_W4_SW(...)
Definition: generic_macros_msa.h:2141
BUTTERFLY_4
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2321
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1412
ff_h264_idct_add8_msa
void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:393
zero
#define zero
Definition: regdef.h:64
UNPCK_R_SH_SW
#define UNPCK_R_SH_SW(in, out)
Definition: generic_macros_msa.h:2227
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1768
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1390
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1808
avc_idct8_dc_addblk_msa
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:243