h264idct_lasx.c
/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"
#include "libavcodec/bit_depth_template.c"

#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)   \
{                                                                  \
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                        \
                                                                   \
    tmp0_m = __lasx_xvadd_h(in0, in2);                             \
    tmp1_m = __lasx_xvsub_h(in0, in2);                             \
    tmp2_m = __lasx_xvsrai_h(in1, 1);                              \
    tmp2_m = __lasx_xvsub_h(tmp2_m, in3);                          \
    tmp3_m = __lasx_xvsrai_h(in3, 1);                              \
    tmp3_m = __lasx_xvadd_h(in1, tmp3_m);                          \
                                                                   \
    LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m,             \
                       out0, out1, out2, out3);                    \
}
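
/*
 * Minimal scalar sketch of the same 4-point inverse transform, kept here
 * for reference only; the helper name is illustrative, not part of FFmpeg.
 * Each AVC_ITRANS_H invocation applies this arithmetic to all 16-bit
 * lanes of the vectors in parallel.
 */
#if 0
static void itrans4_ref(const int16_t in[4], int16_t out[4])
{
    int t0 = in[0] + in[2];           /* even part */
    int t1 = in[0] - in[2];
    int t2 = (in[1] >> 1) - in[3];    /* odd part */
    int t3 = in[1] + (in[3] >> 1);

    out[0] = t0 + t3;                 /* butterfly, as LASX_BUTTERFLY_4_H */
    out[1] = t1 + t2;
    out[2] = t1 - t2;
    out[3] = t0 - t3;
}
#endif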

void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
    __m256i src0_m, src1_m, src2_m, src3_m;
    __m256i dst0_m, dst1_m;
    __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
    __m256i inp0_m, inp1_m, res0_m, src1, src3;
    __m256i src0 = __lasx_xvld(src, 0);
    __m256i src2 = __lasx_xvld(src, 16);
    __m256i zero = __lasx_xvldi(0);
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    __lasx_xvst(zero, src, 0);
    DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
    /* horizontal pass, transpose, vertical pass */
    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
    /* load the four 4-pixel destination rows */
    DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
              0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
    DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
    inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
    inp0_m = __lasx_xvsrari_h(inp0_m, 6);    /* rounded (x + 32) >> 6 */
    DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
    dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
    res0_m = __lasx_vext2xv_hu_bu(dst0_m);
    res0_m = __lasx_xvadd_h(res0_m, inp0_m);
    res0_m = __lasx_xvclip255_h(res0_m);
    dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
    __lasx_xvstelm_w(dst0_m, dst, 0, 0);
    __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
    __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
    __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
}
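
/*
 * Sketch of how these routines are typically wired up at init time (the
 * real hookup lives in libavcodec/loongarch/h264dsp_init_loongarch.c);
 * the guard and field names below follow FFmpeg's H264DSPContext and are
 * shown as an illustration, not copied from that file.
 */
#if 0
    if (have_lasx(av_get_cpu_flags()) && bit_depth == 8) {
        c->h264_idct_add  = ff_h264_idct_add_lasx;
        c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
    }
#endif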

void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
                               int32_t dst_stride)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i vec0, vec1, vec2, vec3;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i res0, res1, res2, res3, res4, res5, res6, res7;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i zero = __lasx_xvldi(0);
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;

    src[0] += 32;    /* fold the final rounding offset into the DC term */
    DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
              src4, src5, src6, src7);
    __lasx_xvst(zero, src, 0);
    __lasx_xvst(zero, src, 32);
    __lasx_xvst(zero, src, 64);
    __lasx_xvst(zero, src, 96);

    /* first (horizontal) pass at 16-bit precision: even part */
    vec0 = __lasx_xvadd_h(src0, src4);
    vec1 = __lasx_xvsub_h(src0, src4);
    vec2 = __lasx_xvsrai_h(src2, 1);
    vec2 = __lasx_xvsub_h(vec2, src6);
    vec3 = __lasx_xvsrai_h(src6, 1);
    vec3 = __lasx_xvadd_h(src2, vec3);

    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);

    /* odd part */
    vec0 = __lasx_xvsrai_h(src7, 1);
    vec0 = __lasx_xvsub_h(src5, vec0);
    vec0 = __lasx_xvsub_h(vec0, src3);
    vec0 = __lasx_xvsub_h(vec0, src7);

    vec1 = __lasx_xvsrai_h(src3, 1);
    vec1 = __lasx_xvsub_h(src1, vec1);
    vec1 = __lasx_xvadd_h(vec1, src7);
    vec1 = __lasx_xvsub_h(vec1, src3);

    vec2 = __lasx_xvsrai_h(src5, 1);
    vec2 = __lasx_xvsub_h(vec2, src1);
    vec2 = __lasx_xvadd_h(vec2, src7);
    vec2 = __lasx_xvadd_h(vec2, src5);

    vec3 = __lasx_xvsrai_h(src1, 1);
    vec3 = __lasx_xvadd_h(src3, vec3);
    vec3 = __lasx_xvadd_h(vec3, src5);
    vec3 = __lasx_xvadd_h(vec3, src1);

    tmp4 = __lasx_xvsrai_h(vec3, 2);
    tmp4 = __lasx_xvadd_h(tmp4, vec0);
    tmp5 = __lasx_xvsrai_h(vec2, 2);
    tmp5 = __lasx_xvadd_h(tmp5, vec1);
    tmp6 = __lasx_xvsrai_h(vec1, 2);
    tmp6 = __lasx_xvsub_h(tmp6, vec2);
    tmp7 = __lasx_xvsrai_h(vec0, 2);
    tmp7 = __lasx_xvsub_h(vec3, tmp7);

    LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                       res0, res1, res2, res3, res4, res5, res6, res7);
    LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7);

    /* second (vertical) pass, widened to 32 bit to avoid overflow */
    DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
              tmp4, tmp5, tmp6, tmp7);
    vec0 = __lasx_xvadd_w(tmp0, tmp4);
    vec1 = __lasx_xvsub_w(tmp0, tmp4);

    vec2 = __lasx_xvsrai_w(tmp2, 1);
    vec2 = __lasx_xvsub_w(vec2, tmp6);
    vec3 = __lasx_xvsrai_w(tmp6, 1);
    vec3 = __lasx_xvadd_w(vec3, tmp2);

    tmp0 = __lasx_xvadd_w(vec0, vec3);
    tmp2 = __lasx_xvadd_w(vec1, vec2);
    tmp4 = __lasx_xvsub_w(vec1, vec2);
    tmp6 = __lasx_xvsub_w(vec0, vec3);

    /* odd part */
    vec0 = __lasx_xvsrai_w(tmp7, 1);
    vec0 = __lasx_xvsub_w(tmp5, vec0);
    vec0 = __lasx_xvsub_w(vec0, tmp3);
    vec0 = __lasx_xvsub_w(vec0, tmp7);

    vec1 = __lasx_xvsrai_w(tmp3, 1);
    vec1 = __lasx_xvsub_w(tmp1, vec1);
    vec1 = __lasx_xvadd_w(vec1, tmp7);
    vec1 = __lasx_xvsub_w(vec1, tmp3);

    vec2 = __lasx_xvsrai_w(tmp5, 1);
    vec2 = __lasx_xvsub_w(vec2, tmp1);
    vec2 = __lasx_xvadd_w(vec2, tmp7);
    vec2 = __lasx_xvadd_w(vec2, tmp5);

    vec3 = __lasx_xvsrai_w(tmp1, 1);
    vec3 = __lasx_xvadd_w(tmp3, vec3);
    vec3 = __lasx_xvadd_w(vec3, tmp5);
    vec3 = __lasx_xvadd_w(vec3, tmp1);

    tmp1 = __lasx_xvsrai_w(vec3, 2);
    tmp1 = __lasx_xvadd_w(tmp1, vec0);
    tmp3 = __lasx_xvsrai_w(vec2, 2);
    tmp3 = __lasx_xvadd_w(tmp3, vec1);
    tmp5 = __lasx_xvsrai_w(vec1, 2);
    tmp5 = __lasx_xvsub_w(tmp5, vec2);
    tmp7 = __lasx_xvsrai_w(vec0, 2);
    tmp7 = __lasx_xvsub_w(vec3, tmp7);

    LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
    LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);

    /* >> 6 completes the (x + 32) >> 6 rounding (the +32 was folded
     * into src[0] above), then pack back to 16 bit */
    DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
              res0, res1, res2, res3);
    DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
              res4, res5, res6, res7);
    DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
              res6, res0, res1, res2, res3);
    DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
              res0, res1, res2, res3);

    /* add to the 8x8 prediction block and clip to 8 bit */
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
    dst += dst_stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
    dst -= dst_stride_4x;
    DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
              dst4, dst5, dst6, dst7);
    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
    res0 = __lasx_xvadd_h(res0, dst0);
    res1 = __lasx_xvadd_h(res1, dst1);
    res2 = __lasx_xvadd_h(res2, dst2);
    res3 = __lasx_xvadd_h(res3, dst3);
    DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
              res2, res3);
    DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
    __lasx_xvstelm_d(res0, dst, 0, 0);
    __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(res1, dst, 0, 0);
    __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
}
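
/*
 * Scalar reference for one pass of the 8-point inverse transform used
 * above; illustrative only (names are not part of FFmpeg).  The vector
 * code runs the first pass at 16-bit precision and widens to 32 bit
 * (vext2xv_w_h) for the second pass so the intermediate sums cannot
 * overflow before the final >> 6.
 */
#if 0
static void itrans8_ref(const int32_t in[8], int32_t out[8])
{
    /* even part */
    int32_t a0 = in[0] + in[4];
    int32_t a1 = in[0] - in[4];
    int32_t a2 = (in[2] >> 1) - in[6];
    int32_t a3 = in[2] + (in[6] >> 1);
    int32_t e0 = a0 + a3, e1 = a1 + a2, e2 = a1 - a2, e3 = a0 - a3;
    /* odd part */
    int32_t b0 = in[5] - (in[7] >> 1) - in[3] - in[7];
    int32_t b1 = in[1] - (in[3] >> 1) + in[7] - in[3];
    int32_t b2 = (in[5] >> 1) - in[1] + in[7] + in[5];
    int32_t b3 = in[3] + (in[1] >> 1) + in[5] + in[1];
    int32_t o0 = b0 + (b3 >> 2);
    int32_t o1 = b1 + (b2 >> 2);
    int32_t o2 = (b1 >> 2) - b2;
    int32_t o3 = b3 - (b0 >> 2);

    out[0] = e0 + o3; out[1] = e1 + o2; out[2] = e2 + o1; out[3] = e3 + o0;
    out[4] = e3 - o0; out[5] = e2 - o1; out[6] = e1 - o2; out[7] = e0 - o3;
}
#endif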

void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
                                    int32_t dst_stride)
{
    const int16_t dc = (src[0] + 32) >> 6;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m256i pred, out;
    __m256i src0, src1, src2, src3;
    __m256i input_dc = __lasx_xvreplgr2vr_h(dc);

    src[0] = 0;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, src0, src1, src2, src3);
    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);

    pred = __lasx_xvpermi_q(src0, src1, 0x02);
    pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
    pred = __lasx_xvclip255_h(pred);
    out  = __lasx_xvpickev_b(pred, pred);
    __lasx_xvstelm_w(out, dst, 0, 0);
    __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
    __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
    __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
}
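
/*
 * Scalar equivalent of the DC-only path, for reference (the helper name
 * is hypothetical): one rounded DC value is added to every pixel of the
 * 4x4 block and the result is clipped to 8 bit.
 */
#if 0
static void idct4x4_dc_add_ref(uint8_t *dst, int16_t *src, int stride)
{
    int dc = (src[0] + 32) >> 6;
    int x, y;

    src[0] = 0;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[y * stride + x] = av_clip_uint8(dst[y * stride + x] + dc);
}
#endif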

void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
                                  int32_t dst_stride)
{
    int32_t dc_val;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i dc;

    dc_val = (src[0] + 32) >> 6;
    dc = __lasx_xvreplgr2vr_h(dc_val);

    src[0] = 0;

    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
    dst += dst_stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
    dst -= dst_stride_4x;
    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
              dst0, dst1, dst2, dst3);
    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
              dst4, dst5, dst6, dst7);
    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
    dst0 = __lasx_xvadd_h(dst0, dc);
    dst1 = __lasx_xvadd_h(dst1, dc);
    dst2 = __lasx_xvadd_h(dst2, dc);
    dst3 = __lasx_xvadd_h(dst3, dc);
    DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
              dst0, dst1, dst2, dst3);
    DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
}

void ff_h264_idct_add16_lasx(uint8_t *dst,
                             const int32_t *blk_offset,
                             int16_t *block, int32_t dst_stride,
                             const uint8_t nzc[15 * 8])
{
    int32_t i;

    for (i = 0; i < 16; i++) {
        int32_t nnz = nzc[scan8[i]];

        if (nnz) {
            if (nnz == 1 && ((dctcoef *) block)[i * 16])
                ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
                                               block + i * 16 * sizeof(pixel),
                                               dst_stride);
            else
                ff_h264_idct_add_lasx(dst + blk_offset[i],
                                      block + i * 16 * sizeof(pixel),
                                      dst_stride);
        }
    }
}
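
/*
 * Dispatch note for the wrappers in this file: nzc[] holds the per-block
 * non-zero coefficient counts, indexed through scan8[].  A count of 1
 * together with a non-zero DC coefficient means only the DC term survived
 * quantization, so the cheap DC-only add path is taken; any other non-zero
 * count needs the full inverse transform.  Blocks with no coefficients
 * are skipped entirely.
 */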

void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
                             int16_t *block, int32_t dst_stride,
                             const uint8_t nzc[15 * 8])
{
    int32_t cnt;

    for (cnt = 0; cnt < 16; cnt += 4) {
        int32_t nnz = nzc[scan8[cnt]];

        if (nnz) {
            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
                ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
                                             block + cnt * 16 * sizeof(pixel),
                                             dst_stride);
            else
                ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
                                          block + cnt * 16 * sizeof(pixel),
                                          dst_stride);
        }
    }
}

void ff_h264_idct_add8_lasx(uint8_t **dst,
                            const int32_t *blk_offset,
                            int16_t *block, int32_t dst_stride,
                            const uint8_t nzc[15 * 8])
{
    int32_t i;

    for (i = 16; i < 20; i++) {
        if (nzc[scan8[i]])
            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
    for (i = 32; i < 36; i++) {
        if (nzc[scan8[i]])
            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
}

void ff_h264_idct_add8_422_lasx(uint8_t **dst,
                                const int32_t *blk_offset,
                                int16_t *block, int32_t dst_stride,
                                const uint8_t nzc[15 * 8])
{
    int32_t i;

    for (i = 16; i < 20; i++) {
        if (nzc[scan8[i]])
            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
    for (i = 32; i < 36; i++) {
        if (nzc[scan8[i]])
            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
    for (i = 20; i < 24; i++) {
        if (nzc[scan8[i + 4]])
            ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
    for (i = 36; i < 40; i++) {
        if (nzc[scan8[i + 4]])
            ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
                                  block + i * 16 * sizeof(pixel),
                                  dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
}
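
/*
 * 4:2:2 note: each chroma plane carries eight 4x4 blocks rather than four,
 * so the last two loops above reach the additional rows through
 * scan8[i + 4] and blk_offset[i + 4] while the coefficients themselves
 * stay at block + i * 16.
 */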

void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
                                   const int32_t *blk_offset,
                                   int16_t *block,
                                   int32_t dst_stride,
                                   const uint8_t nzc[15 * 8])
{
    int32_t i;

    for (i = 0; i < 16; i++) {
        if (nzc[scan8[i]])
            ff_h264_idct_add_lasx(dst + blk_offset[i],
                                  block + i * 16 * sizeof(pixel), dst_stride);
        else if (((dctcoef *) block)[i * 16])
            ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
                                           block + i * 16 * sizeof(pixel),
                                           dst_stride);
    }
}

void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
                                   int32_t de_qval)
{
#define DC_DEST_STRIDE 16

    __m256i src0, src1, src2, src3;
    __m256i vec0, vec1, vec2, vec3;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i hres0, hres1, hres2, hres3;
    __m256i vres0, vres1, vres2, vres3;
    __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);

    DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
              src0, src1, src2, src3);
    LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3);
    LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
    DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
              vres0, vres1, vres2, vres3);
    DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
              vres0, vres1);

    vres0 = __lasx_xvmul_w(vres0, de_q_vec);
    vres1 = __lasx_xvmul_w(vres1, de_q_vec);

    vres0 = __lasx_xvsrari_w(vres0, 8);
    vres1 = __lasx_xvsrari_w(vres1, 8);
    vec0 = __lasx_xvpickev_h(vres1, vres0);
    vec0 = __lasx_xvpermi_d(vec0, 0xd8);
    __lasx_xvstelm_h(vec0, dst + 0  * DC_DEST_STRIDE, 0, 0);
    __lasx_xvstelm_h(vec0, dst + 2  * DC_DEST_STRIDE, 0, 1);
    __lasx_xvstelm_h(vec0, dst + 8  * DC_DEST_STRIDE, 0, 2);
    __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
    __lasx_xvstelm_h(vec0, dst + 1  * DC_DEST_STRIDE, 0, 4);
    __lasx_xvstelm_h(vec0, dst + 3  * DC_DEST_STRIDE, 0, 5);
    __lasx_xvstelm_h(vec0, dst + 9  * DC_DEST_STRIDE, 0, 6);
    __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
    __lasx_xvstelm_h(vec0, dst + 4  * DC_DEST_STRIDE, 0, 8);
    __lasx_xvstelm_h(vec0, dst + 6  * DC_DEST_STRIDE, 0, 9);
    __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
    __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
    __lasx_xvstelm_h(vec0, dst + 5  * DC_DEST_STRIDE, 0, 12);
    __lasx_xvstelm_h(vec0, dst + 7  * DC_DEST_STRIDE, 0, 13);
    __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
    __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);

#undef DC_DEST_STRIDE
}
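
/*
 * Plain-C sketch of the same computation: a 4x4 inverse Hadamard (rows,
 * then columns), dequantization by de_qval, and rounding with
 * (x * de_qval + 128) >> 8, matching the xvsrari_w above.  Block n keeps
 * its DC coefficient at dst[n * 16]; the map[] used here to scatter the
 * results is hypothetical and only mirrors the store pattern above.
 */
#if 0
static void deq_idct_luma_dc_ref(int16_t *dst, const int16_t *src,
                                 int32_t de_qval)
{
    static const uint8_t map[16] = {
        0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15
    };
    int32_t tmp[16];
    int i;

    for (i = 0; i < 4; i++) {           /* horizontal Hadamard */
        int32_t z0 = src[4 * i + 0] + src[4 * i + 2];
        int32_t z1 = src[4 * i + 0] - src[4 * i + 2];
        int32_t z2 = src[4 * i + 1] - src[4 * i + 3];
        int32_t z3 = src[4 * i + 1] + src[4 * i + 3];

        tmp[4 * i + 0] = z0 + z3;
        tmp[4 * i + 1] = z1 + z2;
        tmp[4 * i + 2] = z1 - z2;
        tmp[4 * i + 3] = z0 - z3;
    }
    for (i = 0; i < 4; i++) {           /* vertical Hadamard + dequant */
        int32_t z0 = tmp[i + 4 * 0] + tmp[i + 4 * 2];
        int32_t z1 = tmp[i + 4 * 0] - tmp[i + 4 * 2];
        int32_t z2 = tmp[i + 4 * 1] - tmp[i + 4 * 3];
        int32_t z3 = tmp[i + 4 * 1] + tmp[i + 4 * 3];

        dst[map[4 * 0 + i] * 16] = ((z0 + z3) * de_qval + 128) >> 8;
        dst[map[4 * 1 + i] * 16] = ((z1 + z2) * de_qval + 128) >> 8;
        dst[map[4 * 2 + i] * 16] = ((z1 - z2) * de_qval + 128) >> 8;
        dst[map[4 * 3 + i] * 16] = ((z0 - z3) * de_qval + 128) >> 8;
    }
}
#endif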