FFmpeg
yuv2rgb_lasx.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "swscale_loongarch.h"
24 
/*
 * Declare and initialise the constant vectors consumed by YUV2RGB and
 * YUV2RGB_RES.
 *
 * Expands to a list of declarations, so it has to be placed where
 * declarations are legal and where the scaler context `c` is visible.
 * Every context field is broadcast with __lasx_xvreplgr2vr_d.
 * NOTE(review): the consumers operate on 16-bit lanes, so each field
 * presumably already holds four packed 16-bit copies -- confirm against
 * the SwsInternal definition.
 */
#define YUV2RGB_LOAD_COE                                              \
    /* Multipliers: luma gain, then the chroma-to-colour terms. */    \
    __m256i y_coeff  = __lasx_xvreplgr2vr_d(c->yCoeff);               \
    __m256i vr_coeff = __lasx_xvreplgr2vr_d(c->vrCoeff);              \
    __m256i ub_coeff = __lasx_xvreplgr2vr_d(c->ubCoeff);              \
    __m256i ug_coeff = __lasx_xvreplgr2vr_d(c->ugCoeff);              \
    __m256i vg_coeff = __lasx_xvreplgr2vr_d(c->vgCoeff);              \
    /* Offsets subtracted from the pre-scaled samples. */             \
    __m256i y_offset = __lasx_xvreplgr2vr_d(c->yOffset);              \
    __m256i u_offset = __lasx_xvreplgr2vr_d(c->uOffset);              \
    __m256i v_offset = __lasx_xvreplgr2vr_d(c->vOffset);
36 
/*
 * Load the samples for one full-width step of the main loop into
 * m_y1, m_y2 (two luma rows), m_u and m_v, widened to unsigned 16-bit lanes.
 *
 * Uses py_1, py_2, pu and pv from the enclosing function.  Each chroma byte
 * is interleaved with itself before widening, so one chroma sample lines up
 * with two neighbouring luma samples.
 */
#define LOAD_YUV_16                                                   \
    /* Chroma: load, duplicate every byte, widen to 16 bit. */        \
    m_u  = __lasx_xvldrepl_d(pu, 0);                                  \
    m_v  = __lasx_xvldrepl_d(pv, 0);                                  \
    m_u  = __lasx_xvilvl_b(m_u, m_u);                                 \
    m_v  = __lasx_xvilvl_b(m_v, m_v);                                 \
    m_u  = __lasx_vext2xv_hu_bu(m_u);                                 \
    m_v  = __lasx_vext2xv_hu_bu(m_v);                                 \
    /* Luma: one vector load per row, widened to 16 bit. */           \
    m_y1 = __lasx_vext2xv_hu_bu(__lasx_xvld(py_1, 0));                \
    m_y2 = __lasx_vext2xv_hu_bu(__lasx_xvld(py_2, 0));
46 
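/* YUV2RGB method
 * The conversion method is as follows:
 * R = Y' * y_coeff + V' * vr_coeff
 * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
 * B = Y' * y_coeff + U' * ub_coeff
 *
 * where X' = X * 8 - x_offset
 *
 * Each product keeps only the high 16 bits of the 16x16-bit multiply
 * (xvmuh_h), the sums are saturating (xvsadd_h), and every channel is
 * finally clamped to the range 0..255 (xvclip255_h).
 */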
56 
/*
 * Convert the widened samples in m_y1, m_y2, m_u, m_v into clamped
 * 16-bit R/G/B lanes: r1/g1/b1 for the first luma row and r2/g2/b2 for
 * the second.  Both rows share the same chroma terms.
 *
 * Needs the vectors declared by YUV2RGB_LOAD_COE plus the temporaries
 * y_1, y_2, u2g, u2b, v2r and v2g from the enclosing function.
 */
#define YUV2RGB                                                       \
    /* X' = X * 8 - x_offset */                                       \
    m_y1 = __lasx_xvsub_h(__lasx_xvslli_h(m_y1, 3), y_offset);        \
    m_y2 = __lasx_xvsub_h(__lasx_xvslli_h(m_y2, 3), y_offset);        \
    m_u  = __lasx_xvsub_h(__lasx_xvslli_h(m_u,  3), u_offset);        \
    m_v  = __lasx_xvsub_h(__lasx_xvslli_h(m_v,  3), v_offset);        \
    /* High halves of the coefficient products. */                    \
    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                             \
    y_2  = __lasx_xvmuh_h(m_y2, y_coeff);                             \
    v2r  = __lasx_xvmuh_h(m_v,  vr_coeff);                            \
    u2b  = __lasx_xvmuh_h(m_u,  ub_coeff);                            \
    u2g  = __lasx_xvmuh_h(m_u,  ug_coeff);                            \
    v2g  = __lasx_xvmuh_h(m_v,  vg_coeff);                            \
    /* Green receives both chroma contributions. */                   \
    v2g  = __lasx_xvsadd_h(v2g, u2g);                                 \
    /* First row. */                                                  \
    r1   = __lasx_xvsadd_h(y_1, v2r);                                 \
    g1   = __lasx_xvsadd_h(y_1, v2g);                                 \
    b1   = __lasx_xvsadd_h(y_1, u2b);                                 \
    /* Second row. */                                                 \
    r2   = __lasx_xvsadd_h(y_2, v2r);                                 \
    g2   = __lasx_xvsadd_h(y_2, v2g);                                 \
    b2   = __lasx_xvsadd_h(y_2, u2b);                                 \
    /* Clamp every channel to 0..255. */                              \
    r1   = __lasx_xvclip255_h(r1);                                    \
    g1   = __lasx_xvclip255_h(g1);                                    \
    b1   = __lasx_xvclip255_h(b1);                                    \
    r2   = __lasx_xvclip255_h(r2);                                    \
    g2   = __lasx_xvclip255_h(g2);                                    \
    b2   = __lasx_xvclip255_h(b2);
81 
/*
 * Half-width variant of LOAD_YUV_16 + YUV2RGB used for the remainder
 * columns: loads and converts in one go, producing a single set of
 * clamped 16-bit lanes r1/g1/b1 that carries both luma rows.
 *
 * The two luma rows are merged into one vector with xvilvl_d before
 * widening; the store macros later take the first row from the low part
 * and the second row from the high part of the packed result.
 */
#define YUV2RGB_RES                                                   \
    /* Luma of both rows, merged into m_y1. */                        \
    m_y1 = __lasx_xvldrepl_d(py_1, 0);                                \
    m_y2 = __lasx_xvldrepl_d(py_2, 0);                                \
    m_y1 = __lasx_xvilvl_d(m_y2, m_y1);                               \
    m_y1 = __lasx_vext2xv_hu_bu(m_y1);                                \
    /* Chroma: load, duplicate every byte, widen to 16 bit. */        \
    m_u  = __lasx_xvldrepl_w(pu, 0);                                  \
    m_v  = __lasx_xvldrepl_w(pv, 0);                                  \
    m_u  = __lasx_xvilvl_b(m_u, m_u);                                 \
    m_v  = __lasx_xvilvl_b(m_v, m_v);                                 \
    m_u  = __lasx_vext2xv_hu_bu(m_u);                                 \
    m_v  = __lasx_vext2xv_hu_bu(m_v);                                 \
    /* X' = X * 8 - x_offset */                                       \
    m_y1 = __lasx_xvsub_h(__lasx_xvslli_h(m_y1, 3), y_offset);        \
    m_u  = __lasx_xvsub_h(__lasx_xvslli_h(m_u,  3), u_offset);        \
    m_v  = __lasx_xvsub_h(__lasx_xvslli_h(m_v,  3), v_offset);        \
    /* High halves of the coefficient products. */                    \
    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                             \
    v2r  = __lasx_xvmuh_h(m_v,  vr_coeff);                            \
    u2b  = __lasx_xvmuh_h(m_u,  ub_coeff);                            \
    u2g  = __lasx_xvmuh_h(m_u,  ug_coeff);                            \
    v2g  = __lasx_xvmuh_h(m_v,  vg_coeff);                            \
    v2g  = __lasx_xvsadd_h(v2g, u2g);                                 \
    /* Saturating sums, then clamp to 0..255. */                      \
    r1   = __lasx_xvclip255_h(__lasx_xvsadd_h(y_1, v2r));             \
    g1   = __lasx_xvclip255_h(__lasx_xvsadd_h(y_1, v2g));             \
    b1   = __lasx_xvclip255_h(__lasx_xvsadd_h(y_1, u2b));
111 
/*
 * Pack three vectors of 16-bit channel values into 3-byte pixels.
 *
 * r, g, b     : channel vectors in the order they should be emitted.
 * rgb_l, rgb_h: receive the packed bytes, selected by the shuffle masks
 *               shuf2 and shuf3 that the enclosing function must declare.
 */
#define RGB_PACK(r, g, b, rgb_l, rgb_h)                               \
{                                                                     \
    /* Interleave the even bytes of the first two channels. */        \
    __m256i rg_pairs = __lasx_xvpackev_b(g, r);                       \
    /* Weave the third channel in. */                                 \
    rgb_l = __lasx_xvshuf_b(b, rg_pairs, shuf2);                      \
    rgb_h = __lasx_xvshuf_b(b, rg_pairs, shuf3);                      \
}
118 
/*
 * Pack four vectors of 16-bit channel values into 4-byte pixels.
 *
 * a, r, g, b  : channel vectors; callers permute the arguments to obtain
 *               the different 32-bit layouts.
 * rgb_l, rgb_h: receive the low and high 128-bit halves, respectively, of
 *               the two interleaved intermediates.
 */
#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)                          \
{                                                                     \
    /* Byte pairs: channels 1+2 and channels 3+4. */                  \
    __m256i pair_lo = __lasx_xvpackev_b(r, a);                        \
    __m256i pair_hi = __lasx_xvpackev_b(b, g);                        \
    /* Interleave the pairs into whole 4-byte pixels. */              \
    __m256i quad_lo = __lasx_xvilvl_h(pair_hi, pair_lo);              \
    __m256i quad_hi = __lasx_xvilvh_h(pair_hi, pair_lo);              \
    /* Regroup the 128-bit halves of the two results. */              \
    rgb_l = __lasx_xvpermi_q(quad_hi, quad_lo, 0x20);                 \
    rgb_h = __lasx_xvpermi_q(quad_hi, quad_lo, 0x31);                 \
}
129 
/*
 * Store 24 bytes of packed 3-byte pixels to each of two output rows.
 *
 * image_1 receives 64-bit elements 0 and 1 of rgb_l followed by element 0
 * of rgb_h; image_2 receives elements 2 and 3 of rgb_l followed by
 * element 2 of rgb_h.
 */
#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2)                 \
{                                                                     \
    /* First row. */                                                  \
    __lasx_xvstelm_d(rgb_l, image_1,  0, 0);                          \
    __lasx_xvstelm_d(rgb_l, image_1,  8, 1);                          \
    __lasx_xvstelm_d(rgb_h, image_1, 16, 0);                          \
    /* Second row. */                                                 \
    __lasx_xvstelm_d(rgb_l, image_2,  0, 2);                          \
    __lasx_xvstelm_d(rgb_l, image_2,  8, 3);                          \
    __lasx_xvstelm_d(rgb_h, image_2, 16, 2);                          \
}
139 
/*
 * Store 48 contiguous bytes of packed 3-byte pixels to one output row.
 *
 * Bytes 0..23 come from 64-bit elements 0 and 1 of rgb_l plus element 0 of
 * rgb_h; bytes 24..47 come from elements 2 and 3 of rgb_l plus element 2
 * of rgb_h.
 */
#define RGB_STORE(rgb_l, rgb_h, image)                                \
{                                                                     \
    /* First 24 bytes. */                                             \
    __lasx_xvstelm_d(rgb_l, image,  0, 0);                            \
    __lasx_xvstelm_d(rgb_l, image,  8, 1);                            \
    __lasx_xvstelm_d(rgb_h, image, 16, 0);                            \
    /* Second 24 bytes. */                                            \
    __lasx_xvstelm_d(rgb_l, image, 24, 2);                            \
    __lasx_xvstelm_d(rgb_l, image, 32, 3);                            \
    __lasx_xvstelm_d(rgb_h, image, 40, 2);                            \
}
149 
/*
 * Store two whole vectors of packed 4-byte pixels back to back on one
 * output row: rgb_l at byte offset 0, rgb_h at byte offset 32.
 */
#define RGB32_STORE(rgb_l, rgb_h, image)                              \
{                                                                     \
    __lasx_xvst(rgb_l, image,  0);                                    \
    __lasx_xvst(rgb_h, image, 32);                                    \
}
155 
/*
 * Store one whole vector of packed 4-byte pixels to each of two output
 * rows: rgb_l to the start of image_1, rgb_h to the start of image_2.
 */
#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2)               \
{                                                                     \
    __lasx_xvst(rgb_l, image_1, 0);                                   \
    __lasx_xvst(rgb_h, image_2, 0);                                   \
}
161 
/*
 * Open the definition of a planar-YUV to packed 3-byte-per-pixel converter.
 *
 * func_name: name of the generated function.
 * dst_type : element type of the destination row pointers image1/image2.
 * alpha    : not referenced by the expansion.
 *
 * The expansion emits the function header, the local vectors (including the
 * shuf2/shuf3 masks required by RGB_PACK), the loop over pairs of rows and
 * the opening of the loop over 16-pixel steps.  It deliberately leaves three
 * braces open: the caller supplies the per-step body, then DEALYUV2RGBLINE,
 * the 8-pixel body, DEALYUV2RGBLINERES, the tail body and finally END_FUNC().
 * The generated function returns srcSliceH (see END_FUNC).
 *
 * The chroma row index is y >> vshift, where vshift is 0 for
 * AV_PIX_FMT_YUV422P sources and 1 otherwise.
 */
#define YUV2RGBFUNC(func_name, dst_type, alpha)                               \
    int func_name(SwsInternal *c, const uint8_t *const src[],                 \
                  const int srcStride[], int srcSliceY, int srcSliceH,        \
                  uint8_t *const dst[], const int dstStride[])                \
{                                                                             \
    int x, y, h_size, vshift, res;                                            \
    __m256i m_y1, m_y2, m_u, m_v;                                             \
    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                     \
    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                           \
    __m256i shuf2 = {0x0504120302100100, 0x0A18090816070614,                  \
                     0x0504120302100100, 0x0A18090816070614};                 \
    __m256i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101,                  \
                     0x1E0F0E1C0D0C1A0B, 0x0101010101010101};                 \
    YUV2RGB_LOAD_COE                                                          \
                                                                              \
    /* Number of full 16-pixel steps per row. */                              \
    h_size = c->opts.dst_w >> 4;                                              \
    vshift = c->opts.src_format != AV_PIX_FMT_YUV422P;                        \
    for (y = 0; y < srcSliceH; y += 2) {                                      \
        int yd = y + srcSliceY;                                               \
        /* Row addresses are computed in bytes from dst[0] so that they   */  \
        /* do not depend on sizeof(dst_type).                             */  \
        dst_type *image1 = (dst_type *)(dst[0] + yd * dstStride[0]);          \
        dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);    \
        const uint8_t *py_1 = src[0] + y * srcStride[0];                      \
        const uint8_t *py_2 = py_1 + srcStride[0];                            \
        const uint8_t *pu   = src[1] + (y >> vshift) * srcStride[1];          \
        const uint8_t *pv   = src[2] + (y >> vshift) * srcStride[2];          \
        /* DEALYUV2RGBLINERES subtracts 8 from res once the 8-pixel step  */  \
        /* has run, so the remainder must be recomputed for every pair of */  \
        /* rows; otherwise all pairs after the first would skip that step */  \
        /* and store the tail at the wrong column.                        */  \
        res = c->opts.dst_w & 15;                                             \
        for (x = 0; x < h_size; x++) {
189 
/*
 * Open the definition of a planar-YUV to packed 4-byte-per-pixel converter.
 *
 * func_name: name of the generated function.
 * dst_type : element type of the destination row pointers image1/image2.
 * alpha    : not referenced by the expansion.
 *
 * The expansion emits the function header, the local vectors (including the
 * alpha source vector `a` handed to RGB32_PACK), the loop over pairs of rows
 * and the opening of the loop over 16-pixel steps.  It deliberately leaves
 * three braces open: the caller supplies the per-step body, then
 * DEALYUV2RGBLINE32, the 8-pixel body, DEALYUV2RGBLINERES32, the tail body
 * and finally END_FUNC().  The generated function returns srcSliceH.
 *
 * The chroma row index is y >> vshift, where vshift is 0 for
 * AV_PIX_FMT_YUV422P sources and 1 otherwise.
 */
#define YUV2RGBFUNC32(func_name, dst_type, alpha)                             \
    int func_name(SwsInternal *c, const uint8_t *const src[],                 \
                  const int srcStride[], int srcSliceY, int srcSliceH,        \
                  uint8_t *const dst[], const int dstStride[])                \
{                                                                             \
    int x, y, h_size, vshift, res;                                            \
    __m256i m_y1, m_y2, m_u, m_v;                                             \
    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                     \
    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                           \
    /* NOTE(review): presumably every byte is 0xFF (opaque alpha) --      */  \
    /* confirm against the xvldi immediate encoding.                      */  \
    __m256i a = __lasx_xvldi(0xFF);                                           \
                                                                              \
    YUV2RGB_LOAD_COE                                                          \
                                                                              \
    /* Number of full 16-pixel steps per row. */                              \
    h_size = c->opts.dst_w >> 4;                                              \
    vshift = c->opts.src_format != AV_PIX_FMT_YUV422P;                        \
    for (y = 0; y < srcSliceH; y += 2) {                                      \
        int yd = y + srcSliceY;                                               \
        dst_type *image1 = (dst_type *)(dst[0] + yd * dstStride[0]);          \
        dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);    \
        const uint8_t *py_1 = src[0] + y * srcStride[0];                      \
        const uint8_t *py_2 = py_1 + srcStride[0];                            \
        const uint8_t *pu   = src[1] + (y >> vshift) * srcStride[1];          \
        const uint8_t *pv   = src[2] + (y >> vshift) * srcStride[2];          \
        /* DEALYUV2RGBLINERES32 subtracts 8 from res once the 8-pixel     */  \
        /* step has run, so the remainder must be recomputed for every    */  \
        /* pair of rows; otherwise all pairs after the first would skip   */  \
        /* that step and store the tail at the wrong column.              */  \
        res = c->opts.dst_w & 15;                                             \
        for (x = 0; x < h_size; x++) {
217 
/*
 * End of the 16-pixel step for the 3-byte-per-pixel converters.
 *
 * Advances the source pointers (16 luma samples, 8 chroma samples) and both
 * destination pointers (48 elements), closes the step loop opened by
 * YUV2RGBFUNC and opens the block that handles an 8-pixel remainder.
 */
#define DEALYUV2RGBLINE                                               \
                image1 += 48;                                         \
                image2 += 48;                                         \
                py_1   += 16;                                         \
                py_2   += 16;                                         \
                pu     += 8;                                          \
                pv     += 8;                                          \
            } /* end of the 16-pixel loop */                          \
            if (res & 8) {
227 
/*
 * End of the 8-pixel step for the 3-byte-per-pixel converters.
 *
 * Advances the source pointers (8 luma samples, 4 chroma samples) and both
 * destination pointers (24 elements), removes the 8 handled pixels from
 * `res`, closes the `if (res & 8)` block and opens the block for whatever
 * remainder is left.
 *
 * Note: `res` is modified in place, so the enclosing function has to hold a
 * fresh remainder at the start of every pair of rows.
 */
#define DEALYUV2RGBLINERES                                            \
                image1 += 24;                                         \
                image2 += 24;                                         \
                py_1   += 8;                                          \
                py_2   += 8;                                          \
                pu     += 4;                                          \
                pv     += 4;                                          \
                res    -= 8;                                          \
            } /* end of the 8-pixel step */                           \
            if (res) {
238 
/*
 * Store the last `res` pixels (3 bytes each) of two output rows.
 *
 * The first row is taken from the low 128 bits of rgb_l/rgb_h and written to
 * image_1, the second row from the high 128 bits and written to image_2.
 * Only remainders of 6, 4 and 2 pixels (18, 12 and 6 bytes per row) are
 * handled; any other value of `res` stores nothing.
 */
#define ENDYUV2RGBLINE(rgb_l, rgb_h, image_1, image_2)                \
    switch (res) {                                                    \
    case 6: /* 8 + 8 + 2 bytes per row */                             \
        __lasx_xvstelm_d(rgb_l, image_1,  0, 0);                      \
        __lasx_xvstelm_d(rgb_l, image_1,  8, 1);                      \
        __lasx_xvstelm_h(rgb_h, image_1, 16, 0);                      \
        __lasx_xvstelm_d(rgb_l, image_2,  0, 2);                      \
        __lasx_xvstelm_d(rgb_l, image_2,  8, 3);                      \
        __lasx_xvstelm_h(rgb_h, image_2, 16, 8);                      \
        break;                                                        \
    case 4: /* 8 + 4 bytes per row */                                 \
        __lasx_xvstelm_d(rgb_l, image_1, 0, 0);                       \
        __lasx_xvstelm_w(rgb_l, image_1, 8, 2);                       \
        __lasx_xvstelm_d(rgb_l, image_2, 0, 2);                       \
        __lasx_xvstelm_w(rgb_l, image_2, 8, 6);                       \
        break;                                                        \
    case 2: /* 4 + 2 bytes per row */                                 \
        __lasx_xvstelm_w(rgb_l, image_1, 0, 0);                       \
        __lasx_xvstelm_h(rgb_l, image_1, 4, 2);                       \
        __lasx_xvstelm_w(rgb_l, image_2, 0, 4);                       \
        __lasx_xvstelm_h(rgb_l, image_2, 4, 10);                      \
        break;                                                        \
    default:                                                          \
        break;                                                        \
    }
258 
/*
 * End of the 16-pixel step for the 4-byte-per-pixel converters.
 *
 * Advances the source pointers (16 luma samples, 8 chroma samples) and both
 * destination pointers (16 elements), closes the step loop opened by
 * YUV2RGBFUNC32 and opens the block that handles an 8-pixel remainder.
 */
#define DEALYUV2RGBLINE32                                             \
                image1 += 16;                                         \
                image2 += 16;                                         \
                py_1   += 16;                                         \
                py_2   += 16;                                         \
                pu     += 8;                                          \
                pv     += 8;                                          \
            } /* end of the 16-pixel loop */                          \
            if (res & 8) {
268 
/*
 * End of the 8-pixel step for the 4-byte-per-pixel converters.
 *
 * Advances the source pointers (8 luma samples, 4 chroma samples) and both
 * destination pointers (8 elements), removes the 8 handled pixels from
 * `res`, closes the `if (res & 8)` block and opens the block for whatever
 * remainder is left.
 *
 * Note: `res` is modified in place, so the enclosing function has to hold a
 * fresh remainder at the start of every pair of rows.
 */
#define DEALYUV2RGBLINERES32                                          \
                image1 += 8;                                          \
                image2 += 8;                                          \
                py_1   += 8;                                          \
                py_2   += 8;                                          \
                pu     += 4;                                          \
                pv     += 4;                                          \
                res    -= 8;                                          \
            } /* end of the 8-pixel step */                           \
            if (res) {
279 
/*
 * Store the last `res` pixels (4 bytes each) of two output rows.
 *
 * The first row is taken from rgb_l and written to image_1, the second row
 * from rgb_h and written to image_2, one 64-bit element (two pixels) at a
 * time.  Only remainders of 6, 4 and 2 pixels are handled; any other value
 * of `res` stores nothing.
 */
#define ENDYUV2RGBLINE32(rgb_l, rgb_h, image_1, image_2)              \
    switch (res) {                                                    \
    case 6: /* three 64-bit elements per row */                       \
        __lasx_xvstelm_d(rgb_l, image_1,  0, 0);                      \
        __lasx_xvstelm_d(rgb_l, image_1,  8, 1);                      \
        __lasx_xvstelm_d(rgb_l, image_1, 16, 2);                      \
        __lasx_xvstelm_d(rgb_h, image_2,  0, 0);                      \
        __lasx_xvstelm_d(rgb_h, image_2,  8, 1);                      \
        __lasx_xvstelm_d(rgb_h, image_2, 16, 2);                      \
        break;                                                        \
    case 4: /* two 64-bit elements per row */                         \
        __lasx_xvstelm_d(rgb_l, image_1, 0, 0);                       \
        __lasx_xvstelm_d(rgb_l, image_1, 8, 1);                       \
        __lasx_xvstelm_d(rgb_h, image_2, 0, 0);                       \
        __lasx_xvstelm_d(rgb_h, image_2, 8, 1);                       \
        break;                                                        \
    case 2: /* one 64-bit element per row */                          \
        __lasx_xvstelm_d(rgb_l, image_1, 0, 0);                       \
        __lasx_xvstelm_d(rgb_h, image_2, 0, 0);                       \
        break;                                                        \
    default:                                                          \
        break;                                                        \
    }
297 
298 
/*
 * Close a converter opened by YUV2RGBFUNC / YUV2RGBFUNC32: ends the tail
 * block and the loop over row pairs, then returns srcSliceH from the
 * generated function.
 */
#define END_FUNC()                                                    \
            } /* end of the tail block */                             \
        } /* end of the row-pair loop */                              \
    return srcSliceH;                                                 \
}
304 
305 YUV2RGBFUNC(yuv420_rgb24_lasx, uint8_t, 0)
307  YUV2RGB
308  RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
309  RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
310  RGB_STORE(rgb1_l, rgb1_h, image1);
311  RGB_STORE(rgb2_l, rgb2_h, image2);
314  RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
315  RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
318  RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
319  ENDYUV2RGBLINE(rgb1_l, rgb1_h, image1, image2);
320  END_FUNC()
321 
322 YUV2RGBFUNC(yuv420_bgr24_lasx, uint8_t, 0)
324  YUV2RGB
325  RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
326  RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
327  RGB_STORE(rgb1_l, rgb1_h, image1);
328  RGB_STORE(rgb2_l, rgb2_h, image2);
331  RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
332  RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
335  RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
336  ENDYUV2RGBLINE(rgb1_l, rgb1_h, image1, image2);
337  END_FUNC()
338 
339 YUV2RGBFUNC32(yuv420_rgba32_lasx, uint32_t, 0)
341  YUV2RGB
342  RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
343  RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
344  RGB32_STORE(rgb1_l, rgb1_h, image1);
345  RGB32_STORE(rgb2_l, rgb2_h, image2);
348  RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
349  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
352  RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
353  ENDYUV2RGBLINE32(rgb1_l, rgb1_h, image1, image2);
354  END_FUNC()
355 
356 YUV2RGBFUNC32(yuv420_bgra32_lasx, uint32_t, 0)
358  YUV2RGB
359  RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
360  RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
361  RGB32_STORE(rgb1_l, rgb1_h, image1);
362  RGB32_STORE(rgb2_l, rgb2_h, image2);
365  RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
366  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
369  RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
370  ENDYUV2RGBLINE32(rgb1_l, rgb1_h, image1, image2);
371  END_FUNC()
372 
373 YUV2RGBFUNC32(yuv420_argb32_lasx, uint32_t, 0)
375  YUV2RGB
376  RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
377  RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
378  RGB32_STORE(rgb1_l, rgb1_h, image1);
379  RGB32_STORE(rgb2_l, rgb2_h, image2);
382  RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
383  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
386  RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
387  ENDYUV2RGBLINE32(rgb1_l, rgb1_h, image1, image2);
388  END_FUNC()
389 
390 YUV2RGBFUNC32(yuv420_abgr32_lasx, uint32_t, 0)
392  YUV2RGB
393  RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
394  RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
395  RGB32_STORE(rgb1_l, rgb1_h, image1);
396  RGB32_STORE(rgb2_l, rgb2_h, image2);
399  RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
400  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
403  RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
404  ENDYUV2RGBLINE32(rgb1_l, rgb1_h, image1, image2);
405  END_FUNC()
LOAD_YUV_16
#define LOAD_YUV_16
Definition: yuv2rgb_lasx.c:37
YUV2RGB_RES
#define YUV2RGB_RES
Definition: yuv2rgb_lasx.c:82
DEALYUV2RGBLINERES32
#define DEALYUV2RGBLINERES32
Definition: yuv2rgb_lasx.c:269
YUV2RGB
#define YUV2RGB
Definition: yuv2rgb_lasx.c:57
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:2034
RGB32_STORE
#define RGB32_STORE(rgb_l, rgb_h, image)
Definition: yuv2rgb_lasx.c:150
swscale_loongarch.h
DEALYUV2RGBLINE
#define DEALYUV2RGBLINE
Definition: yuv2rgb_lasx.c:218
DEALYUV2RGBLINERES
#define DEALYUV2RGBLINERES
Definition: yuv2rgb_lasx.c:228
YUV2RGBFUNC32
#define YUV2RGBFUNC32(func_name, dst_type, alpha)
Definition: yuv2rgb_lasx.c:190
ENDYUV2RGBLINE32
#define ENDYUV2RGBLINE32(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:280
ENDYUV2RGBLINE
#define ENDYUV2RGBLINE(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:239
RGB_STORE
#define RGB_STORE(rgb_l, rgb_h, image)
Definition: yuv2rgb_lasx.c:140
YUV2RGBFUNC
#define YUV2RGBFUNC(func_name, dst_type, alpha)
Definition: yuv2rgb_lasx.c:162
DEALYUV2RGBLINE32
#define DEALYUV2RGBLINE32
Definition: yuv2rgb_lasx.c:259
b2
static double b2(void *priv, double x, double y)
Definition: vf_xfade.c:2035
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
RGB32_PACK
#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)
Definition: yuv2rgb_lasx.c:119
RGB_PACK
#define RGB_PACK(r, g, b, rgb_l, rgb_h)
Definition: yuv2rgb_lasx.c:112
loongson_intrinsics.h
RGB32_STORE_RES
#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:156
END_FUNC
#define END_FUNC()
Definition: yuv2rgb_lasx.c:299
RGB_STORE_RES
#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:130