FFmpeg
yuv2rgb_lasx.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "swscale_loongarch.h"
24 
/* Broadcast the colour-conversion constants stored in the SwsContext `c`.
 * Every field is replicated with __lasx_xvreplgr2vr_d into a 256-bit vector.
 * The macro declares the three offset vectors and the five coefficient
 * vectors that YUV2RGB and YUV2RGB_RES read, so it has to be expanded once,
 * in a scope where `c` is visible, before either of those macros is used. */
#define YUV2RGB_LOAD_COE                                                 \
    /* Offsets subtracted from the scaled Y, U and V samples */          \
    __m256i y_offset = __lasx_xvreplgr2vr_d(c->yOffset);                 \
    __m256i u_offset = __lasx_xvreplgr2vr_d(c->uOffset);                 \
    __m256i v_offset = __lasx_xvreplgr2vr_d(c->vOffset);                 \
    /* Luma gain, then the chroma contributions to R, G and B */         \
    __m256i y_coeff  = __lasx_xvreplgr2vr_d(c->yCoeff);                  \
    __m256i vr_coeff = __lasx_xvreplgr2vr_d(c->vrCoeff);                 \
    __m256i vg_coeff = __lasx_xvreplgr2vr_d(c->vgCoeff);                 \
    __m256i ug_coeff = __lasx_xvreplgr2vr_d(c->ugCoeff);                 \
    __m256i ub_coeff = __lasx_xvreplgr2vr_d(c->ubCoeff);
/* Load the input for one step of the main loop: a vector from each of the
 * two luma rows (py_1, py_2) and a replicated doubleword from each chroma
 * plane (pu, pv). Each chroma byte is interleaved with itself, so one
 * sample is used for two neighbouring pixels, and all four vectors are then
 * widened from unsigned bytes to unsigned halfwords.
 * Writes m_y1, m_y2, m_u and m_v; the pointers are not advanced here. */
#define LOAD_YUV_16                                                      \
    m_u  = __lasx_xvldrepl_d(pu, 0);                                     \
    m_v  = __lasx_xvldrepl_d(pv, 0);                                     \
    m_y1 = __lasx_xvld(py_1, 0);                                         \
    m_y2 = __lasx_xvld(py_2, 0);                                         \
    /* duplicate every chroma byte */                                    \
    m_u  = __lasx_xvilvl_b(m_u, m_u);                                    \
    m_v  = __lasx_xvilvl_b(m_v, m_v);                                    \
    /* bytes -> halfwords */                                             \
    m_y1 = __lasx_vext2xv_hu_bu(m_y1);                                   \
    m_y2 = __lasx_vext2xv_hu_bu(m_y2);                                   \
    m_u  = __lasx_vext2xv_hu_bu(m_u);                                    \
    m_v  = __lasx_vext2xv_hu_bu(m_v);
/* YUV2RGB method
 * The conversion method is as follows:
 *   R = Y' * y_coeff + V' * vr_coeff
 *   G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
 *   B = Y' * y_coeff + U' * ub_coeff
 *
 * where X' = X * 8 - x_offset
 *
 * The products are formed with __lasx_xvmuh_h, the sums with the saturating
 * __lasx_xvsadd_h, and every channel is finally passed through
 * __lasx_xvclip255_h.
 *
 * Input:  m_y1, m_y2 (two luma rows), m_u, m_v, as prepared by LOAD_YUV_16,
 *         plus the vectors declared by YUV2RGB_LOAD_COE.
 * Output: r1/g1/b1 for the first row and r2/g2/b2 for the second row.
 * The inputs and the temporaries y_1, y_2, u2g, u2b, v2r, v2g are clobbered.
 */

#define YUV2RGB                                                          \
    /* luma, row 1 */                                                    \
    m_y1 = __lasx_xvslli_h(m_y1, 3);                                     \
    m_y1 = __lasx_xvsub_h(m_y1, y_offset);                               \
    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                                \
    /* luma, row 2 */                                                    \
    m_y2 = __lasx_xvslli_h(m_y2, 3);                                     \
    m_y2 = __lasx_xvsub_h(m_y2, y_offset);                               \
    y_2  = __lasx_xvmuh_h(m_y2, y_coeff);                                \
    /* U contributes to green and blue */                                \
    m_u  = __lasx_xvslli_h(m_u, 3);                                      \
    m_u  = __lasx_xvsub_h(m_u, u_offset);                                \
    u2g  = __lasx_xvmuh_h(m_u, ug_coeff);                                \
    u2b  = __lasx_xvmuh_h(m_u, ub_coeff);                                \
    /* V contributes to red and green */                                 \
    m_v  = __lasx_xvslli_h(m_v, 3);                                      \
    m_v  = __lasx_xvsub_h(m_v, v_offset);                                \
    v2r  = __lasx_xvmuh_h(m_v, vr_coeff);                                \
    v2g  = __lasx_xvmuh_h(m_v, vg_coeff);                                \
    /* total chroma term for green, shared by both rows */               \
    v2g  = __lasx_xvsadd_h(v2g, u2g);                                    \
    /* combine luma and chroma for each row */                           \
    r1   = __lasx_xvsadd_h(y_1, v2r);                                    \
    g1   = __lasx_xvsadd_h(y_1, v2g);                                    \
    b1   = __lasx_xvsadd_h(y_1, u2b);                                    \
    r2   = __lasx_xvsadd_h(y_2, v2r);                                    \
    g2   = __lasx_xvsadd_h(y_2, v2g);                                    \
    b2   = __lasx_xvsadd_h(y_2, u2b);                                    \
    /* clip every channel */                                             \
    r1   = __lasx_xvclip255_h(r1);                                       \
    g1   = __lasx_xvclip255_h(g1);                                       \
    b1   = __lasx_xvclip255_h(b1);                                       \
    r2   = __lasx_xvclip255_h(r2);                                       \
    g2   = __lasx_xvclip255_h(g2);                                       \
    b2   = __lasx_xvclip255_h(b2);
/* Load and convert the tail that remains after the 16-pixel main loop.
 * A replicated doubleword is read from each of py_1, py_2, pu and pv; the
 * two luma rows are merged into the single vector m_y1 and the same
 * arithmetic as in YUV2RGB is applied once.
 * Output: r1, g1, b1. RGB_STORE_RES / RGB32_STORE_RES write the lower half
 * of the packed result to the first row and the upper half to the second.
 *
 * NOTE(review): m_u / m_v are duplicated and widened exactly as in
 * LOAD_YUV_16, so the upper half of the widened chroma vectors appears to
 * hold the chroma samples that follow those in the lower half, whereas the
 * upper half of the luma vector holds the start of row 2. Confirm against
 * the LASX intrinsic semantics that row 2 is paired with the intended
 * chroma samples. */
#define YUV2RGB_RES                                                      \
    /* luma: both rows share one vector */                               \
    m_y1 = __lasx_xvldrepl_d(py_1, 0);                                   \
    m_y2 = __lasx_xvldrepl_d(py_2, 0);                                   \
    m_y1 = __lasx_xvilvl_d(m_y2, m_y1);                                  \
    m_y1 = __lasx_vext2xv_hu_bu(m_y1);                                   \
    m_y1 = __lasx_xvslli_h(m_y1, 3);                                     \
    m_y1 = __lasx_xvsub_h(m_y1, y_offset);                               \
    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                                \
    /* U contributes to green and blue */                                \
    m_u  = __lasx_xvldrepl_d(pu, 0);                                     \
    m_u  = __lasx_xvilvl_b(m_u, m_u);                                    \
    m_u  = __lasx_vext2xv_hu_bu(m_u);                                    \
    m_u  = __lasx_xvslli_h(m_u, 3);                                      \
    m_u  = __lasx_xvsub_h(m_u, u_offset);                                \
    u2g  = __lasx_xvmuh_h(m_u, ug_coeff);                                \
    u2b  = __lasx_xvmuh_h(m_u, ub_coeff);                                \
    /* V contributes to red and green */                                 \
    m_v  = __lasx_xvldrepl_d(pv, 0);                                     \
    m_v  = __lasx_xvilvl_b(m_v, m_v);                                    \
    m_v  = __lasx_vext2xv_hu_bu(m_v);                                    \
    m_v  = __lasx_xvslli_h(m_v, 3);                                      \
    m_v  = __lasx_xvsub_h(m_v, v_offset);                                \
    v2r  = __lasx_xvmuh_h(m_v, vr_coeff);                                \
    v2g  = __lasx_xvmuh_h(m_v, vg_coeff);                                \
    /* combine and clip */                                               \
    v2g  = __lasx_xvsadd_h(v2g, u2g);                                    \
    r1   = __lasx_xvsadd_h(y_1, v2r);                                    \
    g1   = __lasx_xvsadd_h(y_1, v2g);                                    \
    b1   = __lasx_xvsadd_h(y_1, u2b);                                    \
    r1   = __lasx_xvclip255_h(r1);                                       \
    g1   = __lasx_xvclip255_h(g1);                                       \
    b1   = __lasx_xvclip255_h(b1);
/* Interleave three channel vectors into packed 3-byte pixels.
 * The first two channels are paired byte-wise, then the third channel is
 * merged in by two byte shuffles controlled by `shuf2` and `shuf3`, which
 * must be declared in the expanding scope (YUV2RGBFUNC does this).
 * The result is written to rgb_l and rgb_h. Swapping the r and b arguments
 * yields the opposite channel order, as the bgr24 instantiation does. */
#define RGB_PACK(r, g, b, rgb_l, rgb_h)                                  \
{                                                                        \
    __m256i rg = __lasx_xvpackev_b(g, r);                                \
    rgb_l = __lasx_xvshuf_b(b, rg, shuf2);                               \
    rgb_h = __lasx_xvshuf_b(b, rg, shuf3);                               \
}
118 
/* Interleave four channel vectors into packed 4-byte pixels.
 * Channels are paired byte-wise (a with r, g with b), the pairs are then
 * interleaved halfword-wise, and the two interleaved vectors are finally
 * recombined by 128-bit lane into rgb_l and rgb_h.
 * The callers choose the channel order through the argument order, e.g.
 * the rgba32 instantiation passes (r, g, b, alpha). */
#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)                             \
{                                                                        \
    __m256i ra     = __lasx_xvpackev_b(r, a);                            \
    __m256i bg     = __lasx_xvpackev_b(b, g);                            \
    __m256i pix_lo = __lasx_xvilvl_h(bg, ra);                            \
    __m256i pix_hi = __lasx_xvilvh_h(bg, ra);                            \
    rgb_l = __lasx_xvpermi_q(pix_hi, pix_lo, 0x20);                      \
    rgb_h = __lasx_xvpermi_q(pix_hi, pix_lo, 0x31);                      \
}
129 
/* Store the packed 24-bit tail produced after YUV2RGB_RES.
 * 24 bytes go to each of the two output rows: doublewords 0 and 1 of rgb_l
 * plus doubleword 0 of rgb_h to image_1, and doublewords 2 and 3 of rgb_l
 * plus doubleword 2 of rgb_h to image_2. */
#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2)                    \
{                                                                        \
    /* bytes 0..15 of each row come from rgb_l */                        \
    __lasx_xvstelm_d(rgb_l, image_1,  0, 0);                             \
    __lasx_xvstelm_d(rgb_l, image_1,  8, 1);                             \
    __lasx_xvstelm_d(rgb_l, image_2,  0, 2);                             \
    __lasx_xvstelm_d(rgb_l, image_2,  8, 3);                             \
    /* bytes 16..23 of each row come from rgb_h */                       \
    __lasx_xvstelm_d(rgb_h, image_1, 16, 0);                             \
    __lasx_xvstelm_d(rgb_h, image_2, 16, 2);                             \
}
139 
/* Store one main-loop step of packed 24-bit pixels to a single row.
 * Six doublewords (48 bytes) are written to `image` at byte offsets
 * 0..40: for each half of the vectors, two doublewords of rgb_l are
 * followed by one doubleword of rgb_h. */
#define RGB_STORE(rgb_l, rgb_h, image)                                   \
{                                                                        \
    /* the four doublewords of rgb_l */                                  \
    __lasx_xvstelm_d(rgb_l, image,  0, 0);                               \
    __lasx_xvstelm_d(rgb_l, image,  8, 1);                               \
    __lasx_xvstelm_d(rgb_l, image, 24, 2);                               \
    __lasx_xvstelm_d(rgb_l, image, 32, 3);                               \
    /* the two doublewords of rgb_h that fill the gaps */                \
    __lasx_xvstelm_d(rgb_h, image, 16, 0);                               \
    __lasx_xvstelm_d(rgb_h, image, 40, 2);                               \
}
149 
/* Store one main-loop step of packed 32-bit pixels to a single row:
 * rgb_l as a full vector at byte offset 0 of `image`, rgb_h as a full
 * vector at byte offset 32. */
#define RGB32_STORE(rgb_l, rgb_h, image)                                 \
{                                                                        \
    __lasx_xvst(rgb_l, image,  0);  /* first half of the step  */        \
    __lasx_xvst(rgb_h, image, 32);  /* second half of the step */        \
}
155 
/* Store the packed 32-bit tail produced after YUV2RGB_RES: the whole of
 * rgb_l goes to the start of the first row (image_1) and the whole of
 * rgb_h to the start of the second row (image_2). */
#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2)                  \
{                                                                        \
    __lasx_xvst(rgb_l, image_1, 0);  /* row 1 */                         \
    __lasx_xvst(rgb_h, image_2, 0);  /* row 2 */                         \
}
161 
/* Open the definition of a packed 24-bit converter called `func_name`.
 *
 * The macro emits the function header, the local vectors, the shuffle
 * controls used by RGB_PACK, the constants from YUV2RGB_LOAD_COE, and the
 * heads of the row loop and of the 16-pixel column loop. It deliberately
 * leaves three scopes open (function, row loop, column loop); they are
 * closed by DEALYUV2RGBREMAIN followed by END_FUNC().
 *
 * func_name  name of the generated function
 * dst_type   element type of the output row pointers image1 / image2
 * alpha      not referenced by this macro
 *
 * The generated function walks the slice two rows at a time and returns
 * srcSliceH (see END_FUNC).
 *
 * The second output row is addressed in bytes from dst[0], the same way
 * YUV2RGBFUNC32 does it, so the row pointers stay correct for any dst_type
 * (adding dstStride[0] to a dst_type pointer would scale the stride by
 * sizeof(dst_type)). */
#define YUV2RGBFUNC(func_name, dst_type, alpha)                               \
    int func_name(SwsContext *c, const uint8_t *src[],                        \
                  int srcStride[], int srcSliceY, int srcSliceH,              \
                  uint8_t *dst[], int dstStride[])                            \
{                                                                             \
    int x, y, h_size, vshift, res;                                            \
    __m256i m_y1, m_y2, m_u, m_v;                                             \
    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                     \
    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                           \
    /* byte-shuffle controls read by RGB_PACK */                              \
    __m256i shuf2 = {0x0504120302100100, 0x0A18090816070614,                  \
                     0x0504120302100100, 0x0A18090816070614};                 \
    __m256i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101,                  \
                     0x1E0F0E1C0D0C1A0B, 0x0101010101010101};                 \
    YUV2RGB_LOAD_COE                                                          \
    /* width rounded up to a multiple of 8, split into 16-pixel steps */      \
    /* (h_size) and a tail that is non-zero when 8 pixels are left (res) */   \
    y      = (c->dstW + 7) & ~7;                                              \
    h_size = y >> 4;                                                          \
    res    = y & 15;                                                          \
                                                                              \
    /* 0 for YUV422P (one chroma row per luma row), otherwise 1 */            \
    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                              \
    for (y = 0; y < srcSliceH; y += 2) {                                      \
        int yd = y + srcSliceY;                                               \
        dst_type *image1 = (dst_type *)(dst[0] + (yd)     * dstStride[0]);    \
        dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);    \
        const uint8_t *py_1 = src[0] + y * srcStride[0];                      \
        const uint8_t *py_2 = py_1 + srcStride[0];                            \
        const uint8_t *pu   = src[1] + (y >> vshift) * srcStride[1];          \
        const uint8_t *pv   = src[2] + (y >> vshift) * srcStride[2];          \
        for (x = 0; x < h_size; x++) {
/* Open the definition of a packed 32-bit converter called `func_name`.
 *
 * The macro emits the function header, the local vectors, the constant
 * alpha vector `a`, the constants from YUV2RGB_LOAD_COE, and the heads of
 * the row loop and of the 16-pixel column loop. It deliberately leaves
 * three scopes open (function, row loop, column loop); they are closed by
 * DEALYUV2RGBREMAIN32 followed by END_FUNC().
 *
 * func_name  name of the generated function
 * dst_type   element type of the output row pointers image1 / image2
 * alpha      not referenced by this macro
 *
 * The generated function walks the slice two rows at a time and returns
 * srcSliceH (see END_FUNC).
 *
 * The former `dst_type *r, *g, *b` locals were never read or written by
 * any macro of this file and have been dropped. */
#define YUV2RGBFUNC32(func_name, dst_type, alpha)                             \
    int func_name(SwsContext *c, const uint8_t *src[],                        \
                  int srcStride[], int srcSliceY, int srcSliceH,              \
                  uint8_t *dst[], int dstStride[])                            \
{                                                                             \
    int x, y, h_size, vshift, res;                                            \
    __m256i m_y1, m_y2, m_u, m_v;                                             \
    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                     \
    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                           \
    /* constant alpha channel handed to RGB32_PACK */                         \
    __m256i a = __lasx_xvldi(0xFF);                                           \
                                                                              \
    YUV2RGB_LOAD_COE                                                          \
    /* width rounded up to a multiple of 8, split into 16-pixel steps */      \
    /* (h_size) and a tail that is non-zero when 8 pixels are left (res) */   \
    y      = (c->dstW + 7) & ~7;                                              \
    h_size = y >> 4;                                                          \
    res    = y & 15;                                                          \
                                                                              \
    /* 0 for YUV422P (one chroma row per luma row), otherwise 1 */            \
    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                              \
    for (y = 0; y < srcSliceH; y += 2) {                                      \
        int yd = y + srcSliceY;                                               \
        dst_type *image1 = (dst_type *)(dst[0] + (yd)     * dstStride[0]);    \
        dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);    \
        const uint8_t *py_1 = src[0] + y * srcStride[0];                      \
        const uint8_t *py_2 = py_1 + srcStride[0];                            \
        const uint8_t *pu   = src[1] + (y >> vshift) * srcStride[1];          \
        const uint8_t *pv   = src[2] + (y >> vshift) * srcStride[2];          \
        for (x = 0; x < h_size; x++) {
/* Finish one iteration of the 16-pixel column loop opened by YUV2RGBFUNC:
 * advance the input and output pointers, close the loop, and open the
 * `if (res)` block that handles the tail. END_FUNC() closes that block. */
#define DEALYUV2RGBREMAIN                                                \
            image1 += 48;   /* 16 pixels * 3 bytes (uint8_t rows) */     \
            image2 += 48;                                                \
            py_1   += 16;   /* 16 luma samples per row */                \
            py_2   += 16;                                                \
            pu     += 8;    /* one chroma sample per two pixels */       \
            pv     += 8;                                                 \
        }                                                                \
        if (res) {
/* Finish one iteration of the 16-pixel column loop opened by YUV2RGBFUNC32:
 * advance the input and output pointers, close the loop, and open the
 * `if (res)` block that handles the tail. END_FUNC() closes that block. */
#define DEALYUV2RGBREMAIN32                                              \
            image1 += 16;   /* 16 pixels, one row element each */        \
            image2 += 16;                                                \
            py_1   += 16;   /* 16 luma samples per row */                \
            py_2   += 16;                                                \
            pu     += 8;    /* one chroma sample per two pixels */       \
            pv     += 8;                                                 \
        }                                                                \
        if (res) {
238 
/* Close the scopes left open by YUV2RGBFUNC / YUV2RGBFUNC32 and the
 * DEALYUV2RGBREMAIN* macros, and return the number of rows in the slice. */
#define END_FUNC()                                                       \
        }   /* if (res) */                                               \
    }       /* row loop */                                               \
    return srcSliceH;                                                    \
}
244 
245 YUV2RGBFUNC(yuv420_rgb24_lasx, uint8_t, 0)
247  YUV2RGB
248  RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
249  RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
250  RGB_STORE(rgb1_l, rgb1_h, image1);
251  RGB_STORE(rgb2_l, rgb2_h, image2);
254  RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
255  RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
256  END_FUNC()
257 
258 YUV2RGBFUNC(yuv420_bgr24_lasx, uint8_t, 0)
260  YUV2RGB
261  RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
262  RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
263  RGB_STORE(rgb1_l, rgb1_h, image1);
264  RGB_STORE(rgb2_l, rgb2_h, image2);
267  RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
268  RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
269  END_FUNC()
270 
271 YUV2RGBFUNC32(yuv420_rgba32_lasx, uint32_t, 0)
273  YUV2RGB
274  RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
275  RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
276  RGB32_STORE(rgb1_l, rgb1_h, image1);
277  RGB32_STORE(rgb2_l, rgb2_h, image2);
280  RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
281  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
282  END_FUNC()
283 
284 YUV2RGBFUNC32(yuv420_bgra32_lasx, uint32_t, 0)
286  YUV2RGB
287  RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
288  RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
289  RGB32_STORE(rgb1_l, rgb1_h, image1);
290  RGB32_STORE(rgb2_l, rgb2_h, image2);
293  RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
294  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
295  END_FUNC()
296 
297 YUV2RGBFUNC32(yuv420_argb32_lasx, uint32_t, 0)
299  YUV2RGB
300  RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
301  RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
302  RGB32_STORE(rgb1_l, rgb1_h, image1);
303  RGB32_STORE(rgb2_l, rgb2_h, image2);
306  RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
307  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
308  END_FUNC()
309 
310 YUV2RGBFUNC32(yuv420_abgr32_lasx, uint32_t, 0)
312  YUV2RGB
313  RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
314  RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
315  RGB32_STORE(rgb1_l, rgb1_h, image1);
316  RGB32_STORE(rgb2_l, rgb2_h, image2);
319  RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
320  RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
321  END_FUNC()
yuv420_bgr24_lasx
int yuv420_bgr24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
DEALYUV2RGBREMAIN32
#define DEALYUV2RGBREMAIN32
Definition: yuv2rgb_lasx.c:228
LOAD_YUV_16
#define LOAD_YUV_16
Definition: yuv2rgb_lasx.c:37
YUV2RGB_RES
#define YUV2RGB_RES
Definition: yuv2rgb_lasx.c:82
YUV2RGB
#define YUV2RGB
Definition: yuv2rgb_lasx.c:57
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:1771
RGB32_STORE
#define RGB32_STORE(rgb_l, rgb_h, image)
Definition: yuv2rgb_lasx.c:150
swscale_loongarch.h
YUV2RGBFUNC32
#define YUV2RGBFUNC32(func_name, dst_type, alpha)
Definition: yuv2rgb_lasx.c:190
RGB_STORE
#define RGB_STORE(rgb_l, rgb_h, image)
Definition: yuv2rgb_lasx.c:140
DEALYUV2RGBREMAIN
#define DEALYUV2RGBREMAIN
Definition: yuv2rgb_lasx.c:218
YUV2RGBFUNC
#define YUV2RGBFUNC(func_name, dst_type, alpha)
Definition: yuv2rgb_lasx.c:162
yuv420_bgra32_lasx
int yuv420_bgra32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
b2
static double b2(void *priv, double x, double y)
Definition: vf_xfade.c:1772
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
RGB32_PACK
#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)
Definition: yuv2rgb_lasx.c:119
RGB_PACK
#define RGB_PACK(r, g, b, rgb_l, rgb_h)
Definition: yuv2rgb_lasx.c:112
yuv420_rgb24_lasx
int yuv420_rgb24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
yuv420_abgr32_lasx
int yuv420_abgr32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
yuv420_argb32_lasx
int yuv420_argb32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
loongson_intrinsics.h
RGB32_STORE_RES
#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:156
END_FUNC
#define END_FUNC()
Definition: yuv2rgb_lasx.c:239
RGB_STORE_RES
#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2)
Definition: yuv2rgb_lasx.c:130
yuv420_rgba32_lasx
int yuv420_rgba32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])