[FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.
Reimar.Doeffinger at gmx.de
Reimar.Doeffinger at gmx.de
Sun Jan 10 18:43:51 EET 2021
From: Reimar Döffinger <Reimar.Doeffinger at gmx.de>
This requests loops to be vectorized using SIMD
instructions.
The performance increase is far from hand-optimized
assembly but still significant over the plain C version.
Typical values are a 2-4x speedup where a hand-written
version would achieve 4x-10x.
So it is far from a replacement, however some architectures
will get hand-written assembler quite late or not at all,
and this is a good improvement for a trivial amount of work.
The cause of the remaining gap, besides the compiler being a compiler, is
usually that it does not manage to use saturating instructions
and thus has to use 32-bit operations where actually
saturating 16-bit operations would be sufficient.
Other causes are for example the av_clip functions that
are not ideal for vectorization (and even as scalar code
not optimal for any modern CPU that has either CSEL or
MAX/MIN instructions).
And of course this only works for relatively simple
loops, the IDCT functions for example seemed not possible
to optimize that way.
Also note that while clang may accept the code and sometimes
produces warnings, it does not seem to do anything actually
useful at all.
Here are example measurements using gcc 10 under Linux (in a VM unfortunately)
on AArch64 on Apple M1:
Command:
time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 -noframedrop
Original code:
real 0m19.572s
user 0m23.386s
sys 0m0.213s
Changing all put_hevc:
real 0m15.648s
user 0m19.503s (83.4% of original)
sys 0m0.186s
In addition changing add_residual:
real 0m15.424s
user 0m19.278s (82.4% of original)
sys 0m0.133s
In addition changing planar copy dither:
real 0m15.040s
user 0m18.874s (80.7% of original)
sys 0m0.168s
Signed-off-by: Reimar Döffinger <Reimar.Doeffinger at gmx.de>
---
configure | 23 +++++++++++++++++
libavcodec/hevcdsp_template.c | 47 +++++++++++++++++++++++++++++++++++
libavutil/internal.h | 6 +++++
libswscale/swscale_unscaled.c | 3 +++
4 files changed, 79 insertions(+)
diff --git a/configure b/configure
index 900505756b..73b7c3daeb 100755
--- a/configure
+++ b/configure
@@ -406,6 +406,7 @@ Toolchain options:
--enable-pic build position-independent code
--enable-thumb compile for Thumb instruction set
--enable-lto use link-time optimization
+ --enable-openmp-simd use the "omp simd" pragma to optimize code
--env="ENV=override" override the environment variables
Advanced options (experts only):
@@ -2335,6 +2336,7 @@ HAVE_LIST="
opencl_dxva2
opencl_vaapi_beignet
opencl_vaapi_intel_media
+ openmp_simd
perl
pod2man
texi2html
@@ -2446,6 +2448,7 @@ CMDLINE_SELECT="
extra_warnings
logging
lto
+ openmp_simd
optimizations
rpath
stripping
@@ -6926,6 +6929,26 @@ if enabled lto; then
disable inline_asm_direct_symbol_refs
fi
+if enabled openmp_simd; then
+ ompopt="-fopenmp"
+ if ! test_cflags $ompopt ; then
+ test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -fopenmp"
+ fi
+ test_cc $ompopt <<EOF && add_cflags "$ompopt" || die "failed to enable openmp SIMD"
+#ifndef _OPENMP
+#error _OPENMP is not defined
+#endif
+void test(unsigned char *c)
+{
+ _Pragma("omp simd")
+ for (int i = 0; i < 256; i++)
+ {
+ c[i] *= 16;
+ }
+}
+EOF
+fi
+
enabled ftrapv && check_cflags -ftrapv
test_cc -mno-red-zone <<EOF && noredzone_flags="-mno-red-zone"
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 56cd9e605d..1a8b4160ec 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -50,6 +50,7 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
stride /= sizeof(pixel);
for (y = 0; y < size; y++) {
+ FF_OMP_SIMD
for (x = 0; x < size; x++) {
dst[x] = av_clip_pixel(dst[x] + *res);
res++;
@@ -247,6 +248,7 @@ static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
int16_t *src = coeffs; \
IDCT_VAR ## H(H); \
\
+ FF_OMP_SIMD \
for (i = 0; i < H; i++) { \
TR_ ## H(src, src, H, H, SCALE, limit2); \
if (limit2 < H && i%4 == 0 && !!i) \
@@ -256,6 +258,7 @@ static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
\
shift = 20 - BIT_DEPTH; \
add = 1 << (shift - 1); \
+ FF_OMP_SIMD \
for (i = 0; i < H; i++) { \
TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
coeffs += H; \
@@ -502,6 +505,7 @@ static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = src[x] << (14 - BIT_DEPTH);
src += srcstride;
@@ -543,6 +547,7 @@ static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ui
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
src += srcstride;
@@ -568,6 +573,7 @@ static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
src += srcstride;
@@ -592,6 +598,7 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++) {
dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
}
@@ -623,6 +630,7 @@ static void FUNC(put_hevc_qpel_h)(int16_t *dst,
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -639,6 +647,7 @@ static void FUNC(put_hevc_qpel_v)(int16_t *dst,
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
const int8_t *filter = ff_hevc_qpel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -662,6 +671,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
src -= QPEL_EXTRA_BEFORE * srcstride;
filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height + QPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -671,6 +681,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
filter = ff_hevc_qpel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
tmp += MAX_PB_SIZE;
@@ -697,6 +708,7 @@ static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
src += srcstride;
@@ -724,6 +736,7 @@ static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
src += srcstride;
@@ -751,6 +764,7 @@ static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
src += srcstride;
@@ -779,6 +793,7 @@ static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
src += srcstride;
@@ -810,6 +825,7 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
src -= QPEL_EXTRA_BEFORE * srcstride;
filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height + QPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -820,6 +836,7 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
filter = ff_hevc_qpel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
tmp += MAX_PB_SIZE;
@@ -849,6 +866,7 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
src -= QPEL_EXTRA_BEFORE * srcstride;
filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height + QPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -859,6 +877,7 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
filter = ff_hevc_qpel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
tmp += MAX_PB_SIZE;
@@ -887,6 +906,7 @@ static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
src += srcstride;
@@ -913,6 +933,7 @@ static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -942,6 +963,7 @@ static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
src += srcstride;
@@ -968,6 +990,7 @@ static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1000,6 +1023,7 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
src -= QPEL_EXTRA_BEFORE * srcstride;
filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height + QPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1011,6 +1035,7 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
tmp += MAX_PB_SIZE;
@@ -1037,6 +1062,7 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
src -= QPEL_EXTRA_BEFORE * srcstride;
filter = ff_hevc_qpel_filters[mx - 1];
for (y = 0; y < height + QPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1049,6 +1075,7 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1076,6 +1103,7 @@ static void FUNC(put_hevc_epel_h)(int16_t *dst,
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
const int8_t *filter = ff_hevc_epel_filters[mx - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1093,6 +1121,7 @@ static void FUNC(put_hevc_epel_v)(int16_t *dst,
const int8_t *filter = ff_hevc_epel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1114,6 +1143,7 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst,
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1124,6 +1154,7 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst,
filter = ff_hevc_epel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
tmp += MAX_PB_SIZE;
@@ -1148,6 +1179,7 @@ static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
src += srcstride;
@@ -1173,6 +1205,7 @@ static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++) {
dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
}
@@ -1199,6 +1232,7 @@ static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
src += srcstride;
@@ -1224,6 +1258,7 @@ static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
#endif
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
dst += dststride;
@@ -1253,6 +1288,7 @@ static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1263,6 +1299,7 @@ static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint
filter = ff_hevc_epel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
tmp += MAX_PB_SIZE;
@@ -1292,6 +1329,7 @@ static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1302,6 +1340,7 @@ static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
filter = ff_hevc_epel_filters[my - 1];
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
tmp += MAX_PB_SIZE;
@@ -1328,6 +1367,7 @@ static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uin
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++) {
dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
}
@@ -1353,6 +1393,7 @@ static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1380,6 +1421,7 @@ static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uin
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++) {
dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
}
@@ -1405,6 +1447,7 @@ static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1435,6 +1478,7 @@ static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1446,6 +1490,7 @@ static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui
ox = ox * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
tmp += MAX_PB_SIZE;
@@ -1472,6 +1517,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
src += srcstride;
@@ -1484,6 +1530,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
ox0 = ox0 * (1 << (BIT_DEPTH - 8));
ox1 = ox1 * (1 << (BIT_DEPTH - 8));
for (y = 0; y < height; y++) {
+ FF_OMP_SIMD
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 93ea57c324..b0543bbf02 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -299,4 +299,10 @@ int avpriv_dict_set_timestamp(AVDictionary **dict, const char *key, int64_t time
#define FF_PSEUDOPAL 0
#endif
+#if HAVE_OPENMP_SIMD
+#define FF_OMP_SIMD _Pragma("omp simd")
+#else
+#define FF_OMP_SIMD
+#endif
+
#endif /* AVUTIL_INTERNAL_H */
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index c4dd8a4d83..c112a61037 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1743,6 +1743,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
unsigned shift= src_depth-dst_depth, tmp;\
if (c->dither == SWS_DITHER_NONE) {\
for (i = 0; i < height; i++) {\
+ FF_OMP_SIMD \
for (j = 0; j < length-7; j+=8) {\
dst[j+0] = dbswap(bswap(src[j+0])>>shift);\
dst[j+1] = dbswap(bswap(src[j+1])>>shift);\
@@ -1762,6 +1763,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
} else if (shiftonly) {\
for (i = 0; i < height; i++) {\
const uint8_t *dither= dithers[shift-1][i&7];\
+ FF_OMP_SIMD \
for (j = 0; j < length-7; j+=8) {\
tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
@@ -1781,6 +1783,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
} else {\
for (i = 0; i < height; i++) {\
const uint8_t *dither= dithers[shift-1][i&7];\
+ FF_OMP_SIMD \
for (j = 0; j < length-7; j+=8) {\
tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\
tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\
--
2.24.3 (Apple Git-128)
More information about the ffmpeg-devel
mailing list