[FFmpeg-devel] [PATCH 10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

James Darnley jdarnley at obe.tv
Thu Nov 9 13:58:36 EET 2017


From: James Darnley <james.darnley at gmail.com>

Also adjust alignment requirements where nessecary.
---
Whether this patch is committed or not the change to 4xm.c should be picked to
master because the alignment is wrong for the AVX version of this function.  I
assume it hasn't been noticed yet because it manages to be 32-byte aligned
without intervention.

 libavcodec/4xm.c               |  2 +-
 libavcodec/asv.h               |  3 ++-
 libavcodec/bink.c              |  4 ++--
 libavcodec/dnxhdenc.h          |  2 +-
 libavcodec/eamad.c             |  2 +-
 libavcodec/eatqi.c             |  2 +-
 libavcodec/g2meet.c            |  2 +-
 libavcodec/ituh263dec.c        |  2 +-
 libavcodec/mdec.c              |  2 +-
 libavcodec/mimic.c             |  2 +-
 libavcodec/mjpegdec.h          |  3 ++-
 libavcodec/proresdec2.c        |  6 +++---
 libavcodec/speedhq.c           |  2 +-
 libavcodec/wmv2.h              |  2 +-
 libavcodec/x86/blockdsp.asm    | 14 ++++++++++++++
 libavcodec/x86/blockdsp_init.c |  8 ++++++++
 libavutil/internal.h           |  6 ++++++
 tests/checkasm/blockdsp.c      |  4 ++--
 18 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index 5547dfd87f..a97f051232 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -145,7 +145,7 @@ typedef struct FourXContext {
     int mv[256];
     VLC pre_vlc;
     int last_dc;
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     void *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     int version;
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index a1366b6fe4..024ffa349a 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -35,6 +35,7 @@
 #include "bswapdsp.h"
 #include "fdctdsp.h"
 #include "idctdsp.h"
+#include "internal.h"
 #include "get_bits.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
@@ -54,7 +55,7 @@ typedef struct ASV1Context {
     int mb_height;
     int mb_width2;
     int mb_height2;
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     uint16_t intra_matrix[64];
     int q_intra_matrix[64];
     uint8_t *bitstream_buffer;
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index c4cf617a8b..eee19512f7 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -818,7 +818,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
     int v, col[2];
     const uint8_t *scan;
     int xoff, yoff;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64];
     int ybias = is_key ? -15 : 0;
@@ -985,7 +985,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
     uint8_t *dst, *prev, *ref_start, *ref_end;
     int v, col[2];
     const uint8_t *scan;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64], quant_idx, coef_count, coef_idx[64];
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index 963821ac81..4567a4aa8a 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -74,7 +74,7 @@ typedef struct DNXHDEncContext {
     unsigned min_padding;
     int intra_quant_bias;
 
-    DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, blocks)[12][64];
     DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10
     DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 7f28abbafe..0f638b43f6 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -54,7 +54,7 @@ typedef struct MadContext {
     GetBitContext gb;
     void *bitstream_buf;
     unsigned int bitstream_buf_size;
-    DECLARE_ALIGNED(32, int16_t, block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[64];
     ScanTable scantable;
     uint16_t quant_matrix[64];
     int mb_x;
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index 1a847a35da..9527bc2713 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -51,7 +51,7 @@ typedef struct TqiContext {
     uint16_t intra_matrix[64];
     int last_dc[3];
 
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 } TqiContext;
 
 static av_cold int tqi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 842095ba3b..57ff92c002 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -122,7 +122,7 @@ typedef struct JPGContext {
 
     VLC        dc_vlc[2], ac_vlc[2];
     int        prev_dc[3];
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 
     uint8_t    *buf;
 } JPGContext;
diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c
index fc95a532ce..c5079fe30d 100644
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -574,7 +574,7 @@ not_coded:
 
 static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
-    LOCAL_ALIGNED_32(int16_t, dblock, [64]);
+    LOCAL_ALIGNED_64(int16_t, dblock, [64]);
     int i, mbi;
     int bli[6];
 
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index 330b761279..51708cdf34 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -48,7 +48,7 @@ typedef struct MDECContext {
     int mb_width;
     int mb_height;
     int mb_x, mb_y;
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64];
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index 1d463e9962..5133860f55 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -49,7 +49,7 @@ typedef struct MimicContext {
 
     ThreadFrame     frames     [16];
 
-    DECLARE_ALIGNED(32, int16_t, dct_block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, dct_block)[64];
 
     GetBitContext   gb;
     ScanTable       scantable;
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index c84a40aa6e..5b1d6c8f94 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -38,6 +38,7 @@
 #include "get_bits.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
+#include "internal.h"
 
 #define MAX_COMPONENTS 4
 
@@ -98,7 +99,7 @@ typedef struct MJpegDecodeContext {
     int got_picture;                                ///< we found a SOF and picture is valid, too.
     int linesize[MAX_COMPONENTS];                   ///< linesize << interlaced
     int8_t *qscale_table;
-    DECLARE_ALIGNED(32, int16_t, block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[64];
     int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
     uint8_t *last_nnz[MAX_COMPONENTS];
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index e9e0153ee9..662b92d1b7 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -368,7 +368,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
                              const int16_t *qmat)
 {
     ProresContext *ctx = avctx->priv_data;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
     GetBitContext gb;
     int i, blocks_per_slice = slice->mb_count<<2;
@@ -402,7 +402,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
                                const int16_t *qmat, int log2_blocks_per_mb)
 {
     ProresContext *ctx = avctx->priv_data;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
     GetBitContext gb;
     int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
@@ -485,7 +485,7 @@ static void decode_slice_alpha(ProresContext *ctx,
 {
     GetBitContext gb;
     int i;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
 
     for (i = 0; i < blocks_per_slice<<2; i++)
diff --git a/libavcodec/speedhq.c b/libavcodec/speedhq.c
index 6d3487ca19..1f70602acc 100644
--- a/libavcodec/speedhq.c
+++ b/libavcodec/speedhq.c
@@ -224,7 +224,7 @@ static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l
 {
     const int *quant_matrix = s->quant_matrix;
     const uint8_t *scantable = s->intra_scantable.permutated;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     int dc_offset;
 
     s->bdsp.clear_block(block);
diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h
index 0f459ae5ae..eb3e808ba2 100644
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -51,7 +51,7 @@ typedef struct Wmv2Context {
     int hshift;
 
     ScanTable abt_scantable[2];
-    DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, abt_block2)[6][64];
 } Wmv2Context;
 
 void ff_wmv2_common_init(Wmv2Context *w);
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
index 9d0e8a3242..930ef7c833 100644
--- a/libavcodec/x86/blockdsp.asm
+++ b/libavcodec/x86/blockdsp.asm
@@ -52,6 +52,11 @@ CLEAR_BLOCK 1, 8
 INIT_YMM avx
 CLEAR_BLOCK 1, 4
 
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+CLEAR_BLOCK 1, 2
+%endif
+
 ;-----------------------------------------
 ; void ff_clear_blocks(int16_t *blocks);
 ;-----------------------------------------
@@ -68,9 +73,13 @@ cglobal clear_blocks, 1, 2, %1, blocks, len
     mova  [blocksq+lenq+mmsize*3], m0
     mova  [blocksq+lenq+mmsize*4], m0
     mova  [blocksq+lenq+mmsize*5], m0
+%if mmsize != 64
     mova  [blocksq+lenq+mmsize*6], m0
     mova  [blocksq+lenq+mmsize*7], m0
     add   lenq, mmsize*8
+%else
+    add   lenq, mmsize*6
+%endif
     js .loop
     RET
 %endmacro
@@ -83,3 +92,8 @@ INIT_XMM sse
 CLEAR_BLOCKS 1
 INIT_YMM avx
 CLEAR_BLOCKS 1
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+CLEAR_BLOCKS 1
+%endif
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
index 8b01a447cd..2bcf5320d7 100644
--- a/libavcodec/x86/blockdsp_init.c
+++ b/libavcodec/x86/blockdsp_init.c
@@ -29,9 +29,11 @@
 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);
 void ff_clear_block_avx(int16_t *block);
+void ff_clear_block_avx512(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 void ff_clear_blocks_avx(int16_t *blocks);
+void ff_clear_blocks_avx512(int16_t *blocks);
 
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                   AVCodecContext *avctx)
@@ -56,5 +58,11 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
         c->clear_block  = ff_clear_block_avx;
         c->clear_blocks = ff_clear_blocks_avx;
     }
+
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        c->clear_block  = ff_clear_block_avx512;
+        c->clear_blocks = ff_clear_blocks_avx512;
+    }
+
 #endif /* HAVE_X86ASM */
 }
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 6f92f71e8e..40c76a1fc4 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -137,6 +137,12 @@
 #   define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_A(32, t, v, __VA_ARGS__,,))
 #endif
 
+#if HAVE_LOCAL_ALIGNED
+#   define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
+#else
+#   define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_A(64, t, v, __VA_ARGS__,,))
+#endif
+
 #define FF_ALLOC_OR_GOTO(ctx, p, size, label)\
 {\
     p = av_malloc(size);\
diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index c753506b3c..2344dbdbed 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -53,8 +53,8 @@ do {                                                                \
 
 void checkasm_check_blockdsp(void)
 {
-    LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
-    LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]);
+    LOCAL_ALIGNED_64(uint16_t, buf0, [6 * 8 * 8]);
+    LOCAL_ALIGNED_64(uint16_t, buf1, [6 * 8 * 8]);
 
     AVCodecContext avctx = { 0 };
     BlockDSPContext h;
-- 
2.15.0



More information about the ffmpeg-devel mailing list