[FFmpeg-cvslog] Merge commit '112cee0241f5799edff0e4682b9e8639b046dc78'

Clément Bœsch git at videolan.org
Thu Mar 23 16:59:43 EET 2017


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Thu Mar 23 15:39:16 2017 +0100| [947230837cb6d64323590650554dad7abaf9a93f] | committer: Clément Bœsch

Merge commit '112cee0241f5799edff0e4682b9e8639b046dc78'

* commit '112cee0241f5799edff0e4682b9e8639b046dc78':
  hevc: Add SSE2 and AVX IDCT

Merged-by: Clément Bœsch <u at pkh.me>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=947230837cb6d64323590650554dad7abaf9a93f
---

 doc/libav-merge.txt           |   1 +
 libavcodec/x86/hevc_idct.asm  | 800 ++++++++++++++++++++++++++++++++++++++++--
 libavcodec/x86/hevcdsp_init.c |  53 ++-
 3 files changed, 811 insertions(+), 43 deletions(-)
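
For readers who want to map the new assembly below onto the underlying math,
here is a rough scalar sketch (mine, not part of the patch) of one pass of the
HEVC 4x4 inverse transform that the TR_4x4 macro vectorizes, using the
64/83/36 constants declared in the new data section. The real code runs one
pass with shift = 7, transposes, then a second pass with shift = 20 - bit_depth:

    #include "libavutil/common.h"   /* av_clip_int16() */

    /* One butterfly pass over the four columns of a row-major 4x4 block. */
    static void idct4_pass(int16_t coeffs[16], int shift)
    {
        const int add = 1 << (shift - 1);
        for (int col = 0; col < 4; col++) {
            int s0 = coeffs[0 * 4 + col], s1 = coeffs[1 * 4 + col];
            int s2 = coeffs[2 * 4 + col], s3 = coeffs[3 * 4 + col];
            int e0 = 64 * (s0 + s2);            /* even part */
            int e1 = 64 * (s0 - s2);
            int o0 = 83 * s1 + 36 * s3;         /* odd part */
            int o1 = 36 * s1 - 83 * s3;
            coeffs[0 * 4 + col] = av_clip_int16((e0 + o0 + add) >> shift);
            coeffs[1 * 4 + col] = av_clip_int16((e1 + o1 + add) >> shift);
            coeffs[2 * 4 + col] = av_clip_int16((e1 - o1 + add) >> shift);
            coeffs[3 * 4 + col] = av_clip_int16((e0 - o0 + add) >> shift);
        }
    }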

diff --git a/doc/libav-merge.txt b/doc/libav-merge.txt
index 577206f..4cb3e6e 100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@ -110,3 +110,4 @@ Extra changes needed to be aligned with Libav:
 
 - Switching our examples to the new encode/decode API (see 67d28f4a0f)
 - AC3 speed-up for our fixed version (see a9ba59591e)
+- HEVC IDCT 12-bit bit-depth support (Libav added 8 and 10 but doesn't have 12)
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index 33b437c..1eb1973 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -2,6 +2,7 @@
 ;* SIMD-optimized IDCT functions for HEVC decoding
 ;* Copyright (c) 2014 Pierre-Edouard LEPERE
 ;* Copyright (c) 2014 James Almer
+;* Copyright (c) 2016 Alexandra Hájková
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -22,6 +23,217 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+pd_64: times 4 dd 64
+pd_2048: times 4 dd 2048
+pd_512: times 4 dd 512
+
+; 4x4 transform coeffs
+cextern pw_64
+pw_64_m64: times 4 dw 64, -64
+pw_83_36: times 4 dw 83, 36
+pw_36_m83: times 4 dw 36, -83
+
+; 8x8 transform coeffs
+pw_89_75: times 4 dw 89, 75
+pw_50_18: times 4 dw 50, 18
+
+pw_75_m18: times 4 dw 75, -18
+pw_m89_m50: times 4 dw -89, -50
+
+pw_50_m89: times 4 dw 50, -89
+pw_18_75: times 4 dw 18, 75
+
+pw_18_m50: times 4 dw 18, -50
+pw_75_m89: times 4 dw 75, -89
+
+; 16x16 transformation coeffs
+trans_coeffs16: times 4 dw 90, 87
+times 4 dw 80, 70
+times 4 dw 57, 43
+times 4 dw 25, 9
+
+times 4 dw 87, 57
+times 4 dw 9, -43
+times 4 dw -80, -90
+times 4 dw -70, -25
+
+times 4 dw 80, 9
+times 4 dw -70, -87
+times 4 dw -25, 57
+times 4 dw 90, 43
+
+times 4 dw 70, -43
+times 4 dw -87, 9
+times 4 dw 90, 25
+times 4 dw -80, -57
+
+times 4 dw 57, -80
+times 4 dw -25, 90
+times 4 dw -9, -87
+times 4 dw 43, 70
+
+times 4 dw 43, -90
+times 4 dw 57, 25
+times 4 dw -87, 70
+times 4 dw 9, -80
+
+times 4 dw 25, -70
+times 4 dw 90, -80
+times 4 dw 43, 9
+times 4 dw -57, 87
+
+times 4 dw 9, -25
+times 4 dw 43, -57
+times 4 dw 70, -80
+times 4 dw 87, -90
+
+; 32x32 transform coeffs
+trans_coeff32: times 8 dw 90
+times 4 dw 88, 85
+times 4 dw 82, 78
+times 4 dw 73, 67
+times 4 dw 61, 54
+times 4 dw 46, 38
+times 4 dw 31, 22
+times 4 dw 13, 4
+
+times 4 dw 90, 82
+times 4 dw 67, 46
+times 4 dw 22, -4
+times 4 dw -31, -54
+times 4 dw -73, -85
+times 4 dw -90, -88
+times 4 dw -78, -61
+times 4 dw -38, -13
+
+times 4 dw 88, 67
+times 4 dw 31, -13
+times 4 dw -54, -82
+times 4 dw -90, -78
+times 4 dw -46, -4
+times 4 dw 38, 73
+times 4 dw 90, 85
+times 4 dw 61, 22
+
+times 4 dw 85, 46
+times 4 dw -13, -67
+times 4 dw -90, -73
+times 4 dw -22, 38
+times 4 dw 82, 88
+times 4 dw 54, -4
+times 4 dw -61, -90
+times 4 dw -78, -31
+
+times 4 dw 82, 22
+times 4 dw -54, -90
+times 4 dw -61, 13
+times 4 dw 78, 85
+times 4 dw 31, -46
+times 4 dw -90, -67
+times 4 dw 4, 73
+times 4 dw 88, 38
+
+times 4 dw 78, -4
+times 4 dw -82, -73
+times 4 dw 13, 85
+times 4 dw 67, -22
+times 4 dw -88, -61
+times 4 dw 31, 90
+times 4 dw 54, -38
+times 4 dw -90, -46
+
+times 4 dw 73, -31
+times 4 dw -90, -22
+times 4 dw 78, 67
+times 4 dw -38, -90
+times 4 dw -13, 82
+times 4 dw 61, -46
+times 4 dw -88, -4
+times 4 dw 85, 54
+
+times 4 dw 67, -54
+times 4 dw -78, 38
+times 4 dw 85, -22
+times 4 dw -90, 4
+times 4 dw 90, 13
+times 4 dw -88, -31
+times 4 dw 82, 46
+times 4 dw -73, -61
+
+times 4 dw 61, -73
+times 4 dw -46, 82
+times 4 dw 31, -88
+times 4 dw -13, 90
+times 4 dw -4, -90
+times 4 dw 22, 85
+times 4 dw -38, -78
+times 4 dw 54, 67
+
+times 4 dw 54, -85
+times 4 dw -4, 88
+times 4 dw -46, -61
+times 4 dw 82, 13
+times 4 dw -90, 38
+times 4 dw 67, -78
+times 4 dw -22, 90
+times 4 dw -31, -73
+
+times 4 dw 46, -90
+times 4 dw 38, 54
+times 4 dw -90, 31
+times 4 dw 61, -88
+times 4 dw 22, 67
+times 4 dw -85, 13
+times 4 dw 73, -82
+times 4 dw 4, 78
+
+times 4 dw 38, -88
+times 4 dw 73, -4
+times 4 dw -67, 90
+times 4 dw -46, -31
+times 4 dw 85, -78
+times 4 dw 13, 61
+times 4 dw -90, 54
+times 4 dw 22, -82
+
+times 4 dw 31, -78
+times 4 dw 90, -61
+times 4 dw 4, 54
+times 4 dw -88, 82
+times 4 dw -38, -22
+times 4 dw 73, -90
+times 4 dw 67, -13
+times 4 dw -46, 85
+
+times 4 dw 22, -61
+times 4 dw 85, -90
+times 4 dw 73, -38
+times 4 dw -4, 46
+times 4 dw -78, 90
+times 4 dw -82, 54
+times 4 dw -13, -31
+times 4 dw 67, -88
+
+times 4 dw 13, -38
+times 4 dw 61, -78
+times 4 dw 88, -90
+times 4 dw 85, -73
+times 4 dw 54, -31
+times 4 dw 4, 22
+times 4 dw -46, 67
+times 4 dw -82, 90
+
+times 4 dw 4, -13
+times 4 dw 22, -31
+times 4 dw 38, -46
+times 4 dw 54, -61
+times 4 dw 67, -73
+times 4 dw 78, -82
+times 4 dw 85, -88
+times 4 dw 90, -90
+
 SECTION .text
 
 ; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
@@ -74,50 +286,568 @@ cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
     RET
 %endmacro
 
-; 8-bit
-INIT_MMX mmxext
-IDCT_DC_NL  4,      8
-IDCT_DC     8,  2,  8
+; IDCT 4x4, expects input in m0, m1
+; %1 - shift
+; %2 - 1/0 - SCALE and Transpose or not
+; %3 - 1/0 add constant or not
+%macro TR_4x4 3
+    ; interleaves src0 with src2 to m0
+    ;         and src1 with src3 to m1
+    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
+    ; src1: 10 11 12 13 -->
+    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
+    ; src3: 30 31 32 33
 
-INIT_XMM sse2
-IDCT_DC_NL  8,      8
-IDCT_DC    16,  4,  8
-IDCT_DC    32, 16,  8
+    SBUTTERFLY wd, 0, 1, 2
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-IDCT_DC    16,  2,  8
-IDCT_DC    32,  8,  8
-%endif ;HAVE_AVX2_EXTERNAL
+    pmaddwd m2, m0, [pw_64]    ; e0
+    pmaddwd m3, m1, [pw_83_36] ; o0
+    pmaddwd m0, [pw_64_m64]    ; e1
+    pmaddwd m1, [pw_36_m83]    ; o1
 
-; 10-bit
-INIT_MMX mmxext
-IDCT_DC_NL  4,     10
-IDCT_DC     8,  2, 10
+%if %3 == 1
+    %assign %%add 1 << (%1 - 1)
+    mova  m4, [pd_ %+ %%add]
+    paddd m2, m4
+    paddd m0, m4
+%endif
 
-INIT_XMM sse2
-IDCT_DC_NL  8,     10
-IDCT_DC    16,  4, 10
-IDCT_DC    32, 16, 10
+    SUMSUB_BADC d, 3, 2, 1, 0, 4
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-IDCT_DC    16,  2, 10
-IDCT_DC    32,  8, 10
-%endif ;HAVE_AVX2_EXTERNAL
+%if %2 == 1
+    psrad m3, %1 ; e0 + o0
+    psrad m1, %1 ; e1 + o1
+    psrad m2, %1 ; e0 - o0
+    psrad m0, %1 ; e1 - o1
+    ;clip16
+    packssdw m3, m1
+    packssdw m0, m2
+    ; Transpose
+    SBUTTERFLY wd, 3, 0, 1
+    SBUTTERFLY wd, 3, 0, 1
+    SWAP 3, 1, 0
+%else
+    SWAP 3, 2, 0
+%endif
+%endmacro
+
+%macro DEFINE_BIAS 1
+    %assign shift (20 - %1)
+    %assign c_add (1 << (shift - 1))
+    %define arr_add pd_ %+ c_add
+%endmacro
+
+; %1 - bit_depth
+; %2 - register the add constant is loaded to
+; shift = 20 - bit_depth
+%macro LOAD_BIAS 2
+    DEFINE_BIAS %1
+    mova %2, [arr_add]
+%endmacro
+
+; %1, %2 - registers to load packed 16 bit values to
+; %3, %4, %5, %6 - vertical offsets
+; %7 - horizontal offset
+%macro LOAD_BLOCK 7
+    movq   %1, [r0 + %3 + %7]
+    movhps %1, [r0 + %5 + %7]
+    movq   %2, [r0 + %4 + %7]
+    movhps %2, [r0 + %6 + %7]
+%endmacro
+
+; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_4x4 1
+cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
+    mova m0, [coeffsq]
+    mova m1, [coeffsq + 16]
+
+    TR_4x4 7, 1, 1
+    TR_4x4 20 - %1, 1, 1
+
+    mova [coeffsq],      m0
+    mova [coeffsq + 16], m1
+    RET
+%endmacro
+
+; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
+; 4 at one time (4 columns)                        1 e8[1] + o8[1]
+; from %5: e8/16 + o8/16, with %1 offset                  ...
+; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
+; %4 - shift                                       7 e8[0] - o8[0] --> + %2
+%macro STORE_8 7
+    psrad    %5, %4
+    psrad    %3, %4
+    packssdw %5, %3
+    movq     [coeffsq + %1], %5
+    movhps   [coeffsq + %2], %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - shift
+; %3, %4 - transform coeffs
+; %5 - vertical offset for e8 + o8
+; %6 - vertical offset for e8 - o8
+; %7 - register with e8 inside
+; %8 - block_size
+; %9 - register to store e8 +o8
+; %10 - register to store e8 - o8
+%macro E8_O8 10
+    pmaddwd m6, m4, %3
+    pmaddwd m7, m5, %4
+
+    paddd m6, m7
+    paddd m7, m6, %7 ; o8 + e8
+    psubd %7, m6     ; e8 - o8
+%if %8 == 8
+    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
+%else
+    SWAP m7, %9
+    SWAP %7, %10
+%endif
+%endmacro
+
+; 8x4 residuals are processed and stored
+; %1 - horizontal offset
+; %2 - shift
+; %3 - offset of the even row
+; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
+; %5 - offset of the odd row
+; %6 - block size
+; %7 - 1/0 add a constant in TR_4x4 or not
+; a constant is added for the 8x8 transform but not for 16x16 and 32x32
+%macro TR_8x4 7
+    ; load 4 columns of even rows
+    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
+
+    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
+
+    ; load 4 columns of odd rows
+    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
+
+    ; 00 01 02 03
+    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
+
+    ; ...        -- >
+    ;                  m5: 50 70 51 71 52 72 53 73
+    ; 70 71 72 73
+    SBUTTERFLY wd, 4, 5, 6
+
+    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8, m15
+    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9, m14
+    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
+    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
+%endmacro
+
+%macro STORE_PACKED 7
+    movq   [r0 + %3 + %7], %1
+    movhps [r0 + %4 + %7], %1
+    movq   [r0 + %5 + %7], %2
+    movhps [r0 + %6 + %7], %2
+%endmacro
+
+; transpose 4x4 block packed
+; in %1 and %2 registers
+; %3 - temporary register
+%macro TRANSPOSE_4x4 3
+    SBUTTERFLY wd, %1, %2, %3
+    SBUTTERFLY dq, %1, %2, %3
+%endmacro
+
+; %1 - horizontal offset of the block i
+; %2 - vertical offset of the block i
+; %3 - width in bytes
+; %4 - vertical offset for the block j
+; %5 - horizontal offset for the block j
+%macro SWAP_BLOCKS 5
+    ; M_j
+    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+    TRANSPOSE_4x4 4, 5, 6
+
+    ; M_i
+    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+
+    ; transpose and store M_i
+    SWAP m6, m4
+    SWAP m7, m5
+    TRANSPOSE_4x4 4, 5, 6
+    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+%endmacro
+
+; %1 - horizontal offset
+; %2 - vertical offset of the block
+; %3 - width in bytes
+%macro TRANSPOSE_BLOCK 3
+    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+    TRANSPOSE_4x4 4, 5, 6
+    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+%endmacro
+
+%macro TRANSPOSE_8x8 0
+cglobal hevc_idct_transpose_8x8, 0, 0, 0
+    ; M1 M2 ^T = M1^t M3^t
+    ; M3 M4      M2^t M4^t
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 16
+
+    ; M2 and M3
+    SWAP_BLOCKS 0, 64, 16, 0, 8
+
+    ; M4
+    TRANSPOSE_BLOCK 8, 64, 16
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_8x8 1
+cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
+    TR_8x4 0, 7, 32, 1, 16, 8, 1
+    TR_8x4 8, 7, 32, 1, 16, 8, 1
+
+    call hevc_idct_transpose_8x8_ %+ cpuname
+
+    DEFINE_BIAS %1
+    TR_8x4 0, shift, 32, 1, 16, 8, 1
+    TR_8x4 8, shift, 32, 1, 16, 8, 1
+
+    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
+%endmacro
+
+; store intermediate e32 coeffs on stack
+; as 16x4 matrix
+; from %5: e16 + o16, with %6 offset
+; and  %3: e16 - o16, with %7 offset
+; %4 - shift, unused here
+%macro STORE_16 7
+    mova [rsp + %6], %5
+    mova [rsp + %7], %3
+%endmacro
+
+; %1, %2 - transform constants
+; %3, %4 - regs with interleaved coeffs
+; %5 - 1/0 SWAP or add
+; %6, %7 - registers for intermediate sums
+; %8 - accumulator register
+%macro ADD_ROWS 8
+    pmaddwd %6, %3, %1
+    pmaddwd %7, %4, %2
+    paddd   %6, %7
+%if %5 == 1
+    SWAP %6, %8
+%else
+    paddd %8, %6
+%endif
+%endmacro
+
+; %1 - transform coeffs
+; %2, %3 offsets for storing e+o/e-o back to coeffsq
+; %4 - shift
+; %5 - add
+; %6 - block_size
+; %7 - register with e16
+; %8, %9 - stack offsets for storing e+o/e-o
+%macro E16_O16 9
+    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
+    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
+
+%if %6 == 8
+    paddd %7, %5
+%endif
+
+    paddd m4, m7, %7 ; o16 + e16
+    psubd %7, m7     ; e16 - o16
+    STORE_%6 %2, %3, %7, %4, m4, %8, %9
+%endmacro
+
+%macro TR_16x4 10
+    ; produce 8x4 matrix of e16 coeffs
+    ; for 4 first rows and store it on stack (128 bytes)
+    TR_8x4 %1, 7, %4, %5, %6, %8, 0
+
+    ; load 8 even rows
+    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
+    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
+
+    SBUTTERFLY wd, 0, 1, 4
+    SBUTTERFLY wd, 2, 3, 4
+
+    E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
+    mova m8, %3
+    E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
+    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
+    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
+    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
+    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
+    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
+    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
+%endmacro
+
+%macro TRANSPOSE_16x16 0
+cglobal hevc_idct_transpose_16x16, 0, 0, 0
+; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
+; M5  M6  M7  M8    -->  m2 m6 m10 m14
+; M9  M10 M11 M12        m3 m7 m11 m15
+; M13 M14 M15 M16        m4 m8 m12 m16
+
+    ; M1 4x4 block
+    TRANSPOSE_BLOCK 0, 0, 32
+
+    ; M5, M2
+    SWAP_BLOCKS 0, 128, 32, 0, 8
+    ; M9, M3
+    SWAP_BLOCKS 0, 256, 32, 0, 16
+    ; M13, M4
+    SWAP_BLOCKS 0, 384, 32, 0, 24
+
+    ;M6
+    TRANSPOSE_BLOCK 8, 128, 32
+
+    ; M10, M7
+    SWAP_BLOCKS 8, 256, 32, 128, 16
+    ; M14, M8
+    SWAP_BLOCKS 8, 384, 32, 128, 24
+
+    ;M11
+    TRANSPOSE_BLOCK 16, 256, 32
+
+    ; M15, M12
+    SWAP_BLOCKS 16, 384, 32, 256, 24
+
+    ;M16
+    TRANSPOSE_BLOCK 24, 384, 32
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_16x16 1
+cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
+    mov r1d, 3
+.loop16:
+    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
+    dec r1d
+    jge .loop16
 
-; 12-bit
+    call hevc_idct_transpose_16x16_ %+ cpuname
+
+    DEFINE_BIAS %1
+    mov r1d, 3
+.loop16_2:
+    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
+    dec r1d
+    jge .loop16_2
+
+    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
+%endmacro
+
+; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
+; 4 at one time (4 columns)                        1 e32[1] + o32[1]
+; %1 - address to store e32 + o32
+; %2 - address to store e32 - o32
+; %5 - reg with e32 + o32                                  ...
+; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
+; %4 - shift                                       31 e32[0] - o32[0] --> %2
+%macro STORE_32 5
+    psrad    %5, %4
+    psrad    %3, %4
+    packssdw %5, %3
+    movq     [%1], %5
+    movhps   [%2], %5
+%endmacro
+
+; %1 - transform coeffs
+; %2, %3 - offsets for storing e+o/e-o back to coeffsq
+; %4 - shift
+; %5 - stack offset of e32
+%macro E32_O32 5
+    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10
+    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
+    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
+    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
+
+    paddd m11, m14, [rsp + %5]
+    paddd m12, m10, m11 ; o32 + e32
+    psubd m11, m10      ; e32 - o32
+    STORE_32 %2, %3, m11, %4, m12
+%endmacro
+
+; %1 - horizontal offset
+; %2 - bitdepth
+; %3 - 1/0: first pass (shift 7, bias 64) or second pass (bias from bit depth)
+%macro TR_32x4 3
+    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
+
+    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
+    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
+    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
+    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
+
+    SBUTTERFLY wd, 0, 1, 8
+    SBUTTERFLY wd, 2, 3, 8
+    SBUTTERFLY wd, 4, 5, 8
+    SBUTTERFLY wd, 6, 7, 8
+
+%if %3 == 1
+    %assign shift 7
+    mova m14, [pd_64]
+%else
+    LOAD_BIAS %2, m14
+%endif
+
+    lea r2, [trans_coeff32 + 15 * 128]
+    lea r3, [coeffsq + %1]
+    lea r4, [r3 + 16 * 64]
+    mov r5d, 15 * 16
+%%loop:
+    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
+    sub r2, 128
+    add r4, 64
+    sub r5d, 16
+    jge %%loop
+%endmacro
+
+%macro TRANSPOSE_32x32 0
+cglobal hevc_idct_transpose_32x32, 0, 0, 0
+    ; M0  M1 ... M7
+    ; M8         M15
+    ;
+    ; ...
+    ;
+    ; M56        M63
+
+    TRANSPOSE_BLOCK 0, 0, 64 ; M0
+    mov r1d, 7
+    mov r2d, 7 * 256
+.loop_transpose:
+    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
+    sub r2d, 256
+    dec r1d
+    jg .loop_transpose
+
+    TRANSPOSE_BLOCK 8, 256, 64 ; M9
+    mov r1d, 6
+    mov r2d, 512
+    mov r3d, 16
+.loop_transpose2:
+    SWAP_BLOCKS 8, r2, 64, 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose2
+
+    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
+    mov r1d, 5
+    mov r2d, 768
+    mov r3d, 24
+.loop_transpose3:
+    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose3
+
+    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
+    mov r1d, 4
+    mov r2d, 1024
+    mov r3d, 32
+.loop_transpose4:
+    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose4
+
+    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
+    mov r1d, 3
+    mov r2d, 1280
+    mov r3d, 40
+.loop_transpose5:
+    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
+    add r3d, 8
+    add r2d, 256
+    dec r1d
+    jg .loop_transpose5
+
+    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
+    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
+    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
+    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
+
+    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
+
+    ret
+%endmacro
+
+; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+; %1 = bitdepth
+%macro IDCT_32x32 1
+cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
+    mov r1d, 7
+.loop32:
+    TR_32x4 8 * r1, %1, 1
+    dec r1d
+    jge .loop32
+
+    call hevc_idct_transpose_32x32_ %+ cpuname
+
+    mov r1d, 7
+.loop32_2:
+    TR_32x4 8 * r1, %1, 0
+    dec r1d
+    jge .loop32_2
+
+    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
+%endmacro
+
+%macro INIT_IDCT_DC 1
 INIT_MMX mmxext
-IDCT_DC_NL  4,     12
-IDCT_DC     8,  2, 12
+IDCT_DC_NL  4,      %1
+IDCT_DC     8,  2,  %1
 
 INIT_XMM sse2
-IDCT_DC_NL  8,     12
-IDCT_DC    16,  4, 12
-IDCT_DC    32, 16, 12
+IDCT_DC_NL  8,      %1
+IDCT_DC    16,  4,  %1
+IDCT_DC    32, 16,  %1
 
 %if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-IDCT_DC    16,  2, 12
-IDCT_DC    32,  8, 12
+    INIT_YMM avx2
+    IDCT_DC    16,  2,  %1
+    IDCT_DC    32,  8,  %1
 %endif ;HAVE_AVX2_EXTERNAL
+%endmacro
+
+%macro INIT_IDCT 2
+INIT_XMM %2
+%if %1 == 8
+    TRANSPOSE_8x8
+    %if ARCH_X86_64
+        TRANSPOSE_16x16
+        TRANSPOSE_32x32
+    %endif
+%endif
+%if ARCH_X86_64
+    IDCT_32x32 %1
+    IDCT_16x16 %1
+%endif
+IDCT_8x8 %1
+IDCT_4x4 %1
+%endmacro
+
+INIT_IDCT_DC 8
+INIT_IDCT_DC 10
+INIT_IDCT_DC 12
+INIT_IDCT 8, sse2
+INIT_IDCT 8, avx
+INIT_IDCT 10, sse2
+INIT_IDCT 10, avx
+;INIT_IDCT 12, sse2
+;INIT_IDCT 12, avx
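
A side note on the rounding constants declared at the top of this file (pd_64,
pd_512, pd_2048): DEFINE_BIAS/LOAD_BIAS derive the second-pass shift and bias
from the bit depth. A tiny illustrative helper (mine, not in the patch)
expressing the same relationship:

    /* First pass:  shift = 7,              bias = 64   (pd_64)
     * Second pass: shift = 20 - bit_depth, bias = 1 << (shift - 1)
     *   8-bit  -> shift 12, bias 2048 (pd_2048)
     *   10-bit -> shift 10, bias  512 (pd_512) */
    static inline int hevc_idct_pass2_bias(int bit_depth)
    {
        const int shift = 20 - bit_depth;
        return 1 << (shift - 1);
    }
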
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index c4d9564..0b17671 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -58,18 +58,31 @@ LFL_FUNCS(uint8_t,   8, avx)
 LFL_FUNCS(uint8_t,  10, avx)
 LFL_FUNCS(uint8_t,  12, avx)
 
-#define IDCT_FUNCS(W, opt) \
+#define IDCT_DC_FUNCS(W, opt) \
 void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
 void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
 void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
 
-IDCT_FUNCS(4x4,   mmxext);
-IDCT_FUNCS(8x8,   mmxext);
-IDCT_FUNCS(8x8,   sse2);
-IDCT_FUNCS(16x16, sse2);
-IDCT_FUNCS(32x32, sse2);
-IDCT_FUNCS(16x16, avx2);
-IDCT_FUNCS(32x32, avx2);
+IDCT_DC_FUNCS(4x4,   mmxext);
+IDCT_DC_FUNCS(8x8,   mmxext);
+IDCT_DC_FUNCS(8x8,   sse2);
+IDCT_DC_FUNCS(16x16, sse2);
+IDCT_DC_FUNCS(32x32, sse2);
+IDCT_DC_FUNCS(16x16, avx2);
+IDCT_DC_FUNCS(32x32, avx2);
+
+#define IDCT_FUNCS(opt)                                             \
+void ff_hevc_idct_4x4_8_    ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_4x4_10_   ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_8x8_8_    ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_8x8_10_   ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_16x16_8_  ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_32x32_8_  ## opt(int16_t *coeffs, int col_limit); \
+void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
+
+IDCT_FUNCS(sse2)
+IDCT_FUNCS(avx)
 
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
@@ -709,6 +722,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 
+                c->idct[2] = ff_hevc_idct_16x16_8_sse2;
+                c->idct[3] = ff_hevc_idct_32x32_8_sse2;
             }
             SAO_BAND_INIT(8, sse2);
 
@@ -716,6 +731,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
 
+            c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
+            c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
+
             c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
             c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
             c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
@@ -745,9 +763,15 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+
+                c->idct[2] = ff_hevc_idct_16x16_8_avx;
+                c->idct[3] = ff_hevc_idct_32x32_8_avx;
             }
             SAO_BAND_INIT(8, avx);
 
+            c->idct[0] = ff_hevc_idct_4x4_8_avx;
+            c->idct[1] = ff_hevc_idct_8x8_8_avx;
+
             c->add_residual[1] = ff_hevc_add_residual8_8_avx;
             c->add_residual[2] = ff_hevc_add_residual16_8_avx;
             c->add_residual[3] = ff_hevc_add_residual32_8_avx;
@@ -864,6 +888,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+
+                c->idct[2] = ff_hevc_idct_16x16_10_sse2;
+                c->idct[3] = ff_hevc_idct_32x32_10_sse2;
             }
             SAO_BAND_INIT(10, sse2);
             SAO_EDGE_INIT(10, sse2);
@@ -872,6 +899,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
 
+            c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
+            c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
+
             c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
             c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
             c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
@@ -897,7 +927,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+
+                c->idct[2] = ff_hevc_idct_16x16_10_avx;
+                c->idct[3] = ff_hevc_idct_32x32_10_avx;
             }
+
+            c->idct[0] = ff_hevc_idct_4x4_10_avx;
+            c->idct[1] = ff_hevc_idct_8x8_10_avx;
+
             SAO_BAND_INIT(10, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
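
For context on the hevcdsp_init.c hunks above: the idct[] and idct_dc[] tables
in HEVCDSPContext are indexed by log2(transform size) - 2, so idct[0] is the
4x4 transform and idct[3] the 32x32 one, and the decoder simply calls through
the selected pointer. A minimal sketch of that dispatch (the wrapper name is
mine, for illustration):

    #include "libavcodec/hevcdsp.h"   /* HEVCDSPContext */

    /* idx = log2(size) - 2: 4x4 -> 0, 8x8 -> 1, 16x16 -> 2, 32x32 -> 3 */
    static void call_idct(const HEVCDSPContext *c, int16_t *coeffs,
                          int log2_trafo_size, int col_limit)
    {
        c->idct[log2_trafo_size - 2](coeffs, col_limit);
    }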


======================================================================

diff --cc doc/libav-merge.txt
index 577206f,0000000..4cb3e6e
mode 100644,000000..100644
--- a/doc/libav-merge.txt
+++ b/doc/libav-merge.txt
@@@ -1,112 -1,0 +1,113 @@@
 +CONTEXT
 +=======
 +
 +The FFmpeg project merges all the changes from the Libav project
 +(https://libav.org) since the origin of the fork (around 2011).
 +
 +With the exception of some commits, due to technical/political disagreements or
 +issues, the changes are merged on a more or less regular schedule (daily for
 +years thanks to Michael, but more sparsely nowadays).
 +
 +WHY
 +===
 +
 +The majority of the active developers believe the project needs to keep this
 +policy for various reasons.
 +
 +The most important one is that we don't want our users to have to choose
 +between two distributors of libraries of the exact same name in order to have a
 +different set of features and bugfixes. By taking the responsibility of
 +unifying the two codebases, we allow users to benefit from the changes from the
 +two teams.
 +
 +Today, FFmpeg has a much larger user base (we are distributed by every
 +major distribution), so we consider this mission a priority.
 +
 +A different approach to the merge could have been to pick the changes we are
 +interested in and drop most of the cosmetics and other less important changes.
 +Unfortunately, this makes the following picks much harder, especially since the
 +Libav project is involved in various deep API changes. As a result, we decided
 +to take virtually everything done there.
 +
 +Any Libav developer is of course welcome anytime to contribute directly to the
 +FFmpeg tree. Of course, we fully understand and are forced to accept that very
 +few Libav developers are interested in doing so, but we still want to recognize
 +their work. This leads us to create merge commits for every single commit from
 +Libav. The original commit appears totally unchanged with full authorship in
 +our history (and the conflicts are resolved in the merge commit). That way, not a
 +single thing from Libav will be lost in the future in case some reunification
 +happens, or that project disappears one way or another.
 +
 +DOWNSIDES
 +=========
 +
 +Of course, there are many downsides to this approach.
 +
 +- It causes non-negligible merge commit pollution. We make sure there are
 +  not several levels of merges entangled (we do a 1:1 merge/commit), but it's
 +  still a non-linear history.
 +
 +- Much duplicated work. For instance, we added libavresample in our tree to
 +  keep compatibility with Libav when our libswresample was already covering the
 +  exact same purpose. The same thing happened for various elements such as the
 +  ProRes support (but with differences in features, bugs, licenses, ...). There is
 +  much work to do to unify them, and any help is very much welcome.
 +
 +- So much manpower from both FFmpeg and Libav is lost because of this mess. We
 +  know it, and we don't know how to fix it. It takes incredible time to do
 +  these merges, so we have even less time to work on things we personally care
 +  about. The bad vibes also do not help with keeping our developers motivated.
 +
 +- There is a growing technical risk factor with the merges due to the codebase
 +  differing more and more.
 +
 +MERGE GUIDELINES
 +================
 +
 +The following gives developer guidelines on how to proceed when merging Libav commits.
 +
 +Before starting, you can reduce the risk of errors on merge conflicts by using
 +a different merge conflict style:
 +
 +    $ git config --global merge.conflictstyle diff3
 +
 +tools/libav-merge-next-commit is a script to help merge the next commit in
 +the queue. It assumes a remote named libav. It has two modes: merge and noop.
 +The noop mode creates a merge with no change to HEAD. You can pass a hash
 +as an extra argument to reference a justification (it is common that we already
 +have the change done in FFmpeg).
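
An illustrative invocation, based only on the description above (the exact
syntax may differ; check the script itself):

    $ tools/libav-merge-next-commit merge
    $ tools/libav-merge-next-commit noop <hash>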
 +
 +Also see tools/murge: you can copy and paste a 3-way conflict into its stdin
 +and it will display colored diffs. Any arguments to murge (such as ones to
 +suppress whitespace differences) are passed on to colordiff.
 +
 +TODO/FIXME/UNMERGED
 +===================
 +
 +Stuff that didn't reach the codebase:
 +-------------------------------------
 +
 +- HEVC DSP and x86 MC SIMD improvements from Libav (see https://ffmpeg.org/pipermail/ffmpeg-devel/2015-December/184777.html)
 +  - 1f821750f hevcdsp: split the qpel functions by width instead of by the subpixel fraction
 +  - 818bfe7f0 hevcdsp: split the epel functions by width
 +  - 688417399 hevcdsp: split the pred functions by width
 +  - a853388d2 hevc: change the stride of the MC buffer to be in bytes instead of elements
 +  - 0cef06df0 checkasm: add HEVC MC tests
 +  - e7078e842 hevcdsp: add x86 SIMD for MC
 +- VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
 +- Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
 +
 +Collateral damage that needs work locally:
 +------------------------------------------
 +
 +- Merge proresdec2.c and proresdec_lgpl.c
 +- Merge proresenc_anatoliy.c and proresenc_kostya.c
 +- Remove ADVANCED_PARSER in libavcodec/hevc_parser.c
 +- Fix MIPS AC3 downmix
 +
 +Extra changes needed to be aligned with Libav:
 +----------------------------------------------
 +
 +- Switching our examples to the new encode/decode API (see 67d28f4a0f)
 +- AC3 speed-up for our fixed version (see a9ba59591e)
++- HEVC IDCT 12-bit bit-depth support (Libav added 8 and 10 but doesn't have 12)
diff --cc libavcodec/x86/hevc_idct.asm
index 33b437c,f397cc1..1eb1973
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@@ -2,10 -2,11 +2,11 @@@
  ;* SIMD-optimized IDCT functions for HEVC decoding
  ;* Copyright (c) 2014 Pierre-Edouard LEPERE
  ;* Copyright (c) 2014 James Almer
+ ;* Copyright (c) 2016 Alexandra Hájková
  ;*
 -;* This file is part of Libav.
 +;* This file is part of FFmpeg.
  ;*
 -;* Libav is free software; you can redistribute it and/or
 +;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
@@@ -22,9 -23,220 +23,220 @@@
  
  %include "libavutil/x86/x86util.asm"
  
+ SECTION_RODATA
+ 
+ pd_64: times 4 dd 64
+ pd_2048: times 4 dd 2048
+ pd_512: times 4 dd 512
+ 
+ ; 4x4 transform coeffs
+ cextern pw_64
+ pw_64_m64: times 4 dw 64, -64
+ pw_83_36: times 4 dw 83, 36
+ pw_36_m83: times 4 dw 36, -83
+ 
+ ; 8x8 transform coeffs
+ pw_89_75: times 4 dw 89, 75
+ pw_50_18: times 4 dw 50, 18
+ 
+ pw_75_m18: times 4 dw 75, -18
+ pw_m89_m50: times 4 dw -89, -50
+ 
+ pw_50_m89: times 4 dw 50, -89
+ pw_18_75: times 4 dw 18, 75
+ 
+ pw_18_m50: times 4 dw 18, -50
+ pw_75_m89: times 4 dw 75, -89
+ 
+ ; 16x16 transformation coeffs
+ trans_coeffs16: times 4 dw 90, 87
+ times 4 dw 80, 70
+ times 4 dw 57, 43
+ times 4 dw 25, 9
+ 
+ times 4 dw 87, 57
+ times 4 dw 9, -43
+ times 4 dw -80, -90
+ times 4 dw -70, -25
+ 
+ times 4 dw 80, 9
+ times 4 dw -70, -87
+ times 4 dw -25, 57
+ times 4 dw 90, 43
+ 
+ times 4 dw 70, -43
+ times 4 dw -87, 9
+ times 4 dw 90, 25
+ times 4 dw -80, -57
+ 
+ times 4 dw 57, -80
+ times 4 dw -25, 90
+ times 4 dw -9, -87
+ times 4 dw 43, 70
+ 
+ times 4 dw 43, -90
+ times 4 dw 57, 25
+ times 4 dw -87, 70
+ times 4 dw 9, -80
+ 
+ times 4 dw 25, -70
+ times 4 dw 90, -80
+ times 4 dw 43, 9
+ times 4 dw -57, 87
+ 
+ times 4 dw 9, -25
+ times 4 dw 43, -57
+ times 4 dw 70, -80
+ times 4 dw 87, -90
+ 
+ ; 32x32 transform coeffs
+ trans_coeff32: times 8 dw 90
+ times 4 dw 88, 85
+ times 4 dw 82, 78
+ times 4 dw 73, 67
+ times 4 dw 61, 54
+ times 4 dw 46, 38
+ times 4 dw 31, 22
+ times 4 dw 13, 4
+ 
+ times 4 dw 90, 82
+ times 4 dw 67, 46
+ times 4 dw 22, -4
+ times 4 dw -31, -54
+ times 4 dw -73, -85
+ times 4 dw -90, -88
+ times 4 dw -78, -61
+ times 4 dw -38, -13
+ 
+ times 4 dw 88, 67
+ times 4 dw 31, -13
+ times 4 dw -54, -82
+ times 4 dw -90, -78
+ times 4 dw -46, -4
+ times 4 dw 38, 73
+ times 4 dw 90, 85
+ times 4 dw 61, 22
+ 
+ times 4 dw 85, 46
+ times 4 dw -13, -67
+ times 4 dw -90, -73
+ times 4 dw -22, 38
+ times 4 dw 82, 88
+ times 4 dw 54, -4
+ times 4 dw -61, -90
+ times 4 dw -78, -31
+ 
+ times 4 dw 82, 22
+ times 4 dw -54, -90
+ times 4 dw -61, 13
+ times 4 dw 78, 85
+ times 4 dw 31, -46
+ times 4 dw -90, -67
+ times 4 dw 4, 73
+ times 4 dw 88, 38
+ 
+ times 4 dw 78, -4
+ times 4 dw -82, -73
+ times 4 dw 13, 85
+ times 4 dw 67, -22
+ times 4 dw -88, -61
+ times 4 dw 31, 90
+ times 4 dw 54, -38
+ times 4 dw -90, -46
+ 
+ times 4 dw 73, -31
+ times 4 dw -90, -22
+ times 4 dw 78, 67
+ times 4 dw -38, -90
+ times 4 dw -13, 82
+ times 4 dw 61, -46
+ times 4 dw -88, -4
+ times 4 dw 85, 54
+ 
+ times 4 dw 67, -54
+ times 4 dw -78, 38
+ times 4 dw 85, -22
+ times 4 dw -90, 4
+ times 4 dw 90, 13
+ times 4 dw -88, -31
+ times 4 dw 82, 46
+ times 4 dw -73, -61
+ 
+ times 4 dw 61, -73
+ times 4 dw -46, 82
+ times 4 dw 31, -88
+ times 4 dw -13, 90
+ times 4 dw -4, -90
+ times 4 dw 22, 85
+ times 4 dw -38, -78
+ times 4 dw 54, 67
+ 
+ times 4 dw 54, -85
+ times 4 dw -4, 88
+ times 4 dw -46, -61
+ times 4 dw 82, 13
+ times 4 dw -90, 38
+ times 4 dw 67, -78
+ times 4 dw -22, 90
+ times 4 dw -31, -73
+ 
+ times 4 dw 46, -90
+ times 4 dw 38, 54
+ times 4 dw -90, 31
+ times 4 dw 61, -88
+ times 4 dw 22, 67
+ times 4 dw -85, 13
+ times 4 dw 73, -82
+ times 4 dw 4, 78
+ 
+ times 4 dw 38, -88
+ times 4 dw 73, -4
+ times 4 dw -67, 90
+ times 4 dw -46, -31
+ times 4 dw 85, -78
+ times 4 dw 13, 61
+ times 4 dw -90, 54
+ times 4 dw 22, -82
+ 
+ times 4 dw 31, -78
+ times 4 dw 90, -61
+ times 4 dw 4, 54
+ times 4 dw -88, 82
+ times 4 dw -38, -22
+ times 4 dw 73, -90
+ times 4 dw 67, -13
+ times 4 dw -46, 85
+ 
+ times 4 dw 22, -61
+ times 4 dw 85, -90
+ times 4 dw 73, -38
+ times 4 dw -4, 46
+ times 4 dw -78, 90
+ times 4 dw -82, 54
+ times 4 dw -13, -31
+ times 4 dw 67, -88
+ 
+ times 4 dw 13, -38
+ times 4 dw 61, -78
+ times 4 dw 88, -90
+ times 4 dw 85, -73
+ times 4 dw 54, -31
+ times 4 dw 4, 22
+ times 4 dw -46, 67
+ times 4 dw -82, 90
+ 
+ times 4 dw 4, -13
+ times 4 dw 22, -31
+ times 4 dw 38, -46
+ times 4 dw 54, -61
+ times 4 dw 67, -73
+ times 4 dw 78, -82
+ times 4 dw 85, -88
+ times 4 dw 90, -90
+ 
 -section .text
 +SECTION .text
  
 -; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
 +; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
  ; %1 = HxW
  ; %2 = number of loops
  ; %3 = bitdepth
@@@ -74,50 -286,565 +286,568 @@@ cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1
      RET
  %endmacro
  
- ; 8-bit
- INIT_MMX mmxext
- IDCT_DC_NL  4,      8
- IDCT_DC     8,  2,  8
+ ; IDCT 4x4, expects input in m0, m1
+ ; %1 - shift
+ ; %2 - 1/0 - SCALE and Transpose or not
+ ; %3 - 1/0 add constant or not
+ %macro TR_4x4 3
+     ; interleaves src0 with src2 to m0
+     ;         and src1 with src3 to m1
+     ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
+     ; src1: 10 11 12 13 -->
+     ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
+     ; src3: 30 31 32 33
  
- INIT_XMM sse2
- IDCT_DC_NL  8,      8
- IDCT_DC    16,  4,  8
- IDCT_DC    32, 16,  8
+     SBUTTERFLY wd, 0, 1, 2
  
- %if HAVE_AVX2_EXTERNAL
- INIT_YMM avx2
- IDCT_DC    16,  2,  8
- IDCT_DC    32,  8,  8
- %endif ;HAVE_AVX2_EXTERNAL
+     pmaddwd m2, m0, [pw_64]    ; e0
+     pmaddwd m3, m1, [pw_83_36] ; o0
+     pmaddwd m0, [pw_64_m64]    ; e1
+     pmaddwd m1, [pw_36_m83]    ; o1
  
- ; 10-bit
- INIT_MMX mmxext
- IDCT_DC_NL  4,     10
- IDCT_DC     8,  2, 10
+ %if %3 == 1
+     %assign %%add 1 << (%1 - 1)
+     mova  m4, [pd_ %+ %%add]
+     paddd m2, m4
+     paddd m0, m4
+ %endif
  
- INIT_XMM sse2
- IDCT_DC_NL  8,     10
- IDCT_DC    16,  4, 10
- IDCT_DC    32, 16, 10
+     SUMSUB_BADC d, 3, 2, 1, 0, 4
  
- %if HAVE_AVX2_EXTERNAL
- INIT_YMM avx2
- IDCT_DC    16,  2, 10
- IDCT_DC    32,  8, 10
- %endif ;HAVE_AVX2_EXTERNAL
+ %if %2 == 1
+     psrad m3, %1 ; e0 + o0
+     psrad m1, %1 ; e1 + o1
+     psrad m2, %1 ; e0 - o0
+     psrad m0, %1 ; e1 - o1
+     ;clip16
+     packssdw m3, m1
+     packssdw m0, m2
+     ; Transpose
+     SBUTTERFLY wd, 3, 0, 1
+     SBUTTERFLY wd, 3, 0, 1
+     SWAP 3, 1, 0
+ %else
+     SWAP 3, 2, 0
+ %endif
+ %endmacro
+ 
+ %macro DEFINE_BIAS 1
+     %assign shift (20 - %1)
+     %assign c_add (1 << (shift - 1))
+     %define arr_add pd_ %+ c_add
+ %endmacro
+ 
+ ; %1 - bit_depth
+ ; %2 - register the add constant is loaded to
+ ; shift = 20 - bit_depth
+ %macro LOAD_BIAS 2
+     DEFINE_BIAS %1
+     mova %2, [arr_add]
+ %endmacro
+ 
+ ; %1, %2 - registers to load packed 16 bit values to
+ ; %3, %4, %5, %6 - vertical offsets
+ ; %7 - horizontal offset
+ %macro LOAD_BLOCK 7
+     movq   %1, [r0 + %3 + %7]
+     movhps %1, [r0 + %5 + %7]
+     movq   %2, [r0 + %4 + %7]
+     movhps %2, [r0 + %6 + %7]
+ %endmacro
+ 
+ ; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+ ; %1 = bitdepth
+ %macro IDCT_4x4 1
+ cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
+     mova m0, [coeffsq]
+     mova m1, [coeffsq + 16]
+ 
+     TR_4x4 7, 1, 1
+     TR_4x4 20 - %1, 1, 1
+ 
+     mova [coeffsq],      m0
+     mova [coeffsq + 16], m1
+     RET
+ %endmacro
+ 
+ ; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
+ ; 4 at one time (4 columns)                        1 e8[1] + o8[1]
+ ; from %5: e8/16 + o8/16, with %1 offset                  ...
+ ; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
+ ; %4 - shift                                       7 e8[0] - o8[0] --> + %2
+ %macro STORE_8 7
+     psrad    %5, %4
+     psrad    %3, %4
+     packssdw %5, %3
+     movq     [coeffsq + %1], %5
+     movhps   [coeffsq + %2], %5
+ %endmacro
+ 
+ ; %1 - horizontal offset
+ ; %2 - shift
+ ; %3, %4 - transform coeffs
+ ; %5 - vertical offset for e8 + o8
+ ; %6 - vertical offset for e8 - o8
+ ; %7 - register with e8 inside
+ ; %8 - block_size
+ ; %9 - register to store e8 +o8
+ ; %10 - register to store e8 - o8
+ %macro E8_O8 10
+     pmaddwd m6, m4, %3
+     pmaddwd m7, m5, %4
+ 
+     paddd m6, m7
+     paddd m7, m6, %7 ; o8 + e8
+     psubd %7, m6     ; e8 - o8
+ %if %8 == 8
+     STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
+ %else
+     SWAP m7, %9
+     SWAP %7, %10
+ %endif
+ %endmacro
+ 
+ ; 8x4 residuals are processed and stored
+ ; %1 - horizontal offset
+ ; %2 - shift
+ ; %3 - offset of the even row
+ ; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
+ ; %5 - offset of the odd row
+ ; %6 - block size
+ ; %7 - 1/0 add a constant in TR_4x4 or not
+ ; a constant is added for the 8x8 transform but not for 16x16 and 32x32
+ %macro TR_8x4 7
+     ; load 4 columns of even rows
+     LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
+ 
+     TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
+ 
+     ; load 4 columns of odd rows
+     LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
+ 
+     ; 00 01 02 03
+     ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
+ 
+     ; ...        -- >
+     ;                  m5: 50 70 51 71 52 72 53 73
+     ; 70 71 72 73
+     SBUTTERFLY wd, 4, 5, 6
+ 
+     E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8, m15
+     E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9, m14
+     E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
+     E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
+ %endmacro
+ 
+ %macro STORE_PACKED 7
+     movq   [r0 + %3 + %7], %1
+     movhps [r0 + %4 + %7], %1
+     movq   [r0 + %5 + %7], %2
+     movhps [r0 + %6 + %7], %2
+ %endmacro
+ 
+ ; transpose 4x4 block packed
+ ; in %1 and %2 registers
+ ; %3 - temporary register
+ %macro TRANSPOSE_4x4 3
+     SBUTTERFLY wd, %1, %2, %3
+     SBUTTERFLY dq, %1, %2, %3
+ %endmacro
+ 
+ ; %1 - horizontal offset of the block i
+ ; %2 - vertical offset of the block i
+ ; %3 - width in bytes
+ ; %4 - vertical offset for the block j
+ ; %5 - horizontal offset for the block j
+ %macro SWAP_BLOCKS 5
+     ; M_j
+     LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+     TRANSPOSE_4x4 4, 5, 6
+ 
+     ; M_i
+     LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+ 
+     STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+ 
+     ; transpose and store M_i
+     SWAP m6, m4
+     SWAP m7, m5
+     TRANSPOSE_4x4 4, 5, 6
+     STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
+ %endmacro
+ 
+ ; %1 - horizontal offset
+ ; %2 - vertical offset of the block
+ ; %3 - width in bytes
+ %macro TRANSPOSE_BLOCK 3
+     LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+     TRANSPOSE_4x4 4, 5, 6
+     STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
+ %endmacro
+ 
+ %macro TRANSPOSE_8x8 0
+ cglobal hevc_idct_transpose_8x8, 0, 0, 0
+     ; M1 M2 ^T = M1^t M3^t
+     ; M3 M4      M2^t M4^t
+ 
+     ; M1 4x4 block
+     TRANSPOSE_BLOCK 0, 0, 16
+ 
+     ; M2 and M3
+     SWAP_BLOCKS 0, 64, 16, 0, 8
+ 
+     ; M4
+     TRANSPOSE_BLOCK 8, 64, 16
+ 
+     ret
+ %endmacro
+ 
+ ; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+ ; %1 = bitdepth
+ %macro IDCT_8x8 1
+ cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
+     TR_8x4 0, 7, 32, 1, 16, 8, 1
+     TR_8x4 8, 7, 32, 1, 16, 8, 1
+ 
+     call hevc_idct_transpose_8x8_ %+ cpuname
+ 
+     DEFINE_BIAS %1
+     TR_8x4 0, shift, 32, 1, 16, 8, 1
+     TR_8x4 8, shift, 32, 1, 16, 8, 1
+ 
+     TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
+ %endmacro
+ 
+ ; store intermediate e32 coeffs on stack
+ ; as 16x4 matrix
+ ; from %5: e16 + o16, with %6 offset
+ ; and  %3: e16 - o16, with %7 offset
+ ; %4 - shift, unused here
+ %macro STORE_16 7
+     mova [rsp + %6], %5
+     mova [rsp + %7], %3
+ %endmacro
+ 
+ ; %1, %2 - transform constants
+ ; %3, %4 - regs with interleaved coeffs
+ ; %5 - 1/0 SWAP or add
+ ; %6, %7 - registers for intermediate sums
+ ; %8 - accumulator register
+ %macro ADD_ROWS 8
+     pmaddwd %6, %3, %1
+     pmaddwd %7, %4, %2
+     paddd   %6, %7
+ %if %5 == 1
+     SWAP %6, %8
+ %else
+     paddd %8, %6
+ %endif
+ %endmacro
+ 
+ ; %1 - transform coeffs
+ ; %2, %3 offsets for storing e+o/e-o back to coeffsq
+ ; %4 - shift
+ ; %5 - add
+ ; %6 - block_size
+ ; %7 - register with e16
+ ; %8, %9 - stack offsets for storing e+o/e-o
+ %macro E16_O16 9
+     ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
+     ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
+ 
+ %if %6 == 8
+     paddd %7, %5
+ %endif
+ 
+     paddd m4, m7, %7 ; o16 + e16
+     psubd %7, m7     ; e16 - o16
+     STORE_%6 %2, %3, %7, %4, m4, %8, %9
+ %endmacro
+ 
+ %macro TR_16x4 10
+     ; produce 8x4 matrix of e16 coeffs
+     ; for 4 first rows and store it on stack (128 bytes)
+     TR_8x4 %1, 7, %4, %5, %6, %8, 0
+ 
+     ; load 8 even rows
+     LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
+     LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
+ 
+     SBUTTERFLY wd, 0, 1, 4
+     SBUTTERFLY wd, 2, 3, 4
+ 
+     E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
+     mova m8, %3
+     E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
+     E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
+     E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
+     E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
+     E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
+     E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
+     E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
+ %endmacro
+ 
+ %macro TRANSPOSE_16x16 0
+ cglobal hevc_idct_transpose_16x16, 0, 0, 0
+ ; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
+ ; M5  M6  M7  M8    -->  m2 m6 m10 m14
+ ; M9  M10 M11 M12        m3 m7 m11 m15
+ ; M13 M14 M15 M16        m4 m8 m12 m16
+ 
+     ; M1 4x4 block
+     TRANSPOSE_BLOCK 0, 0, 32
+ 
+     ; M5, M2
+     SWAP_BLOCKS 0, 128, 32, 0, 8
+     ; M9, M3
+     SWAP_BLOCKS 0, 256, 32, 0, 16
+     ; M13, M4
+     SWAP_BLOCKS 0, 384, 32, 0, 24
+ 
+     ;M6
+     TRANSPOSE_BLOCK 8, 128, 32
+ 
+     ; M10, M7
+     SWAP_BLOCKS 8, 256, 32, 128, 16
+     ; M14, M8
+     SWAP_BLOCKS 8, 384, 32, 128, 24
+ 
+     ;M11
+     TRANSPOSE_BLOCK 16, 256, 32
+ 
+     ; M15, M12
+     SWAP_BLOCKS 16, 384, 32, 256, 24
+ 
+     ;M16
+     TRANSPOSE_BLOCK 24, 384, 32
+ 
+     ret
+ %endmacro
+ 
+ ; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+ ; %1 = bitdepth
+ %macro IDCT_16x16 1
+ cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
+     mov r1d, 3
+ .loop16:
+     TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
+     dec r1d
+     jge .loop16
  
- ; 12-bit
+     call hevc_idct_transpose_16x16_ %+ cpuname
+ 
+     DEFINE_BIAS %1
+     mov r1d, 3
+ .loop16_2:
+     TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
+     dec r1d
+     jge .loop16_2
+ 
+     TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
+ %endmacro
+ 
+ ; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
+ ; 4 at one time (4 columns)                        1 e32[1] + o32[1]
+ ; %1 - address to store e32 + o32
+ ; %2 - address to store e32 - o32
+ ; %5 - reg with e32 + o32                                  ...
+ ; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
+ ; %4 - shift                                       31 e32[0] - o32[0] --> %2
+ %macro STORE_32 5
+     psrad    %5, %4
+     psrad    %3, %4
+     packssdw %5, %3
+     movq     [%1], %5
+     movhps   [%2], %5
+ %endmacro
+ 
+ ; %1 - transform coeffs
+ ; %2, %3 - offsets for storing e+o/e-o back to coeffsq
+ ; %4 - shift
+ ; %5 - stack offset of e32
+ %macro E32_O32 5
+     ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10
+     ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
+     ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
+     ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
+ 
+     paddd m11, m14, [rsp + %5]
+     paddd m12, m10, m11 ; o32 + e32
+     psubd m11, m10      ; e32 - o32
+     STORE_32 %2, %3, m11, %4, m12
+ %endmacro
+ 
+ ; %1 - horizontal offset
+ ; %2 - bitdepth
+ ; %3 - 1/0: first pass (shift 7, bias 64) or second pass (bias from bit depth)
+ %macro TR_32x4 3
+     TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
+ 
+     LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
+     LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
+     LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
+     LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
+ 
+     SBUTTERFLY wd, 0, 1, 8
+     SBUTTERFLY wd, 2, 3, 8
+     SBUTTERFLY wd, 4, 5, 8
+     SBUTTERFLY wd, 6, 7, 8
+ 
+ %if %3 == 1
+     %assign shift 7
+     mova m14, [pd_64]
+ %else
+     LOAD_BIAS %2, m14
+ %endif
+ 
+     lea r2, [trans_coeff32 + 15 * 128]
+     lea r3, [coeffsq + %1]
+     lea r4, [r3 + 16 * 64]
+     mov r5d, 15 * 16
+ %%loop:
+     E32_O32 r2, r3 + r5 * 4, r4, shift, r5
+     sub r2, 128
+     add r4, 64
+     sub r5d, 16
+     jge %%loop
+ %endmacro
+ 
+ %macro TRANSPOSE_32x32 0
+ cglobal hevc_idct_transpose_32x32, 0, 0, 0
+     ; M0  M1 ... M7
+     ; M8         M15
+     ;
+     ; ...
+     ;
+     ; M56        M63
+ 
+     TRANSPOSE_BLOCK 0, 0, 64 ; M0
+     mov r1d, 7
+     mov r2d, 7 * 256
+ .loop_transpose:
+     SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
+     sub r2d, 256
+     dec r1d
+     jg .loop_transpose
+ 
+     TRANSPOSE_BLOCK 8, 256, 64 ; M9
+     mov r1d, 6
+     mov r2d, 512
+     mov r3d, 16
+ .loop_transpose2:
+     SWAP_BLOCKS 8, r2, 64, 256, r3
+     add r3d, 8
+     add r2d, 256
+     dec r1d
+     jg .loop_transpose2
+ 
+     TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
+     mov r1d, 5
+     mov r2d, 768
+     mov r3d, 24
+ .loop_transpose3:
+     SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
+     add r3d, 8
+     add r2d, 256
+     dec r1d
+     jg .loop_transpose3
+ 
+     TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
+     mov r1d, 4
+     mov r2d, 1024
+     mov r3d, 32
+ .loop_transpose4:
+     SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
+     add r3d, 8
+     add r2d, 256
+     dec r1d
+     jg .loop_transpose4
+ 
+     TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
+     mov r1d, 3
+     mov r2d, 1280
+     mov r3d, 40
+ .loop_transpose5:
+     SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
+     add r3d, 8
+     add r2d, 256
+     dec r1d
+     jg .loop_transpose5
+ 
+     TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
+     SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
+     SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
+ 
+     TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
+     SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
+ 
+     TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
+ 
+     ret
+ %endmacro
+ 
+ ; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
+ ; %1 = bitdepth
+ %macro IDCT_32x32 1
+ cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
+     mov r1d, 7
+ .loop32:
+     TR_32x4 8 * r1, %1, 1
+     dec r1d
+     jge .loop32
+ 
+     call hevc_idct_transpose_32x32_ %+ cpuname
+ 
+     mov r1d, 7
+ .loop32_2:
+     TR_32x4 8 * r1, %1, 0
+     dec r1d
+     jge .loop32_2
+ 
+     TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
+ %endmacro
+ 
+ %macro INIT_IDCT_DC 1
  INIT_MMX mmxext
- IDCT_DC_NL  4,     12
- IDCT_DC     8,  2, 12
+ IDCT_DC_NL  4,      %1
+ IDCT_DC     8,  2,  %1
  
  INIT_XMM sse2
- IDCT_DC_NL  8,     12
- IDCT_DC    16,  4, 12
- IDCT_DC    32, 16, 12
+ IDCT_DC_NL  8,      %1
+ IDCT_DC    16,  4,  %1
+ IDCT_DC    32, 16,  %1
  
  %if HAVE_AVX2_EXTERNAL
- INIT_YMM avx2
- IDCT_DC    16,  2, 12
- IDCT_DC    32,  8, 12
+     INIT_YMM avx2
+     IDCT_DC    16,  2,  %1
+     IDCT_DC    32,  8,  %1
  %endif ;HAVE_AVX2_EXTERNAL
+ %endmacro
+ 
+ %macro INIT_IDCT 2
+ INIT_XMM %2
+ %if %1 == 8
+     TRANSPOSE_8x8
+     %if ARCH_X86_64
+         TRANSPOSE_16x16
+         TRANSPOSE_32x32
+     %endif
+ %endif
+ %if ARCH_X86_64
+     IDCT_32x32 %1
+     IDCT_16x16 %1
+ %endif
+ IDCT_8x8 %1
+ IDCT_4x4 %1
+ %endmacro
+ 
+ INIT_IDCT_DC 8
+ INIT_IDCT_DC 10
++INIT_IDCT_DC 12
+ INIT_IDCT 8, sse2
+ INIT_IDCT 8, avx
+ INIT_IDCT 10, sse2
+ INIT_IDCT 10, avx
++;INIT_IDCT 12, sse2
++;INIT_IDCT 12, avx
diff --cc libavcodec/x86/hevcdsp_init.c
index c4d9564,0a06347..0b17671
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -34,663 -32,222 +34,676 @@@ void ff_hevc_ ## DIR ## _loop_filter_ch
  #define LFL_FUNC(DIR, DEPTH, OPT) \
  void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
  
 -#define LFC_FUNCS(type, depth) \
 -    LFC_FUNC(h, depth, sse2)   \
 -    LFC_FUNC(v, depth, sse2)
 -
 -#define LFL_FUNCS(type, depth) \
 -    LFL_FUNC(h, depth, ssse3)  \
 -    LFL_FUNC(v, depth, ssse3)
 -
 -LFC_FUNCS(uint8_t, 8)
 -LFC_FUNCS(uint8_t, 10)
 -LFL_FUNCS(uint8_t, 8)
 -LFL_FUNCS(uint8_t, 10)
 -
 -#define idct_dc_proto(size, bitd, opt) \
 -                void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 -
 -idct_dc_proto(4, 8,mmxext);
 -idct_dc_proto(8, 8,mmxext);
 -idct_dc_proto(16,8,  sse2);
 -idct_dc_proto(32,8,  sse2);
 -
 -idct_dc_proto(32,8,  avx2);
 +#define LFC_FUNCS(type, depth, opt) \
 +    LFC_FUNC(h, depth, opt)  \
 +    LFC_FUNC(v, depth, opt)
  
 -idct_dc_proto(4, 10,mmxext);
 -idct_dc_proto(8, 10,  sse2);
 -idct_dc_proto(16,10,  sse2);
 -idct_dc_proto(32,10,  sse2);
 -idct_dc_proto(8, 10,   avx);
 -idct_dc_proto(16,10,   avx);
 -idct_dc_proto(32,10,   avx);
 +#define LFL_FUNCS(type, depth, opt) \
 +    LFL_FUNC(h, depth, opt)  \
 +    LFL_FUNC(v, depth, opt)
  
 -idct_dc_proto(16,10,  avx2);
 -idct_dc_proto(32,10,  avx2);
 +LFC_FUNCS(uint8_t,   8, sse2)
 +LFC_FUNCS(uint8_t,  10, sse2)
 +LFC_FUNCS(uint8_t,  12, sse2)
 +LFC_FUNCS(uint8_t,   8, avx)
 +LFC_FUNCS(uint8_t,  10, avx)
 +LFC_FUNCS(uint8_t,  12, avx)
 +LFL_FUNCS(uint8_t,   8, sse2)
 +LFL_FUNCS(uint8_t,  10, sse2)
 +LFL_FUNCS(uint8_t,  12, sse2)
 +LFL_FUNCS(uint8_t,   8, ssse3)
 +LFL_FUNCS(uint8_t,  10, ssse3)
 +LFL_FUNCS(uint8_t,  12, ssse3)
 +LFL_FUNCS(uint8_t,   8, avx)
 +LFL_FUNCS(uint8_t,  10, avx)
 +LFL_FUNCS(uint8_t,  12, avx)
  
- #define IDCT_FUNCS(W, opt) \
+ #define IDCT_DC_FUNCS(W, opt) \
  void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
 -void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
 +void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
 +void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
  
- IDCT_FUNCS(4x4,   mmxext);
- IDCT_FUNCS(8x8,   mmxext);
- IDCT_FUNCS(8x8,   sse2);
- IDCT_FUNCS(16x16, sse2);
- IDCT_FUNCS(32x32, sse2);
- IDCT_FUNCS(16x16, avx2);
- IDCT_FUNCS(32x32, avx2);
+ IDCT_DC_FUNCS(4x4,   mmxext);
+ IDCT_DC_FUNCS(8x8,   mmxext);
+ IDCT_DC_FUNCS(8x8,   sse2);
+ IDCT_DC_FUNCS(16x16, sse2);
+ IDCT_DC_FUNCS(32x32, sse2);
+ IDCT_DC_FUNCS(16x16, avx2);
+ IDCT_DC_FUNCS(32x32, avx2);
+ 
+ #define IDCT_FUNCS(opt)                                             \
+ void ff_hevc_idct_4x4_8_    ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_4x4_10_   ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_8x8_8_    ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_8x8_10_   ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_16x16_8_  ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_32x32_8_  ## opt(int16_t *coeffs, int col_limit); \
+ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
+ 
+ IDCT_FUNCS(sse2)
+ IDCT_FUNCS(avx)
  
 -#define GET_PIXELS(width, depth, cf)                                                                      \
 -void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride,             \
 -                                                           uint8_t *src, ptrdiff_t srcstride,             \
 -                                                           int height, int mx, int my, int16_t *mcbuffer);
 -
 -GET_PIXELS(4,  8, sse2)
 -GET_PIXELS(8,  8, sse2)
 -GET_PIXELS(12, 8, sse2)
 -GET_PIXELS(16, 8, sse2)
 -GET_PIXELS(24, 8, sse2)
 -GET_PIXELS(32, 8, sse2)
 -GET_PIXELS(48, 8, sse2)
 -GET_PIXELS(64, 8, sse2)
 -
 -GET_PIXELS(4,  10, sse2)
 -GET_PIXELS(8,  10, sse2)
 -GET_PIXELS(12, 10, sse2)
 -GET_PIXELS(16, 10, sse2)
 -GET_PIXELS(24, 10, sse2)
 -GET_PIXELS(32, 10, sse2)
 -GET_PIXELS(48, 10, sse2)
 -GET_PIXELS(64, 10, sse2)
 -
 -/* those are independent of the bit depth, so declared separately */
 -#define INTERP_HV_FUNC(width, cf)                                                         \
 -void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
 -                                          int16_t *src, ptrdiff_t srcstride,              \
 -                                          int height, int mx, int my, int16_t *mcbuffer); \
 -void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
 -                                          int16_t *src, ptrdiff_t srcstride,              \
 -                                          int height, int mx, int my, int16_t *mcbuffer);
 -
 -INTERP_HV_FUNC(4,  avx)
 -INTERP_HV_FUNC(8,  avx)
 -INTERP_HV_FUNC(12, avx)
 -INTERP_HV_FUNC(16, avx)
 -INTERP_HV_FUNC(24, avx)
 -INTERP_HV_FUNC(32, avx)
 -INTERP_HV_FUNC(48, avx)
 -INTERP_HV_FUNC(64, avx)
 -
 -#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
 -#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
 -static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
 -                                                               uint8_t *src, ptrdiff_t srcstride,             \
 -                                                               int height, int mx, int my, int16_t *mcbuffer) \
 +#define mc_rep_func(name, bitd, step, W, opt) \
 +void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
 +                                                uint8_t *_src, ptrdiff_t _srcstride, int height,                \
 +                                                intptr_t mx, intptr_t my, int width)                            \
 +{                                                                                                               \
 +    int i;                                                                                                      \
 +    uint8_t *src;                                                                                               \
 +    int16_t *dst;                                                                                               \
 +    for (i = 0; i < W; i += step) {                                                                             \
 +        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
 +        dst = _dst + i;                                                                                         \
 +        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);            \
 +    }                                                                                                           \
 +}
 +#define mc_rep_uni_func(name, bitd, step, W, opt) \
 +void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,                        \
 +                                                    uint8_t *_src, ptrdiff_t _srcstride, int height,            \
 +                                                    intptr_t mx, intptr_t my, int width)                        \
 +{                                                                                                               \
 +    int i;                                                                                                      \
 +    uint8_t *src;                                                                                               \
 +    uint8_t *dst;                                                                                               \
 +    for (i = 0; i < W; i += step) {                                                                             \
 +        src = _src + (i * ((bitd + 7) / 8));                                                                    \
 +        dst = _dst + (i * ((bitd + 7) / 8));                                                                    \
 +        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                     \
 +                                                          height, mx, my, width);                               \
 +    }                                                                                                           \
 +}
 +#define mc_rep_bi_func(name, bitd, step, W, opt) \
 +void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src,          \
 +                                                   ptrdiff_t _srcstride, int16_t* _src2,                        \
 +                                                   int height, intptr_t mx, intptr_t my, int width)             \
 +{                                                                                                               \
 +    int i;                                                                                                      \
 +    uint8_t  *src;                                                                                              \
 +    uint8_t  *dst;                                                                                              \
 +    int16_t  *src2;                                                                                             \
 +    for (i = 0; i < W ; i += step) {                                                                            \
 +        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
 +        dst  = _dst + (i * ((bitd + 7) / 8));                                                                   \
 +        src2 = _src2 + i;                                                                                       \
 +        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                \
 +                                                          height, mx, my, width);                               \
 +    }                                                                                                           \
 +}
 +
 +#define mc_rep_funcs(name, bitd, step, W, opt)        \
 +    mc_rep_func(name, bitd, step, W, opt)            \
 +    mc_rep_uni_func(name, bitd, step, W, opt)        \
 +    mc_rep_bi_func(name, bitd, step, W, opt)
 +
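For illustration, these generators simply split a wide block into narrower columns served by the existing SIMD kernels. The instantiation mc_rep_funcs(pel_pixels, 8, 16, 64, sse4) further below expands, through mc_rep_func, into roughly the wrapper sketched here; the (bitd + 7) / 8 factor turns the pixel offset into a byte offset for the 8-bit source, and the 16-wide function it forwards to is one of the existing SSE4 entry points:

    /* Illustrative expansion of mc_rep_func(pel_pixels, 8, 16, 64, sse4):
     * a 64-wide call is served by four calls to the 16-wide SSE4 kernel,
     * stepping 16 coefficients through dst and 16 bytes through src. */
    void ff_hevc_put_hevc_pel_pixels64_8_sse4(int16_t *_dst,
                                              uint8_t *_src, ptrdiff_t _srcstride, int height,
                                              intptr_t mx, intptr_t my, int width)
    {
        int i;
        uint8_t *src;
        int16_t *dst;
        for (i = 0; i < 64; i += 16) {
            src = _src + (i * ((8 + 7) / 8));   /* 1 byte per pixel at 8-bit depth */
            dst = _dst + i;
            ff_hevc_put_hevc_pel_pixels16_8_sse4(dst, src, _srcstride, height, mx, my, width);
        }
    }
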
 +#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
 +void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
 +                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
 +                                                 intptr_t mx, intptr_t my, int width)                           \
 +{                                                                                                               \
 +    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
 +    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
 +                                                    _srcstride, height, mx, my, width);                         \
 +}
 +#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
 +void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
 +                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
 +                                                     intptr_t mx, intptr_t my, int width)                       \
 +{                                                                                                               \
 +    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
 +    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
 +                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
 +                                                        height, mx, my, width);                                 \
 +}
 +#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
 +void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
 +                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
 +                                                    int height, intptr_t mx, intptr_t my, int width)            \
 +{                                                                                                               \
 +    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
 +    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
 +                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
 +                                                       src2 + step1, height, mx, my, width);                    \
 +}
 +
 +#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
 +    mc_rep_func2(name, bitd, step1, step2, W, opt)      \
 +    mc_rep_uni_func2(name, bitd, step1, step2, W, opt)  \
 +    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
 +
 +#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 +
 +#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                       \
 +void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,            \
 +                                                 int height, intptr_t mx, intptr_t my, int width)             \
 +                                                                                                              \
 +{                                                                                                             \
 +    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);                 \
 +    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
 +}
 +
 +#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                    \
 +void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,          \
 +                                                    ptrdiff_t _srcstride, int16_t *src2,                      \
 +                                                    int height, intptr_t mx, intptr_t my, int width)          \
 +{                                                                                                             \
 +    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,                     \
 +                                                   height, mx, my, width);                                    \
 +    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
 +                                                   height, mx, my, width);                                    \
 +}
 +
 +#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                   \
 +void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,                       \
 +                                                     uint8_t *src, ptrdiff_t _srcstride, int height,          \
 +                                                     intptr_t mx, intptr_t my, int width)                     \
 +{                                                                                                             \
 +    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,                          \
 +                                                      height, mx, my, width);                                 \
 +    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride,            \
 +                                                      height, mx, my, width);                                 \
 +}
 +
 +#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)   \
 +mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)            \
 +mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)         \
 +mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
 +
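Similarly, the instantiation mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32) below builds a 24-wide 10-bit function by pairing a 16-wide AVX2 call with an 8-wide SSE4 call: width2 advances the int16_t destination in coefficients, while width4 advances the byte source, which is why it is 32 (16 pixels times 2 bytes at 10 bits). A sketch of that expansion:

    /* Illustrative expansion of mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32):
     * the left 16 columns go to the AVX2 kernel, the remaining 8 to SSE4. */
    void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,
                                               int height, intptr_t mx, intptr_t my, int width)
    {
        ff_hevc_put_hevc_pel_pixels16_10_avx2(dst, src, _srcstride, height, mx, my, width);
        ff_hevc_put_hevc_pel_pixels8_10_sse4(dst + 16, src + 32 /* 32 bytes = 16 10-bit pixels */,
                                             _srcstride, height, mx, my, width);
    }
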
 +#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                                \
 +void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,             \
 +                                                int height, intptr_t mx, intptr_t my, int width)              \
 +                                                                                                              \
  {                                                                                                             \
 -    const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                           \
 -    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
 -                                                        height + 7, mx, my, mcbuffer);                        \
 -    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,                \
 -                                            height, mx, my, mcbuffer);                                        \
 +    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);                  \
 +    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width);  \
  }
 -#else
 -#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
 -#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
 -
 -#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
 -void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
 -                                                         uint8_t *src, ptrdiff_t srcstride,                   \
 -                                                         int height, int mx, int my, int16_t *mcbuffer);      \
 -void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
 -                                                         uint8_t *src, ptrdiff_t srcstride,                   \
 -                                                         int height, int mx, int my, int16_t *mcbuffer);      \
 -QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
 -
 -QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
 -QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
 -QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
 -QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
 -QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
 -QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
 -QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
 -QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
 -
 -QPEL_FUNCS(4,  10, avx, avx, avx)
 -QPEL_FUNCS(8,  10, avx, avx, avx)
 -QPEL_FUNCS(12, 10, avx, avx, avx)
 -QPEL_FUNCS(16, 10, avx, avx, avx)
 -QPEL_FUNCS(24, 10, avx, avx, avx)
 -QPEL_FUNCS(32, 10, avx, avx, avx)
 -QPEL_FUNCS(48, 10, avx, avx, avx)
 -QPEL_FUNCS(64, 10, avx, avx, avx)
 -
 -#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
 -#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
 -static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
 -                                                               uint8_t *src, ptrdiff_t srcstride,             \
 -                                                               int height, int mx, int my, int16_t *mcbuffer) \
 +
 +#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                             \
 +void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,           \
 +                                                   ptrdiff_t _srcstride, int16_t* src2,                       \
 +                                                   int height, intptr_t mx, intptr_t my, int width)           \
  {                                                                                                             \
 -    const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                           \
 -    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride,     \
 -                                                        height + 3, mx, my, mcbuffer);                        \
 -    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,                    \
 -                                            height, mx, my, mcbuffer);                                        \
 +    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                            \
 +                                                  src2, height, mx, my, width);                               \
 +    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,              \
 +                                                  src2+width2, height, mx, my, width);                        \
 +}
 +
 +#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                            \
 +void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,                        \
 +                                                    uint8_t *src, ptrdiff_t _srcstride, int height,           \
 +                                                    intptr_t mx, intptr_t my, int width)                      \
 +{                                                                                                             \
 +    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                           \
 +                                                   height, mx, my, width);                                    \
 +    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,             \
 +                                                   height, mx, my, width);                                    \
 +}
 +
 +#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)   \
 +mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)            \
 +mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)         \
 +mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
 +
 +#if HAVE_AVX2_EXTERNAL
 +
 +mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
 +mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4)
 +mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4)
 +mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4)
 +
 +mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
 +mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
 +mc_rep_mixs_10(epel_hv,   24, 16, 8, avx2, sse4, 32)
 +mc_rep_mixs_10(epel_h ,   24, 16, 8, avx2, sse4, 32)
 +mc_rep_mixs_10(epel_v ,   24, 16, 8, avx2, sse4, 32)
 +
 +
 +mc_rep_mixs_10(qpel_h ,   24, 16, 8, avx2, sse4, 32)
 +mc_rep_mixs_10(qpel_v ,   24, 16, 8, avx2, sse4, 32)
 +mc_rep_mixs_10(qpel_hv,   24, 16, 8, avx2, sse4, 32)
 +
 +
 +mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
 +mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
 +
 +mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
 +
 +mc_rep_func(pel_pixels, 10, 16, 32, avx2)
 +mc_rep_func(pel_pixels, 10, 16, 48, avx2)
 +mc_rep_func(pel_pixels, 10, 32, 64, avx2)
 +
 +mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
 +mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
 +mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
 +
 +mc_rep_funcs(epel_h, 8, 32, 64, avx2)
 +
 +mc_rep_funcs(epel_v, 8, 32, 64, avx2)
 +
 +mc_rep_funcs(epel_h, 10, 16, 32, avx2)
 +mc_rep_funcs(epel_h, 10, 16, 48, avx2)
 +mc_rep_funcs(epel_h, 10, 32, 64, avx2)
 +
 +mc_rep_funcs(epel_v, 10, 16, 32, avx2)
 +mc_rep_funcs(epel_v, 10, 16, 48, avx2)
 +mc_rep_funcs(epel_v, 10, 32, 64, avx2)
 +
 +
 +mc_rep_funcs(epel_hv,  8, 32, 64, avx2)
 +
 +mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
 +mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
 +mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
 +
 +mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
 +mc_rep_mixs_8(qpel_h ,  48, 32, 16, avx2, sse4)
 +
 +mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
 +mc_rep_mixs_8(qpel_v,  48, 32, 16, avx2, sse4)
 +
 +mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
 +mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
 +mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
 +
 +mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
 +mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
 +mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
 +
 +mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
 +mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
 +mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
 +
 +#endif //AVX2
 +
 +mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
 +mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
 +mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
 +mc_rep_funcs(pel_pixels, 8,  8, 24, sse4)
 +mc_rep_funcs(pel_pixels,10,  8, 64, sse4)
 +mc_rep_funcs(pel_pixels,10,  8, 48, sse4)
 +mc_rep_funcs(pel_pixels,10,  8, 32, sse4)
 +mc_rep_funcs(pel_pixels,10,  8, 24, sse4)
 +mc_rep_funcs(pel_pixels,10,  8, 16, sse4)
 +mc_rep_funcs(pel_pixels,10,  4, 12, sse4)
 +mc_rep_funcs(pel_pixels,12,  8, 64, sse4)
 +mc_rep_funcs(pel_pixels,12,  8, 48, sse4)
 +mc_rep_funcs(pel_pixels,12,  8, 32, sse4)
 +mc_rep_funcs(pel_pixels,12,  8, 24, sse4)
 +mc_rep_funcs(pel_pixels,12,  8, 16, sse4)
 +mc_rep_funcs(pel_pixels,12,  4, 12, sse4)
 +
 +mc_rep_funcs(epel_h, 8, 16, 64, sse4)
 +mc_rep_funcs(epel_h, 8, 16, 48, sse4)
 +mc_rep_funcs(epel_h, 8, 16, 32, sse4)
 +mc_rep_funcs(epel_h, 8,  8, 24, sse4)
 +mc_rep_funcs(epel_h,10,  8, 64, sse4)
 +mc_rep_funcs(epel_h,10,  8, 48, sse4)
 +mc_rep_funcs(epel_h,10,  8, 32, sse4)
 +mc_rep_funcs(epel_h,10,  8, 24, sse4)
 +mc_rep_funcs(epel_h,10,  8, 16, sse4)
 +mc_rep_funcs(epel_h,10,  4, 12, sse4)
 +mc_rep_funcs(epel_h,12,  8, 64, sse4)
 +mc_rep_funcs(epel_h,12,  8, 48, sse4)
 +mc_rep_funcs(epel_h,12,  8, 32, sse4)
 +mc_rep_funcs(epel_h,12,  8, 24, sse4)
 +mc_rep_funcs(epel_h,12,  8, 16, sse4)
 +mc_rep_funcs(epel_h,12,  4, 12, sse4)
 +mc_rep_funcs(epel_v, 8, 16, 64, sse4)
 +mc_rep_funcs(epel_v, 8, 16, 48, sse4)
 +mc_rep_funcs(epel_v, 8, 16, 32, sse4)
 +mc_rep_funcs(epel_v, 8,  8, 24, sse4)
 +mc_rep_funcs(epel_v,10,  8, 64, sse4)
 +mc_rep_funcs(epel_v,10,  8, 48, sse4)
 +mc_rep_funcs(epel_v,10,  8, 32, sse4)
 +mc_rep_funcs(epel_v,10,  8, 24, sse4)
 +mc_rep_funcs(epel_v,10,  8, 16, sse4)
 +mc_rep_funcs(epel_v,10,  4, 12, sse4)
 +mc_rep_funcs(epel_v,12,  8, 64, sse4)
 +mc_rep_funcs(epel_v,12,  8, 48, sse4)
 +mc_rep_funcs(epel_v,12,  8, 32, sse4)
 +mc_rep_funcs(epel_v,12,  8, 24, sse4)
 +mc_rep_funcs(epel_v,12,  8, 16, sse4)
 +mc_rep_funcs(epel_v,12,  4, 12, sse4)
 +mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
 +mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
 +mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
 +mc_rep_funcs(epel_hv, 8,  8, 24, sse4)
 +mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4)
 +mc_rep_funcs(epel_hv,10,  8, 64, sse4)
 +mc_rep_funcs(epel_hv,10,  8, 48, sse4)
 +mc_rep_funcs(epel_hv,10,  8, 32, sse4)
 +mc_rep_funcs(epel_hv,10,  8, 24, sse4)
 +mc_rep_funcs(epel_hv,10,  8, 16, sse4)
 +mc_rep_funcs(epel_hv,10,  4, 12, sse4)
 +mc_rep_funcs(epel_hv,12,  8, 64, sse4)
 +mc_rep_funcs(epel_hv,12,  8, 48, sse4)
 +mc_rep_funcs(epel_hv,12,  8, 32, sse4)
 +mc_rep_funcs(epel_hv,12,  8, 24, sse4)
 +mc_rep_funcs(epel_hv,12,  8, 16, sse4)
 +mc_rep_funcs(epel_hv,12,  4, 12, sse4)
 +
 +mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
 +mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
 +mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
 +mc_rep_funcs(qpel_h, 8,  8, 24, sse4)
 +mc_rep_funcs(qpel_h,10,  8, 64, sse4)
 +mc_rep_funcs(qpel_h,10,  8, 48, sse4)
 +mc_rep_funcs(qpel_h,10,  8, 32, sse4)
 +mc_rep_funcs(qpel_h,10,  8, 24, sse4)
 +mc_rep_funcs(qpel_h,10,  8, 16, sse4)
 +mc_rep_funcs(qpel_h,10,  4, 12, sse4)
 +mc_rep_funcs(qpel_h,12,  8, 64, sse4)
 +mc_rep_funcs(qpel_h,12,  8, 48, sse4)
 +mc_rep_funcs(qpel_h,12,  8, 32, sse4)
 +mc_rep_funcs(qpel_h,12,  8, 24, sse4)
 +mc_rep_funcs(qpel_h,12,  8, 16, sse4)
 +mc_rep_funcs(qpel_h,12,  4, 12, sse4)
 +mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
 +mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
 +mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
 +mc_rep_funcs(qpel_v, 8,  8, 24, sse4)
 +mc_rep_funcs(qpel_v,10,  8, 64, sse4)
 +mc_rep_funcs(qpel_v,10,  8, 48, sse4)
 +mc_rep_funcs(qpel_v,10,  8, 32, sse4)
 +mc_rep_funcs(qpel_v,10,  8, 24, sse4)
 +mc_rep_funcs(qpel_v,10,  8, 16, sse4)
 +mc_rep_funcs(qpel_v,10,  4, 12, sse4)
 +mc_rep_funcs(qpel_v,12,  8, 64, sse4)
 +mc_rep_funcs(qpel_v,12,  8, 48, sse4)
 +mc_rep_funcs(qpel_v,12,  8, 32, sse4)
 +mc_rep_funcs(qpel_v,12,  8, 24, sse4)
 +mc_rep_funcs(qpel_v,12,  8, 16, sse4)
 +mc_rep_funcs(qpel_v,12,  4, 12, sse4)
 +mc_rep_funcs(qpel_hv, 8,  8, 64, sse4)
 +mc_rep_funcs(qpel_hv, 8,  8, 48, sse4)
 +mc_rep_funcs(qpel_hv, 8,  8, 32, sse4)
 +mc_rep_funcs(qpel_hv, 8,  8, 24, sse4)
 +mc_rep_funcs(qpel_hv, 8,  8, 16, sse4)
 +mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4)
 +mc_rep_funcs(qpel_hv,10,  8, 64, sse4)
 +mc_rep_funcs(qpel_hv,10,  8, 48, sse4)
 +mc_rep_funcs(qpel_hv,10,  8, 32, sse4)
 +mc_rep_funcs(qpel_hv,10,  8, 24, sse4)
 +mc_rep_funcs(qpel_hv,10,  8, 16, sse4)
 +mc_rep_funcs(qpel_hv,10,  4, 12, sse4)
 +mc_rep_funcs(qpel_hv,12,  8, 64, sse4)
 +mc_rep_funcs(qpel_hv,12,  8, 48, sse4)
 +mc_rep_funcs(qpel_hv,12,  8, 32, sse4)
 +mc_rep_funcs(qpel_hv,12,  8, 24, sse4)
 +mc_rep_funcs(qpel_hv,12,  8, 16, sse4)
 +mc_rep_funcs(qpel_hv,12,  4, 12, sse4)
 +
 +#define mc_rep_uni_w(bitd, step, W, opt) \
 +void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
 +                                               int height, int denom,  int _wx, int _ox)                                \
 +{                                                                                                                       \
 +    int i;                                                                                                              \
 +    int16_t *src;                                                                                                       \
 +    uint8_t *dst;                                                                                                       \
 +    for (i = 0; i < W; i += step) {                                                                                     \
 +        src= _src + i;                                                                                                  \
 +        dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
 +        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src,                                   \
 +                                                     height, denom, _wx, _ox);                                          \
 +    }                                                                                                                   \
 +}
 +
 +mc_rep_uni_w(8, 6, 12, sse4)
 +mc_rep_uni_w(8, 8, 16, sse4)
 +mc_rep_uni_w(8, 8, 24, sse4)
 +mc_rep_uni_w(8, 8, 32, sse4)
 +mc_rep_uni_w(8, 8, 48, sse4)
 +mc_rep_uni_w(8, 8, 64, sse4)
 +
 +mc_rep_uni_w(10, 6, 12, sse4)
 +mc_rep_uni_w(10, 8, 16, sse4)
 +mc_rep_uni_w(10, 8, 24, sse4)
 +mc_rep_uni_w(10, 8, 32, sse4)
 +mc_rep_uni_w(10, 8, 48, sse4)
 +mc_rep_uni_w(10, 8, 64, sse4)
 +
 +mc_rep_uni_w(12, 6, 12, sse4)
 +mc_rep_uni_w(12, 8, 16, sse4)
 +mc_rep_uni_w(12, 8, 24, sse4)
 +mc_rep_uni_w(12, 8, 32, sse4)
 +mc_rep_uni_w(12, 8, 48, sse4)
 +mc_rep_uni_w(12, 8, 64, sse4)
 +
 +#define mc_rep_bi_w(bitd, step, W, opt) \
 +void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
 +                                              int16_t *_src2, int height,                                               \
 +                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
 +{                                                                                                                       \
 +    int i;                                                                                                              \
 +    int16_t *src;                                                                                                       \
 +    int16_t *src2;                                                                                                      \
 +    uint8_t *dst;                                                                                                       \
 +    for (i = 0; i < W; i += step) {                                                                                     \
 +        src  = _src  + i;                                                                                               \
 +        src2 = _src2 + i;                                                                                               \
 +        dst  = _dst  + (i * ((bitd + 7) / 8));                                                                          \
 +        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2,                             \
 +                                                     height, denom, _wx0, _wx1, _ox0, _ox1);                             \
 +    }                                                                                                                   \
 +}
 +
 +mc_rep_bi_w(8, 6, 12, sse4)
 +mc_rep_bi_w(8, 8, 16, sse4)
 +mc_rep_bi_w(8, 8, 24, sse4)
 +mc_rep_bi_w(8, 8, 32, sse4)
 +mc_rep_bi_w(8, 8, 48, sse4)
 +mc_rep_bi_w(8, 8, 64, sse4)
 +
 +mc_rep_bi_w(10, 6, 12, sse4)
 +mc_rep_bi_w(10, 8, 16, sse4)
 +mc_rep_bi_w(10, 8, 24, sse4)
 +mc_rep_bi_w(10, 8, 32, sse4)
 +mc_rep_bi_w(10, 8, 48, sse4)
 +mc_rep_bi_w(10, 8, 64, sse4)
 +
 +mc_rep_bi_w(12, 6, 12, sse4)
 +mc_rep_bi_w(12, 8, 16, sse4)
 +mc_rep_bi_w(12, 8, 24, sse4)
 +mc_rep_bi_w(12, 8, 32, sse4)
 +mc_rep_bi_w(12, 8, 48, sse4)
 +mc_rep_bi_w(12, 8, 64, sse4)
 +
 +#define mc_uni_w_func(name, bitd, W, opt) \
 +void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
 +                                                      uint8_t *_src, ptrdiff_t _srcstride,          \
 +                                                      int height, int denom,                        \
 +                                                      int _wx, int _ox,                             \
 +                                                      intptr_t mx, intptr_t my, int width)          \
 +{                                                                                                   \
 +    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                            \
 +    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
 +    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
  }
 -#else
 -#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
 -#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
 -
 -#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
 -void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
 -                                                         uint8_t *src, ptrdiff_t srcstride,                   \
 -                                                         int height, int mx, int my, int16_t *mcbuffer);      \
 -void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
 -                                                         uint8_t *src, ptrdiff_t srcstride,                   \
 -                                                         int height, int mx, int my, int16_t *mcbuffer);      \
 -EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
 -
 -EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
 -EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
 -EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
 -EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
 -EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
 -EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
 -
 -EPEL_FUNCS(4,  10, avx, avx, avx)
 -EPEL_FUNCS(8,  10, avx, avx, avx)
 -EPEL_FUNCS(12, 10, avx, avx, avx)
 -EPEL_FUNCS(16, 10, avx, avx, avx)
 -EPEL_FUNCS(24, 10, avx, avx, avx)
 -EPEL_FUNCS(32, 10, avx, avx, avx)
 -
 -#define PUT_PRED(width, depth, cf_uw, cf_w) \
 -void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,                   \
 -                                                                       int16_t *src, ptrdiff_t srcstride,                   \
 -                                                                       int height);                                         \
 -void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,               \
 -                                                                           int16_t *src1, int16_t *src2,                    \
 -                                                                           ptrdiff_t srcstride, int height);                \
 -void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset,          \
 -                                                                    uint8_t *dst, ptrdiff_t dststride,                      \
 -                                                                    int16_t *src, ptrdiff_t srcstride,                      \
 -                                                                    int height);                                            \
 -void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1,    \
 -                                                                        int16_t offset0, int16_t offset1,                   \
 -                                                                        uint8_t *dst, ptrdiff_t dststride,                  \
 -                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride,  \
 -                                                                        int height);
 -
 -PUT_PRED(4,  8, sse2, sse4)
 -PUT_PRED(8,  8, sse2, sse4)
 -PUT_PRED(12, 8, sse2, sse4)
 -PUT_PRED(16, 8, sse2, sse4)
 -PUT_PRED(24, 8, sse2, sse4)
 -PUT_PRED(32, 8, sse2, sse4)
 -PUT_PRED(48, 8, sse2, sse4)
 -PUT_PRED(64, 8, sse2, sse4)
 -
 -PUT_PRED(4,  10, sse2, sse4)
 -PUT_PRED(8,  10, sse2, sse4)
 -PUT_PRED(12, 10, sse2, sse4)
 -PUT_PRED(16, 10, sse2, sse4)
 -PUT_PRED(24, 10, sse2, sse4)
 -PUT_PRED(32, 10, sse2, sse4)
 -PUT_PRED(48, 10, sse2, sse4)
 -PUT_PRED(64, 10, sse2, sse4)
 +
 +#define mc_uni_w_funcs(name, bitd, opt)      \
 +        mc_uni_w_func(name, bitd, 4, opt)    \
 +        mc_uni_w_func(name, bitd, 8, opt)    \
 +        mc_uni_w_func(name, bitd, 12, opt)   \
 +        mc_uni_w_func(name, bitd, 16, opt)   \
 +        mc_uni_w_func(name, bitd, 24, opt)   \
 +        mc_uni_w_func(name, bitd, 32, opt)   \
 +        mc_uni_w_func(name, bitd, 48, opt)   \
 +        mc_uni_w_func(name, bitd, 64, opt)
 +
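The weighted wrappers compose in two stages: mc_uni_w_func first runs the plain interpolation into an aligned int16_t scratch buffer, then hands that buffer to the matching uni_w store, which applies denom/wx/ox and writes pixels. For example, mc_uni_w_func(qpel_h, 8, 16, sse4), generated by mc_uni_w_funcs(qpel_h, 8, sse4) below, expands roughly as sketched here (LOCAL_ALIGNED_16 and MAX_PB_SIZE come from the existing FFmpeg headers):

    /* Illustrative expansion of mc_uni_w_func(qpel_h, 8, 16, sse4):
     * stage 1 computes the horizontal qpel interpolation into temp,
     * stage 2 applies the uni weighted prediction and stores 8-bit pixels. */
    void ff_hevc_put_hevc_uni_w_qpel_h16_8_sse4(uint8_t *_dst, ptrdiff_t _dststride,
                                                uint8_t *_src, ptrdiff_t _srcstride,
                                                int height, int denom, int _wx, int _ox,
                                                intptr_t mx, intptr_t my, int width)
    {
        LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);
        ff_hevc_put_hevc_qpel_h16_8_sse4(temp, _src, _srcstride, height, mx, my, width);
        ff_hevc_put_hevc_uni_w16_8_sse4(_dst, _dststride, temp, height, denom, _wx, _ox);
    }
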
 +mc_uni_w_funcs(pel_pixels, 8, sse4)
 +mc_uni_w_func(pel_pixels, 8, 6, sse4)
 +mc_uni_w_funcs(epel_h, 8, sse4)
 +mc_uni_w_func(epel_h, 8, 6, sse4)
 +mc_uni_w_funcs(epel_v, 8, sse4)
 +mc_uni_w_func(epel_v, 8, 6, sse4)
 +mc_uni_w_funcs(epel_hv, 8, sse4)
 +mc_uni_w_func(epel_hv, 8, 6, sse4)
 +mc_uni_w_funcs(qpel_h, 8, sse4)
 +mc_uni_w_funcs(qpel_v, 8, sse4)
 +mc_uni_w_funcs(qpel_hv, 8, sse4)
 +
 +mc_uni_w_funcs(pel_pixels, 10, sse4)
 +mc_uni_w_func(pel_pixels, 10, 6, sse4)
 +mc_uni_w_funcs(epel_h, 10, sse4)
 +mc_uni_w_func(epel_h, 10, 6, sse4)
 +mc_uni_w_funcs(epel_v, 10, sse4)
 +mc_uni_w_func(epel_v, 10, 6, sse4)
 +mc_uni_w_funcs(epel_hv, 10, sse4)
 +mc_uni_w_func(epel_hv, 10, 6, sse4)
 +mc_uni_w_funcs(qpel_h, 10, sse4)
 +mc_uni_w_funcs(qpel_v, 10, sse4)
 +mc_uni_w_funcs(qpel_hv, 10, sse4)
 +
 +mc_uni_w_funcs(pel_pixels, 12, sse4)
 +mc_uni_w_func(pel_pixels, 12, 6, sse4)
 +mc_uni_w_funcs(epel_h, 12, sse4)
 +mc_uni_w_func(epel_h, 12, 6, sse4)
 +mc_uni_w_funcs(epel_v, 12, sse4)
 +mc_uni_w_func(epel_v, 12, 6, sse4)
 +mc_uni_w_funcs(epel_hv, 12, sse4)
 +mc_uni_w_func(epel_hv, 12, 6, sse4)
 +mc_uni_w_funcs(qpel_h, 12, sse4)
 +mc_uni_w_funcs(qpel_v, 12, sse4)
 +mc_uni_w_funcs(qpel_hv, 12, sse4)
 +
 +#define mc_bi_w_func(name, bitd, W, opt) \
 +void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
 +                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
 +                                                     int16_t *_src2,                                 \
 +                                                     int height, int denom,                          \
 +                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
 +                                                     intptr_t mx, intptr_t my, int width)            \
 +{                                                                                                    \
 +    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                             \
 +    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
 +    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2,                         \
 +                                              height, denom, _wx0, _wx1, _ox0, _ox1);                \
 +}
 +
 +#define mc_bi_w_funcs(name, bitd, opt)      \
 +        mc_bi_w_func(name, bitd, 4, opt)    \
 +        mc_bi_w_func(name, bitd, 8, opt)    \
 +        mc_bi_w_func(name, bitd, 12, opt)   \
 +        mc_bi_w_func(name, bitd, 16, opt)   \
 +        mc_bi_w_func(name, bitd, 24, opt)   \
 +        mc_bi_w_func(name, bitd, 32, opt)   \
 +        mc_bi_w_func(name, bitd, 48, opt)   \
 +        mc_bi_w_func(name, bitd, 64, opt)
 +
 +mc_bi_w_funcs(pel_pixels, 8, sse4)
 +mc_bi_w_func(pel_pixels, 8, 6, sse4)
 +mc_bi_w_funcs(epel_h, 8, sse4)
 +mc_bi_w_func(epel_h, 8, 6, sse4)
 +mc_bi_w_funcs(epel_v, 8, sse4)
 +mc_bi_w_func(epel_v, 8, 6, sse4)
 +mc_bi_w_funcs(epel_hv, 8, sse4)
 +mc_bi_w_func(epel_hv, 8, 6, sse4)
 +mc_bi_w_funcs(qpel_h, 8, sse4)
 +mc_bi_w_funcs(qpel_v, 8, sse4)
 +mc_bi_w_funcs(qpel_hv, 8, sse4)
 +
 +mc_bi_w_funcs(pel_pixels, 10, sse4)
 +mc_bi_w_func(pel_pixels, 10, 6, sse4)
 +mc_bi_w_funcs(epel_h, 10, sse4)
 +mc_bi_w_func(epel_h, 10, 6, sse4)
 +mc_bi_w_funcs(epel_v, 10, sse4)
 +mc_bi_w_func(epel_v, 10, 6, sse4)
 +mc_bi_w_funcs(epel_hv, 10, sse4)
 +mc_bi_w_func(epel_hv, 10, 6, sse4)
 +mc_bi_w_funcs(qpel_h, 10, sse4)
 +mc_bi_w_funcs(qpel_v, 10, sse4)
 +mc_bi_w_funcs(qpel_hv, 10, sse4)
 +
 +mc_bi_w_funcs(pel_pixels, 12, sse4)
 +mc_bi_w_func(pel_pixels, 12, 6, sse4)
 +mc_bi_w_funcs(epel_h, 12, sse4)
 +mc_bi_w_func(epel_h, 12, 6, sse4)
 +mc_bi_w_funcs(epel_v, 12, sse4)
 +mc_bi_w_func(epel_v, 12, 6, sse4)
 +mc_bi_w_funcs(epel_hv, 12, sse4)
 +mc_bi_w_func(epel_hv, 12, 6, sse4)
 +mc_bi_w_funcs(qpel_h, 12, sse4)
 +mc_bi_w_funcs(qpel_v, 12, sse4)
 +mc_bi_w_funcs(qpel_hv, 12, sse4)
 +#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 +
 +#define SAO_BAND_FILTER_FUNCS(bitd, opt)                                                                                   \
 +void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
 +                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
 +void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
 +                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
 +void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
 +                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
 +void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
 +                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
 +void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
 +                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
 +
 +SAO_BAND_FILTER_FUNCS(8,  sse2)
 +SAO_BAND_FILTER_FUNCS(10, sse2)
 +SAO_BAND_FILTER_FUNCS(12, sse2)
 +SAO_BAND_FILTER_FUNCS(8,   avx)
 +SAO_BAND_FILTER_FUNCS(10,  avx)
 +SAO_BAND_FILTER_FUNCS(12,  avx)
 +SAO_BAND_FILTER_FUNCS(8,  avx2)
 +SAO_BAND_FILTER_FUNCS(10, avx2)
 +SAO_BAND_FILTER_FUNCS(12, avx2)
 +
 +#define SAO_BAND_INIT(bitd, opt) do {                                       \
 +    c->sao_band_filter[0]      = ff_hevc_sao_band_filter_8_##bitd##_##opt;  \
 +    c->sao_band_filter[1]      = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
 +    c->sao_band_filter[2]      = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
 +    c->sao_band_filter[3]      = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
 +    c->sao_band_filter[4]      = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
 +} while (0)
 +
 +#define SAO_EDGE_FILTER_FUNCS(bitd, opt)                                                                                    \
 +void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,  \
 +                                              int eo, int width, int height);                                               \
 +void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
 +                                               int eo, int width, int height);                                              \
 +void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
 +                                               int eo, int width, int height);                                              \
 +void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
 +                                               int eo, int width, int height);                                              \
 +void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
 +                                               int eo, int width, int height);                                              \
 +
 +SAO_EDGE_FILTER_FUNCS(8, ssse3)
 +SAO_EDGE_FILTER_FUNCS(8, avx2)
 +SAO_EDGE_FILTER_FUNCS(10, sse2)
 +SAO_EDGE_FILTER_FUNCS(10, avx2)
 +SAO_EDGE_FILTER_FUNCS(12, sse2)
 +SAO_EDGE_FILTER_FUNCS(12, avx2)
 +
 +#define SAO_EDGE_INIT(bitd, opt) do {                                       \
 +    c->sao_edge_filter[0]      = ff_hevc_sao_edge_filter_8_##bitd##_##opt;  \
 +    c->sao_edge_filter[1]      = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
 +    c->sao_edge_filter[2]      = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
 +    c->sao_edge_filter[3]      = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
 +    c->sao_edge_filter[4]      = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
 +} while (0)
 +
 +#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
 +        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
 +        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
 +        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
 +        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
 +        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
 +        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
 +        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
 +        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
 +        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
 +#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)           \
 +        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
 +        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
 +        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
 +        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
 +        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
 +        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
 +        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
 +        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
  
  void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
  {
@@@ -705,152 -282,31 +718,163 @@@
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 +
++                c->idct[2] = ff_hevc_idct_16x16_8_sse2;
++                c->idct[3] = ff_hevc_idct_32x32_8_sse2;
 +            }
 +            SAO_BAND_INIT(8, sse2);
  
              c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
              c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
              c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
  
+             c->idct[0]    = ff_hevc_idct_4x4_8_sse2;
+             c->idct[1]    = ff_hevc_idct_8x8_8_sse2;
 -            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+ 
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
          }
          if (EXTERNAL_SSSE3(cpu_flags)) {
 -            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
 -            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
 +            if(ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            }
 +            SAO_EDGE_INIT(8, ssse3);
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
  
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
++
++                c->idct[2] = ff_hevc_idct_16x16_8_avx;
++                c->idct[3] = ff_hevc_idct_32x32_8_avx;
 +            }
 +            SAO_BAND_INIT(8, avx);
 +
+             c->idct[0] = ff_hevc_idct_4x4_8_avx;
+             c->idct[1] = ff_hevc_idct_8x8_8_avx;
++
 +            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
 +        }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
 +            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
 +            }
 +            SAO_BAND_INIT(8, avx2);
 +
 +            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
 +            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
 +            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 +
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
          }
      } else if (bit_depth == 10) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
@@@ -861,254 -316,88 +885,267 @@@
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
++
++                c->idct[2] = ff_hevc_idct_16x16_10_sse2;
++                c->idct[3] = ff_hevc_idct_32x32_10_sse2;
 +            }
 +            SAO_BAND_INIT(10, sse2);
 +            SAO_EDGE_INIT(10, sse2);
  
              c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
              c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
              c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  
+             c->idct[0]    = ff_hevc_idct_4x4_10_sse2;
+             c->idct[1]    = ff_hevc_idct_8x8_10_sse2;
 -            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+ 
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
 +        }
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
++
++                c->idct[2] = ff_hevc_idct_16x16_10_avx;
++                c->idct[3] = ff_hevc_idct_32x32_10_avx;
 +            }
++
+             c->idct[0] = ff_hevc_idct_4x4_10_avx;
+             c->idct[1] = ff_hevc_idct_8x8_10_avx;
 -        }
 -    }
+ 
 -#if ARCH_X86_64
 -    if (bit_depth == 8) {
 -        if (EXTERNAL_SSE2(cpu_flags)) {
 -            c->idct[2] = ff_hevc_idct_16x16_8_sse2;
 -            c->idct[3] = ff_hevc_idct_32x32_8_sse2;
 +            SAO_BAND_INIT(10, avx);
          }
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
          }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
  
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
 -        }
 +                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
 +                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
 +                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
 +                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
 +                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
 +                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
 +                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
 +            }
 +            SAO_BAND_INIT(10, avx2);
 +            SAO_EDGE_INIT(10, avx2);
 +
 +            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
  
 -        if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 -            c->idct[2] = ff_hevc_idct_16x16_8_avx;
 -            c->idct[3] = ff_hevc_idct_32x32_8_avx;
          }
 -        if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +    } else if (bit_depth == 12) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
 +            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
 +            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
          }
 -    } else if (bit_depth == 10) {
          if (EXTERNAL_SSE2(cpu_flags)) {
 -            c->idct[2] = ff_hevc_idct_16x16_10_sse2;
 -            c->idct[3] = ff_hevc_idct_32x32_10_sse2;
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 +            }
 +            SAO_BAND_INIT(12, sse2);
 +            SAO_EDGE_INIT(12, sse2);
 +
 +            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
          }
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
          }
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
 -            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
 -            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 -            c->idct[2] = ff_hevc_idct_16x16_10_avx;
 -            c->idct[3] = ff_hevc_idct_32x32_10_avx;
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
 +            }
 +            SAO_BAND_INIT(12, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
 +            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
 +
 +            SAO_BAND_INIT(12, avx2);
 +            SAO_EDGE_INIT(12, avx2);
          }
      }
 -#endif /* ARCH_X86_64 */
  }
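
The function-pointer tables touched above follow a simple convention: c->idct[0] through c->idct[3] cover the 4x4, 8x8, 16x16 and 32x32 transforms, the 16x16/32x32 kernels are additionally gated on ARCH_X86_64, and because the EXTERNAL_AVX() check runs after EXTERNAL_SSE2(), its assignments overwrite the SSE2 pointers so the fastest supported version ends up in the table. Below is a minimal sketch of that dispatch shape for the 10-bit IDCTs, reusing only names visible in the diff; the includes, function signature and surrounding structure are simplified assumptions, not a copy of the real hevcdsp_init.c:

    /* Illustrative sketch, not the actual libavcodec/x86/hevcdsp_init.c:
     * includes and surrounding structure are assumed; the guarded
     * assignments mirror the 10-bit IDCT hunks in the diff above. */
    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"
    #include "libavcodec/hevcdsp.h"

    void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
    {
        int cpu_flags = av_get_cpu_flags();

        if (bit_depth == 10) {
            if (EXTERNAL_SSE2(cpu_flags)) {
                c->idct[0] = ff_hevc_idct_4x4_10_sse2;       /* 4x4  */
                c->idct[1] = ff_hevc_idct_8x8_10_sse2;       /* 8x8  */
                if (ARCH_X86_64) {                           /* larger kernels need x86-64 */
                    c->idct[2] = ff_hevc_idct_16x16_10_sse2;
                    c->idct[3] = ff_hevc_idct_32x32_10_sse2;
                }
            }
            if (EXTERNAL_AVX(cpu_flags)) {
                /* Checked after SSE2, so these overwrite the SSE2 pointers
                 * whenever AVX is available. */
                c->idct[0] = ff_hevc_idct_4x4_10_avx;
                c->idct[1] = ff_hevc_idct_8x8_10_avx;
                if (ARCH_X86_64) {
                    c->idct[2] = ff_hevc_idct_16x16_10_avx;
                    c->idct[3] = ff_hevc_idct_32x32_10_avx;
                }
            }
        }
    }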


