[FFmpeg-devel] [PATCH 5/5] x86: hevc_mc: put_pixels and 1d epel for x86_32

Christophe Gisquet christophe.gisquet at gmail.com
Sat Feb 7 19:49:40 CET 2015


Now that the xmm register and gpr counts have decreased, it is
possible to port to x86_32. To save on code, x86_32 is always
handled as if it were PIC, whether or not PIC is actually enabled.
---
 libavcodec/x86/hevc_mc.asm    | 39 +++++++++++++++++++++++--------
 libavcodec/x86/hevcdsp.h      |  4 +++-
 libavcodec/x86/hevcdsp_init.c | 53 +++++++++++++++++++++++++++----------------
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index a127a4d..085a212 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -59,9 +59,6 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
 %endmacro
 
 
-EPEL_TABLE  8,16, b, avx2
-EPEL_TABLE 10, 8, w, avx2
-
 EPEL_TABLE  8, 8, b, sse4
 EPEL_TABLE 10, 4, w, sse4
 EPEL_TABLE 12, 4, w, sse4
@@ -85,17 +82,20 @@ QPEL_TABLE  8, 8, b, sse4
 QPEL_TABLE 10, 4, w, sse4
 QPEL_TABLE 12, 4, w, sse4
 
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 QPEL_TABLE  8,16, b, avx2
 QPEL_TABLE 10, 8, w, avx2
 
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+%endif
+
 %define MAX_PB_SIZE  64
 
 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
 %define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
 
-%if ARCH_X86_64
-
 %macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
 %if %1 <= 4
     movq              %3, [%2]                                              ; load data from source2
@@ -139,7 +139,7 @@ QPEL_TABLE 10, 8, w, avx2
 %macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
 %if cpuflag(avx2)
 %assign %%offset 32
-%ifdef PIC
+%if ARCH_X86_32
     lea              %5q, [hevc_epel_filters_avx2_%1]
     %define FILTER %5q
 %else
@@ -147,7 +147,7 @@ QPEL_TABLE 10, 8, w, avx2
 %endif
 %else
 %assign %%offset 16
-%ifdef PIC
+%if ARCH_X86_32
     lea              %5q, [hevc_epel_filters_sse4_%1]
     %define FILTER %5q
 %else
@@ -759,9 +759,19 @@ cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 8, dst, dststride, src, srcstride,
     jnz               .loop                      ; height loop
     RET
 
+%if ARCH_X86_32
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 4, 7, 8, dst, dststride, src, srcstride, src2, height, mx
+    mov              r4d, mxm
+    EPEL_FILTER       %2, r4, m4, m5, r5
+    mov              r4d, r4m
+    mov              r5d, r5m
+%else
 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 8, dst, dststride, src, srcstride, src2, height, mx, rfilter
-    movdqa            m6, [pw_bi_%2]
+    movifnidn        mxd, mxm
     EPEL_FILTER       %2, mx, m4, m5, rfilter
+%endif
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_bi_%2]
 .loop
     EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
     EPEL_COMPUTE      %2, %1, m4, m5, 1
@@ -811,11 +821,18 @@ cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, 8, dst, dststride, src, srcstride,
     RET
 
 
+%if ARCH_X86_32
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 5, 7, 8, dst, dststride, src, srcstride, src2, height, r3src, my
+    mov              r5d, mym
+    EPEL_FILTER       %2, r5, m4, m5, r3src
+    mov              r5d, r5m
+%else
 cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, 8, dst, dststride, src, srcstride, src2, height, r3src, my
     movifnidn        myd, mym
+    EPEL_FILTER       %2, my, m4, m5, r3src
+%endif
     movdqa            m6, [pw_bi_%2]
     sub             srcq, srcstrideq
-    EPEL_FILTER       %2, my, m4, m5, r3src
     lea           r3srcq, [srcstrideq*3]
 .loop
     EPEL_LOAD         %2, srcq, srcstride, %1
@@ -832,6 +849,7 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, 8, dst, dststride, src, srcstride, s
 %endmacro
 
 
+%if ARCH_X86_64
 ; ******************************
 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
 ;                       uint8_t *_src, ptrdiff_t _srcstride,
@@ -1571,7 +1589,9 @@ WEIGHTING_FUNCS 2, 12
 WEIGHTING_FUNCS 4, 12
 WEIGHTING_FUNCS 6, 12
 WEIGHTING_FUNCS 8, 12
+%endif
 
+INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
 HEVC_PUT_HEVC_PEL_PIXELS  2, 8
 HEVC_PUT_HEVC_PEL_PIXELS  4, 8
 HEVC_PUT_HEVC_PEL_PIXELS  6, 8
@@ -1607,6 +1627,7 @@ HEVC_PUT_HEVC_EPEL 4, 12
 HEVC_PUT_HEVC_EPEL 6, 12
 HEVC_PUT_HEVC_EPEL 8, 12
 
+%if ARCH_X86_64
 HEVC_PUT_HEVC_EPEL_HV 2,  8
 HEVC_PUT_HEVC_EPEL_HV 4,  8
 HEVC_PUT_HEVC_EPEL_HV 6,  8
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index ad8168f..4f83d7b 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -36,8 +36,10 @@
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+if (ARCH_X86_64) { \
 dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
-dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt; \
+}
 
 
 #define PEL_PROTOTYPE(name, D, opt) \
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index e493033..b0abd27 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -163,7 +163,7 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
     mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
     mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
 
-#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+#if HAVE_SSE4_EXTERNAL
 
 #define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                       \
 void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,            \
@@ -237,7 +237,7 @@ mc_rep_mix_8(name, width1, width2, width3, opt1, opt2);            \
 mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2);         \
 mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
 
-#if HAVE_AVX2_EXTERNAL
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 
 mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
 mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4);
@@ -357,6 +357,8 @@ mc_rep_funcs(epel_v,12,  8, 32, sse4);
 mc_rep_funcs(epel_v,12,  8, 24, sse4);
 mc_rep_funcs(epel_v,12,  8, 16, sse4);
 mc_rep_funcs(epel_v,12,  4, 12, sse4);
+
+# if ARCH_X86_64
 mc_rep_funcs(epel_hv, 8, 16, 64, sse4);
 mc_rep_funcs(epel_hv, 8, 16, 48, sse4);
 mc_rep_funcs(epel_hv, 8, 16, 32, sse4);
@@ -618,7 +620,9 @@ mc_bi_w_func(epel_hv, 12, 6, sse4);
 mc_bi_w_funcs(qpel_h, 12, sse4);
 mc_bi_w_funcs(qpel_v, 12, sse4);
 mc_bi_w_funcs(qpel_hv, 12, sse4);
-#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+# endif // ~ARCH_X86_64
+
+#endif //HAVE_SSE4_EXTERNAL
 
 #define SAO_BAND_FILTER_FUNCS(bitd, opt)                                                                                   \
 void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
@@ -731,17 +735,20 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             }
             SAO_EDGE_INIT(8, ssse3);
         }
-        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+        if (EXTERNAL_SSE4(cpu_flags)) {
 
             EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
             EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
             EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+            if (ARCH_X86_64) {
+                EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+
+                QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+            }
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
@@ -880,16 +887,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
-        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+        if (EXTERNAL_SSE4(cpu_flags)) {
             EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
             EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
             EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
 
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+            if (ARCH_X86_64) {
+                EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
+
+                QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+            }
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
@@ -1087,12 +1097,15 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
             EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
             EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
-            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 
-            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
-            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
+            if (ARCH_X86_64) {
+                EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
+
+                QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
+                QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
+            }
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
-- 
1.9.2.msysgit.0



More information about the ffmpeg-devel mailing list