[FFmpeg-devel] [PATCH 30/34] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

Wed Mar 8 12:01:10 EET 2017

The idct32x32 function actually pushed d8-d15 onto the stack even
though it didn't clobber them; there are plenty of registers that
can be used to allow keeping all the idct coefficients in registers
without having to reload different subsets of them at different
stages in the transform.

After this, we still can skip pushing d12-d15.

Before:
vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3
After:
vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3

This is cherrypicked from libav commit
65aa002d54433154a6924dc13e498bec98451ad0.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 110 +++++++++++++++----------------------
 1 file changed, 43 insertions(+), 67 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index be65eb7..dd9fde1 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1123,18 +1123,14 @@ endfunc
 .endm
 
 function idct32_odd
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
-        dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
-        dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
-        dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
-        dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
-        dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
-        dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
-        dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
-        ld1             {v0.8h}, [x10]
+        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
 
         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
@@ -1153,18 +1149,14 @@ function idct32_odd
 endfunc
 
 function idct32_odd_half
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
-        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
-        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
-        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
-        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
-        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
-        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
-        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
-        ld1             {v0.8h}, [x10]
+        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
 
         butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
         butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
@@ -1183,18 +1175,14 @@ function idct32_odd_half
 endfunc
 
 function idct32_odd_quarter
-        ld1             {v0.8h,v1.8h}, [x11]
-
-        dsmull_h        v4,  v5,  v16, v0.h[0]
-        dsmull_h        v28, v29, v19, v0.h[7]
-        dsmull_h        v30, v31, v16, v0.h[1]
-        dsmull_h        v22, v23, v17, v1.h[6]
-        dsmull_h        v7,  v6,  v17, v1.h[7]
-        dsmull_h        v26, v27, v19, v0.h[6]
-        dsmull_h        v20, v21, v18, v1.h[0]
-        dsmull_h        v24, v25, v18, v1.h[1]
-
-        ld1             {v0.8h}, [x10]
+        dsmull_h        v4,  v5,  v16, v8.h[0]
+        dsmull_h        v28, v29, v19, v8.h[7]
+        dsmull_h        v30, v31, v16, v8.h[1]
+        dsmull_h        v22, v23, v17, v9.h[6]
+        dsmull_h        v7,  v6,  v17, v9.h[7]
+        dsmull_h        v26, v27, v19, v8.h[6]
+        dsmull_h        v20, v21, v18, v9.h[0]
+        dsmull_h        v24, v25, v18, v9.h[1]
 
         neg             v28.4s, v28.4s
         neg             v29.4s, v29.4s
@@ -1240,12 +1228,8 @@ endfunc
 // x1 = unused
 // x2 = src
 // x9 = double input stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1\suffix\()_neon
         mov             x14, x30
-        ld1             {v0.8h,v1.8h}, [x10]
-
         movi            v2.8h, #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .macro store_rev a, b
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, \b
+        rev64           v3.8h, \b
         st1             {\a},  [x0], #16
-        rev64           v0.8h, \a
-        ext             v1.16b, v1.16b, v1.16b, #8
+        rev64           v2.8h, \a
+        ext             v3.16b, v3.16b, v3.16b, #8
         st1             {\b},  [x0], #16
-        ext             v0.16b, v0.16b, v0.16b, #8
-        st1             {v1.8h},  [x0], #16
-        st1             {v0.8h},  [x0], #16
+        ext             v2.16b, v2.16b, v2.16b, #8
+        st1             {v3.8h},  [x0], #16
+        st1             {v2.8h},  [x0], #16
 .endm
         store_rev       v16.8h, v24.8h
         store_rev       v17.8h, v25.8h
@@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon
         // subtracted from the output.
 .macro store_rev a, b
         ld1             {v4.8h},  [x0]
-        rev64           v1.8h, \b
+        rev64           v3.8h, \b
         add             v4.8h, v4.8h, \a
-        rev64           v0.8h, \a
+        rev64           v2.8h, \a
         st1             {v4.8h},  [x0], #16
-        ext             v1.16b, v1.16b, v1.16b, #8
+        ext             v3.16b, v3.16b, v3.16b, #8
         ld1             {v5.8h},  [x0]
-        ext             v0.16b, v0.16b, v0.16b, #8
+        ext             v2.16b, v2.16b, v2.16b, #8
         add             v5.8h, v5.8h, \b
         st1             {v5.8h},  [x0], #16
         ld1             {v6.8h},  [x0]
-        sub             v6.8h, v6.8h, v1.8h
+        sub             v6.8h, v6.8h, v3.8h
         st1             {v6.8h},  [x0], #16
         ld1             {v7.8h},  [x0]
-        sub             v7.8h, v7.8h, v0.8h
+        sub             v7.8h, v7.8h, v2.8h
         st1             {v7.8h},  [x0], #16
 .endm
 
@@ -1376,12 +1360,8 @@ endfunc
 // x2 = src (temp buffer)
 // x7 = negative double temp buffer stride
 // x9 = double temp buffer stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2\suffix\()_neon
         mov             x14, x30
-        ld1             {v0.8h,v1.8h}, [x10]
-
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon
         sub             v6.8h, v6.8h, \c
         sub             v7.8h, v7.8h, \d
 .endif
-        ld1             {v0.8b}, [x0], x1
-        ld1             {v1.8b}, [x0], x1
+        ld1             {v10.8b}, [x0], x1
+        ld1             {v11.8b}, [x0], x1
         srshr           v4.8h, v4.8h, #6
         ld1             {v2.8b}, [x0], x1
         srshr           v5.8h, v5.8h, #6
-        uaddw           v4.8h, v4.8h, v0.8b
+        uaddw           v4.8h, v4.8h, v10.8b
         ld1             {v3.8b}, [x0], x1
         srshr           v6.8h, v6.8h, #6
-        uaddw           v5.8h, v5.8h, v1.8b
+        uaddw           v5.8h, v5.8h, v11.8b
         srshr           v7.8h, v7.8h, #6
         sub             x0,  x0,  x1, lsl #2
         uaddw           v6.8h, v6.8h, v2.8b
@@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         b.eq            idct32x32_dc_add_neon
 
         movrel          x10, idct_coeffs
-        add             x11, x10, #32
         movrel          x12, min_eob_idct_idct_32, 2
 
         mov             x15, x30
 
-        stp             d14, d15, [sp, #-0x10]!
-        stp             d12, d13, [sp, #-0x10]!
         stp             d10, d11, [sp, #-0x10]!
         stp             d8,  d9,  [sp, #-0x10]!
 
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         mov             x9,  #128
         neg             x7,  x9
 
+        ld1             {v0.8h,v1.8h}, [x10], #32
+        ld1             {v8.8h,v9.8h}, [x10]
+
         cmp             w3,  #34
         b.le            idct32x32_quarter_add_neon
         cmp             w3,  #135
@@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 
         ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
-        ldp             d12, d13, [sp], 0x10
-        ldp             d14, d15, [sp], 0x10
 
         br              x15
 endfunc
@@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon
 
         ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
-        ldp             d12, d13, [sp], 0x10
-        ldp             d14, d15, [sp], 0x10
 
         br              x15
 endfunc
-- 
2.7.4