[FFmpeg-devel] [PATCH 1/2] lavu/sha: Fully unroll the transform function loops

Tue Sep 10 01:16:39 CEST 2013

crypto_bench results for SHA-1 and SHA-256 on an AMD Athlon X2 7750+, built with mingw32-w64 GCC 4.7.3 (x86_64):

Before:
lavu       SHA-1        size: 1048576  runs:   1024  time:    9.012 +- 0.162
lavu       SHA-256      size: 1048576  runs:   1024  time:   19.625 +- 0.173

After:
lavu       SHA-1        size: 1048576  runs:   1024  time:    7.948 +- 0.154
lavu       SHA-256      size: 1048576  runs:   1024  time:   17.841 +- 0.170

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavutil/sha.c | 115 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 48 deletions(-)

diff --git a/libavutil/sha.c b/libavutil/sha.c
index 8c4f8a0..0cf9424 100644
--- a/libavutil/sha.c
+++ b/libavutil/sha.c
@@ -98,39 +98,53 @@ static void sha1_transform(uint32_t state[5], const uint8_t buffer[64])
         a = t;
     }
 #else
-    for (i = 0; i < 15; i += 5) {
-        R0(a, b, c, d, e, 0 + i);
-        R0(e, a, b, c, d, 1 + i);
-        R0(d, e, a, b, c, 2 + i);
-        R0(c, d, e, a, b, 3 + i);
-        R0(b, c, d, e, a, 4 + i);
-    }
+
+#define R1_0 \
+    R0(a, b, c, d, e, 0 + i); \
+    R0(e, a, b, c, d, 1 + i); \
+    R0(d, e, a, b, c, 2 + i); \
+    R0(c, d, e, a, b, 3 + i); \
+    R0(b, c, d, e, a, 4 + i); \
+    i += 5
+
+    i = 0;
+    R1_0; R1_0; R1_0;
     R0(a, b, c, d, e, 15);
     R1(e, a, b, c, d, 16);
     R1(d, e, a, b, c, 17);
     R1(c, d, e, a, b, 18);
     R1(b, c, d, e, a, 19);
-    for (i = 20; i < 40; i += 5) {
-        R2(a, b, c, d, e, 0 + i);
-        R2(e, a, b, c, d, 1 + i);
-        R2(d, e, a, b, c, 2 + i);
-        R2(c, d, e, a, b, 3 + i);
-        R2(b, c, d, e, a, 4 + i);
-    }
-    for (; i < 60; i += 5) {
-        R3(a, b, c, d, e, 0 + i);
-        R3(e, a, b, c, d, 1 + i);
-        R3(d, e, a, b, c, 2 + i);
-        R3(c, d, e, a, b, 3 + i);
-        R3(b, c, d, e, a, 4 + i);
-    }
-    for (; i < 80; i += 5) {
-        R4(a, b, c, d, e, 0 + i);
-        R4(e, a, b, c, d, 1 + i);
-        R4(d, e, a, b, c, 2 + i);
-        R4(c, d, e, a, b, 3 + i);
-        R4(b, c, d, e, a, 4 + i);
-    }
+
+#define R1_20 \
+    R2(a, b, c, d, e, 0 + i); \
+    R2(e, a, b, c, d, 1 + i); \
+    R2(d, e, a, b, c, 2 + i); \
+    R2(c, d, e, a, b, 3 + i); \
+    R2(b, c, d, e, a, 4 + i); \
+    i += 5
+
+    i = 20;
+    R1_20; R1_20; R1_20; R1_20;
+
+#define R1_40 \
+    R3(a, b, c, d, e, 0 + i); \
+    R3(e, a, b, c, d, 1 + i); \
+    R3(d, e, a, b, c, 2 + i); \
+    R3(c, d, e, a, b, 3 + i); \
+    R3(b, c, d, e, a, 4 + i); \
+    i += 5
+
+    R1_40; R1_40; R1_40; R1_40;
+
+#define R1_60 \
+    R4(a, b, c, d, e, 0 + i); \
+    R4(e, a, b, c, d, 1 + i); \
+    R4(d, e, a, b, c, 2 + i); \
+    R4(c, d, e, a, b, 3 + i); \
+    R4(b, c, d, e, a, 4 + i); \
+    i += 5
+
+    R1_60; R1_60; R1_60; R1_60;
 #endif
     state[0] += a;
     state[1] += b;
@@ -218,27 +232,32 @@ static void sha256_transform(uint32_t *state, const uint8_t buffer[64])
         a = T1 + T2;
     }
 #else
-    for (i = 0; i < 16 - 7;) {
-        ROUND256_0_TO_15(a, b, c, d, e, f, g, h);
-        ROUND256_0_TO_15(h, a, b, c, d, e, f, g);
-        ROUND256_0_TO_15(g, h, a, b, c, d, e, f);
-        ROUND256_0_TO_15(f, g, h, a, b, c, d, e);
-        ROUND256_0_TO_15(e, f, g, h, a, b, c, d);
-        ROUND256_0_TO_15(d, e, f, g, h, a, b, c);
-        ROUND256_0_TO_15(c, d, e, f, g, h, a, b);
-        ROUND256_0_TO_15(b, c, d, e, f, g, h, a);
-    }
 
-    for (; i < 64 - 7;) {
-        ROUND256_16_TO_63(a, b, c, d, e, f, g, h);
-        ROUND256_16_TO_63(h, a, b, c, d, e, f, g);
-        ROUND256_16_TO_63(g, h, a, b, c, d, e, f);
-        ROUND256_16_TO_63(f, g, h, a, b, c, d, e);
-        ROUND256_16_TO_63(e, f, g, h, a, b, c, d);
-        ROUND256_16_TO_63(d, e, f, g, h, a, b, c);
-        ROUND256_16_TO_63(c, d, e, f, g, h, a, b);
-        ROUND256_16_TO_63(b, c, d, e, f, g, h, a);
-    }
+    i = 0;
+#define R256_0 \
+    ROUND256_0_TO_15(a, b, c, d, e, f, g, h); \
+    ROUND256_0_TO_15(h, a, b, c, d, e, f, g); \
+    ROUND256_0_TO_15(g, h, a, b, c, d, e, f); \
+    ROUND256_0_TO_15(f, g, h, a, b, c, d, e); \
+    ROUND256_0_TO_15(e, f, g, h, a, b, c, d); \
+    ROUND256_0_TO_15(d, e, f, g, h, a, b, c); \
+    ROUND256_0_TO_15(c, d, e, f, g, h, a, b); \
+    ROUND256_0_TO_15(b, c, d, e, f, g, h, a)
+
+    R256_0; R256_0;
+
+#define R256_16 \
+    ROUND256_16_TO_63(a, b, c, d, e, f, g, h); \
+    ROUND256_16_TO_63(h, a, b, c, d, e, f, g); \
+    ROUND256_16_TO_63(g, h, a, b, c, d, e, f); \
+    ROUND256_16_TO_63(f, g, h, a, b, c, d, e); \
+    ROUND256_16_TO_63(e, f, g, h, a, b, c, d); \
+    ROUND256_16_TO_63(d, e, f, g, h, a, b, c); \
+    ROUND256_16_TO_63(c, d, e, f, g, h, a, b); \
+    ROUND256_16_TO_63(b, c, d, e, f, g, h, a)
+
+    R256_16; R256_16; R256_16;
+    R256_16; R256_16; R256_16;
 #endif
     state[0] += a;
     state[1] += b;
-- 
1.8.1.5