From 00e38cc2c60a10ef00f3e41e77ffc067cf522d8e Mon Sep 17 00:00:00 2001
From: jsteube
Date: Sun, 27 Aug 2017 19:36:07 +0200
Subject: [PATCH] Add VEGA specific inline assembly to improve all MD4, MD5,
 SHA1 and SHA256 based kernels

---
 OpenCL/inc_hash_functions.cl | 68 +++++++++++++------------
 OpenCL/inc_types.cl          | 97 ++++++++++++++++++++++++++++++++++++
 OpenCL/inc_vendor.cl         |  3 ++
 3 files changed, 135 insertions(+), 33 deletions(-)

diff --git a/OpenCL/inc_hash_functions.cl b/OpenCL/inc_hash_functions.cl
index c356d8244..cf7daf3a5 100644
--- a/OpenCL/inc_hash_functions.cl
+++ b/OpenCL/inc_hash_functions.cl
@@ -34,23 +34,23 @@
 #define MD4_STEP_S(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  a = __add3_S (a, x, t); \
   a = rotl32_S (a, s); \
 }
 
 #define MD4_STEP(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, x, t); \
   a = rotl32 (a, s); \
 }
 
 #define MD4_STEP0(f,a,b,c,d,K,s) \
 { \
-  a += K; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, K, t); \
   a = rotl32 (a, s); \
 }
 
@@ -72,7 +72,7 @@
 #define MD5_F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define MD5_G(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
 #define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
-#define MD5_I(x,y,z) (bitselect (0xffffffffU, (x), (z)) ^ (y))
+#define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))
 #define MD5_Fo(x,y,z) (bitselect ((z), (y), (x)))
 #define MD5_Go(x,y,z) (bitselect ((y), (x), (z)))
 #endif
@@ -89,8 +89,8 @@
 #define MD5_STEP_S(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  a = __add3_S (a, x, t); \
   a = rotl32_S (a, s); \
   a += b; \
 }
@@ -98,16 +98,16 @@
 #define MD5_STEP(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, x, t); \
   a = rotl32 (a, s); \
   a += b; \
 }
 
 #define MD5_STEP0(f,a,b,c,d,K,s) \
 { \
-  a += K; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, K, t); \
   a = rotl32 (a, s); \
   a += b; \
 }
@@ -139,8 +139,8 @@
 #define SHA1_STEP_S(f,a,b,c,d,e,x) \
 { \
   e += K; \
-  e += x; \
-  e += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  e = __add3_S (e, x, t); \
   e += rotl32_S (a, 5u); \
   b = rotl32_S (b, 30u); \
 }
@@ -148,24 +148,24 @@
 #define SHA1_STEP(f,a,b,c,d,e,x) \
 { \
   e += K; \
-  e += x; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, x, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
 
 #define SHA1_STEP0(f,a,b,c,d,e,x) \
 { \
-  e += K; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, K, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
 
 #define SHA1_STEPX(f,a,b,c,d,e,x) \
 { \
-  e += x; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, x, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
@@ -218,26 +218,28 @@
 #define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  h += K; \
-  h += x; \
-  h += SHA256_S3_S (e); \
-  h += F1 (e,f,g); \
+  const u32 t1 = SHA256_S3_S (e); \
+  const u32 t2 = F1 (e,f,g); \
+  h = __add3_S (h, K, x); \
+  h = __add3_S (h, t1, t2); \
   d += h; \
-  h += SHA256_S2_S (a); \
-  h += F0 (a,b,c); \
+  const u32 t3 = SHA256_S2_S (a); \
+  const u32 t4 = F0 (a,b,c); \
+  h = __add3_S (h, t3, t4); \
 }
 
 #define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w)
 
 #define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  h += K; \
-  h += x; \
-  h += SHA256_S3 (e); \
-  h += F1 (e,f,g); \
+  const u32x t1 = SHA256_S3 (e); \
+  const u32x t2 = F1 (e,f,g); \
+  h = __add3 (h, K, x); \
+  h = __add3 (h, t1, t2); \
   d += h; \
-  h += SHA256_S2 (a); \
-  h += F0 (a,b,c); \
+  const u32x t3 = SHA256_S2 (a); \
+  const u32x t4 = F0 (a,b,c); \
+  h = __add3 (h, t3, t4); \
 }
 
 #define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + y + SHA256_S0 (z) + w)
 
diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl
index b3d3ac39e..51c024aea 100644
--- a/OpenCL/inc_types.cl
+++ b/OpenCL/inc_types.cl
@@ -339,6 +339,82 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
 }
 #endif
 
+#ifdef IS_AMD_ROCM_VEGA
+static u32x __add3 (const u32x a, const u32x b, const u32x c)
+{
+  u32x r;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
+  #endif
+
+  #if VECT_SIZE == 2
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  #endif
+
+  #if VECT_SIZE == 4
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  #endif
+
+  #if VECT_SIZE == 8
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
+  #endif
+
+  #if VECT_SIZE == 16
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd));
"v"(c.sd)); + __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se)); + __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf)); + #endif + + return r; +} +#else +static u32x __add3 (const u32x a, const u32x b, const u32x c) +{ + return a + b + c; +} +#endif + +#ifdef IS_AMD_ROCM_VEGA +static u32 __add3_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c)); + + return r; +} +#else +static u32 __add3_S (const u32 a, const u32 b, const u32 c) +{ + return a + b + c; +} +#endif + #endif #ifdef IS_NV @@ -571,6 +647,17 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) return r; } + +static u32x __add3 (const u32x a, const u32x b, const u32x c) +{ + return a + b + c; +} + +static u32 __add3_S (const u32 a, const u32 b, const u32 c) +{ + return a + b + c; +} + #endif #ifdef IS_GENERIC @@ -710,6 +797,16 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) return (u32) (tmp); } +static u32x __add3 (const u32x a, const u32x b, const u32x c) +{ + return a + b + c; +} + +static u32 __add3_S (const u32 a, const u32 b, const u32 c) +{ + return a + b + c; +} + #endif typedef struct digest diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index b608854bf..f843fd3bf 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -36,6 +36,9 @@ #else #define IS_AMD #define IS_AMD_ROCM +#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__ +#define IS_AMD_ROCM_VEGA +#endif #endif #elif VENDOR_ID == (1 << 1) #define IS_APPLE