diff --git a/OpenCL/inc_hash_sm3.h b/OpenCL/inc_hash_sm3.h index 742c37277..2bb4acfa3 100644 --- a/OpenCL/inc_hash_sm3.h +++ b/OpenCL/inc_hash_sm3.h @@ -15,8 +15,13 @@ #define SM3_FF0(x, y, z) ((x) ^ (y) ^ (z)) #define SM3_GG0(x, y, z) ((x) ^ (y) ^ (z)) -#define SM3_FF1(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) +#ifdef USE_BITSELECT +#define SM3_FF1(x, y, z) (bitselect ((x), (y), ((x) ^ (z)))) /* bitwise majority: x where x and z agree, else y */ +#define SM3_GG1(x, y, z) (bitselect ((z), (y), (x))) /* bitwise choose: y where x is set, else z */ +#else +#define SM3_FF1(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y)))) #define SM3_GG1(x, y, z) (((z) ^ ((x) & ((y) ^ (z))))) +#endif #define SM3_EXPAND_S(a, b, c, d, e) (SM3_P1_S(a ^ b ^ hc_rotl32_S(c, 15)) ^ hc_rotl32_S(d, 7) ^ e) #define SM3_EXPAND(a, b, c, d, e) (SM3_P1(a ^ b ^ hc_rotl32(c, 15)) ^ hc_rotl32(d, 7) ^ e) diff --git a/OpenCL/m31100_a3-optimized.cl b/OpenCL/m31100_a3-optimized.cl index 63f69a330..bd89f0fb0 100644 --- a/OpenCL/m31100_a3-optimized.cl +++ b/OpenCL/m31100_a3-optimized.cl @@ -263,6 +263,241 @@ DECLSPEC void m31100s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTO } } +/* Disabled alternative m31100s, kept for reference only. It expands w[1..15] once per call into pre_t (with w[0] treated as 0) and, per candidate, XORs in only the w[0]-dependent correction terms fix16..fix67, which relies on SM3_EXPAND being XOR-linear. The original author found it slower than the active implementation, probably because of how the compiler optimizes it. + +DECLSPEC void m31100s (PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR ()) +{ + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + const u32 d_rev = hc_rotr32_S (search[0], 9); + + u32 pre_t[68]; + + pre_t[ 0] = 0; + pre_t[ 1] = w[ 1]; + pre_t[ 2] = w[ 2]; + pre_t[ 3] = w[ 3]; + pre_t[ 4] = w[ 4]; + pre_t[ 5] = w[ 5]; + pre_t[ 6] = w[ 6]; + pre_t[ 7] = w[ 7]; + pre_t[ 8] = w[ 8]; + pre_t[ 9] = w[ 9]; + pre_t[10] = w[10]; + pre_t[11] = w[11]; + pre_t[12] = w[12]; + pre_t[13] = w[13]; + pre_t[14] = w[14]; + pre_t[15] = w[15]; + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 16; i < 
68; i++) + { + pre_t[i] = SM3_EXPAND_S (pre_t[i - 16], pre_t[i - 9], pre_t[i - 3], pre_t[i - 13], pre_t[i - 6]); + } + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + u32x t[68]; + + t[0] = w0; + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 1; i < 65; i++) + { + t[i] = pre_t[i]; + } + + const u32x fix16 = SM3_EXPAND ( w0, 0, 0, 0, 0); + const u32x fix19 = SM3_EXPAND ( 0, 0, fix16, 0, 0); + const u32x fix22 = SM3_EXPAND ( 0, 0, fix19, 0, fix16); + const u32x fix25 = SM3_EXPAND ( 0, fix16, fix22, 0, fix19); + const u32x fix28 = SM3_EXPAND ( 0, fix19, fix25, 0, fix22); + const u32x fix29 = SM3_EXPAND ( 0, 0, 0, fix16, 0); + const u32x fix31 = SM3_EXPAND ( 0, fix22, fix28, 0, fix25); + const u32x fix32 = SM3_EXPAND (fix16, 0, fix29, fix19, 0); + const u32x fix34 = SM3_EXPAND ( 0, fix25, fix31, 0, fix28); + const u32x fix35 = SM3_EXPAND (fix19, 0, fix32, fix22, fix29); + const u32x fix37 = SM3_EXPAND ( 0, fix28, fix34, 0, fix31); + const u32x fix38 = SM3_EXPAND (fix22, fix29, fix35, fix25, fix32); + const u32x fix40 = SM3_EXPAND ( 0, fix31, fix37, 0, fix34); + const u32x fix41 = SM3_EXPAND (fix25, fix32, fix38, fix28, fix35); + const u32x fix42 = SM3_EXPAND ( 0, 0, 0, fix29, 0); + const u32x fix43 = SM3_EXPAND ( 0, fix34, fix40, 0, fix37); + const u32x fix44 = SM3_EXPAND (fix28, fix35, fix41, fix31, fix38); + const u32x fix45 = SM3_EXPAND (fix29, 0, fix42, fix32, 0); + const u32x fix46 = SM3_EXPAND ( 0, fix37, fix43, 0, fix40); + const u32x fix47 = SM3_EXPAND (fix31, fix38, fix44, fix34, fix41); + const u32x fix48 = SM3_EXPAND (fix32, 0, fix45, fix35, fix42); + const u32x fix49 = SM3_EXPAND ( 0, fix40, fix46, 0, fix43); + const u32x fix50 = SM3_EXPAND (fix34, fix41, fix47, fix37, fix44); + const u32x fix51 = SM3_EXPAND (fix35, fix42, fix48, fix38, fix45); + const u32x fix52 = SM3_EXPAND ( 0, fix43, fix49, 0, fix46); + const u32x fix53 
= SM3_EXPAND (fix37, fix44, fix50, fix40, fix47); + const u32x fix54 = SM3_EXPAND (fix38, fix45, fix51, fix41, fix48); + const u32x fix55 = SM3_EXPAND ( 0, fix46, fix52, fix42, fix49); + const u32x fix56 = SM3_EXPAND (fix40, fix47, fix53, fix43, fix50); + const u32x fix57 = SM3_EXPAND (fix41, fix48, fix54, fix44, fix51); + const u32x fix58 = SM3_EXPAND (fix42, fix49, fix55, fix45, fix52); + const u32x fix59 = SM3_EXPAND (fix43, fix50, fix56, fix46, fix53); + const u32x fix60 = SM3_EXPAND (fix44, fix51, fix57, fix47, fix54); + const u32x fix61 = SM3_EXPAND (fix45, fix52, fix58, fix48, fix55); + const u32x fix62 = SM3_EXPAND (fix46, fix53, fix59, fix49, fix56); + const u32x fix63 = SM3_EXPAND (fix47, fix54, fix60, fix50, fix57); + const u32x fix64 = SM3_EXPAND (fix48, fix55, fix61, fix51, fix58); + + t[16] ^= fix16; + t[19] ^= fix19; + t[22] ^= fix22; + t[25] ^= fix25; + t[28] ^= fix28; + t[29] ^= fix29; + t[31] ^= fix31; + t[32] ^= fix32; + t[34] ^= fix34; + t[35] ^= fix35; + t[37] ^= fix37; + t[38] ^= fix38; + t[40] ^= fix40; + t[41] ^= fix41; + t[42] ^= fix42; + t[43] ^= fix43; + t[44] ^= fix44; + t[45] ^= fix45; + t[46] ^= fix46; + t[47] ^= fix47; + t[48] ^= fix48; + t[49] ^= fix49; + t[50] ^= fix50; + t[51] ^= fix51; + t[52] ^= fix52; + t[53] ^= fix53; + t[54] ^= fix54; + t[55] ^= fix55; + t[56] ^= fix56; + t[57] ^= fix57; + t[58] ^= fix58; + t[59] ^= fix59; + t[60] ^= fix60; + t[61] ^= fix61; + t[62] ^= fix62; + t[63] ^= fix63; + t[64] ^= fix64; + + u32x a = SM3_IV_A; + u32x b = SM3_IV_B; + u32x c = SM3_IV_C; + u32x d = SM3_IV_D; + u32x e = SM3_IV_E; + u32x f = SM3_IV_F; + u32x g = SM3_IV_G; + u32x h = SM3_IV_H; + + SM3_ROUND1 (a, b, c, d, e, f, g, h, SM3_T00, t[ 0], t[ 0] ^ t[ 4]); + SM3_ROUND1 (d, a, b, c, h, e, f, g, SM3_T01, t[ 1], t[ 1] ^ t[ 5]); + SM3_ROUND1 (c, d, a, b, g, h, e, f, SM3_T02, t[ 2], t[ 2] ^ t[ 6]); + SM3_ROUND1 (b, c, d, a, f, g, h, e, SM3_T03, t[ 3], t[ 3] ^ t[ 7]); + SM3_ROUND1 (a, b, c, d, e, f, g, h, SM3_T04, t[ 4], t[ 4] ^ t[ 8]); + 
SM3_ROUND1 (d, a, b, c, h, e, f, g, SM3_T05, t[ 5], t[ 5] ^ t[ 9]); + SM3_ROUND1 (c, d, a, b, g, h, e, f, SM3_T06, t[ 6], t[ 6] ^ t[10]); + SM3_ROUND1 (b, c, d, a, f, g, h, e, SM3_T07, t[ 7], t[ 7] ^ t[11]); + SM3_ROUND1 (a, b, c, d, e, f, g, h, SM3_T08, t[ 8], t[ 8] ^ t[12]); + SM3_ROUND1 (d, a, b, c, h, e, f, g, SM3_T09, t[ 9], t[ 9] ^ t[13]); + SM3_ROUND1 (c, d, a, b, g, h, e, f, SM3_T10, t[10], t[10] ^ t[14]); + SM3_ROUND1 (b, c, d, a, f, g, h, e, SM3_T11, t[11], t[11] ^ t[15]); + SM3_ROUND1 (a, b, c, d, e, f, g, h, SM3_T12, t[12], t[12] ^ t[16]); + SM3_ROUND1 (d, a, b, c, h, e, f, g, SM3_T13, t[13], t[13] ^ t[17]); + SM3_ROUND1 (c, d, a, b, g, h, e, f, SM3_T14, t[14], t[14] ^ t[18]); + SM3_ROUND1 (b, c, d, a, f, g, h, e, SM3_T15, t[15], t[15] ^ t[19]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T16, t[16], t[16] ^ t[20]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T17, t[17], t[17] ^ t[21]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T18, t[18], t[18] ^ t[22]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T19, t[19], t[19] ^ t[23]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T20, t[20], t[20] ^ t[24]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T21, t[21], t[21] ^ t[25]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T22, t[22], t[22] ^ t[26]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T23, t[23], t[23] ^ t[27]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T24, t[24], t[24] ^ t[28]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T25, t[25], t[25] ^ t[29]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T26, t[26], t[26] ^ t[30]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T27, t[27], t[27] ^ t[31]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T28, t[28], t[28] ^ t[32]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T29, t[29], t[29] ^ t[33]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T30, t[30], t[30] ^ t[34]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T31, t[31], t[31] ^ t[35]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T32, t[32], t[32] ^ t[36]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, 
SM3_T33, t[33], t[33] ^ t[37]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T34, t[34], t[34] ^ t[38]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T35, t[35], t[35] ^ t[39]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T36, t[36], t[36] ^ t[40]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T37, t[37], t[37] ^ t[41]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T38, t[38], t[38] ^ t[42]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T39, t[39], t[39] ^ t[43]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T40, t[40], t[40] ^ t[44]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T41, t[41], t[41] ^ t[45]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T42, t[42], t[42] ^ t[46]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T43, t[43], t[43] ^ t[47]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T44, t[44], t[44] ^ t[48]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T45, t[45], t[45] ^ t[49]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T46, t[46], t[46] ^ t[50]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T47, t[47], t[47] ^ t[51]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T48, t[48], t[48] ^ t[52]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T49, t[49], t[49] ^ t[53]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T50, t[50], t[50] ^ t[54]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T51, t[51], t[51] ^ t[55]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T52, t[52], t[52] ^ t[56]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T53, t[53], t[53] ^ t[57]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T54, t[54], t[54] ^ t[58]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T55, t[55], t[55] ^ t[59]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T56, t[56], t[56] ^ t[60]); + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T57, t[57], t[57] ^ t[61]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T58, t[58], t[58] ^ t[62]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T59, t[59], t[59] ^ t[63]); + SM3_ROUND2 (a, b, c, d, e, f, g, h, SM3_T60, t[60], t[60] ^ t[64]); + + if (MATCHES_NONE_VS (d, d_rev)) continue; + + #ifdef _unroll + #pragma 
unroll + #endif + for (int i = 65; i < 68; i++) + { + t[i] = pre_t[i]; + } + + const u32x fix65 = SM3_EXPAND (fix49, fix56, fix62, fix52, fix59); + const u32x fix66 = SM3_EXPAND (fix50, fix57, fix63, fix53, fix60); + const u32x fix67 = SM3_EXPAND (fix51, fix58, fix64, fix54, fix61); + + t[65] ^= fix65; + t[66] ^= fix66; + t[67] ^= fix67; + + SM3_ROUND2 (d, a, b, c, h, e, f, g, SM3_T61, t[61], t[61] ^ t[65]); + SM3_ROUND2 (c, d, a, b, g, h, e, f, SM3_T62, t[62], t[62] ^ t[66]); + SM3_ROUND2 (b, c, d, a, f, g, h, e, SM3_T63, t[63], t[63] ^ t[67]); + + COMPARE_S_SIMD (d, h, c, g); + } +} +*/ + KERNEL_FQ void m31100_m04 (KERN_ATTR_VECTOR ()) { /**